if (!require(data.table)) install.packages("data.table")
## Loading required package: data.table
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(data.table)
data(population, package = "tidyr")
I use the `population` data from the tidyr package as an example.
# Preview the columns: names, types and the first few values of each.
glimpse(population)
## Observations: 4,060
## Variables: 3
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afgha...
## $ year <int> 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 200...
## $ population <int> 17586073, 18415307, 19021226, 19496836, 19987071, 2...
# Per-column summary statistics (range of years, spread of population).
summary(population)
## country year population
## Length:4060 Min. :1995 Min. :1.129e+03
## Class :character 1st Qu.:1999 1st Qu.:6.029e+05
## Mode :character Median :2004 Median :5.319e+06
## Mean :2004 Mean :3.003e+07
## 3rd Qu.:2009 3rd Qu.:1.855e+07
## Max. :2013 Max. :1.386e+09
# The dataset is a tibble (tbl_df), not a data.table.
class(population)
## [1] "tbl_df" "tbl" "data.frame"
# Build a data.table version of the data for the data.table-based methods;
# `population` itself is kept as a tibble for the dplyr method.
populationDT <- as.data.table(population)
# Confirm the conversion: the class vector should now start with "data.table".
class(populationDT)
## [1] "data.table" "data.frame"
The .SD Method in data.table
# .SD method: within each country, keep the rows of that group's sub-table
# (.SD) whose population is below the group's median. Inside `[.data.table`,
# `population` resolves to the column, not to the tibble of the same name.
# The timer is called start_time (not t) so it does not mask base::t().
start_time <- proc.time()
tmp1 <- populationDT[, .SD[population < median(population)], by = "country"]
# Elapsed time for this method (single run, so only a rough indication).
proc.time() - start_time
## user system elapsed
## 0.063 0.000 0.061
The .I Method in data.table
# .I method: first collect the row numbers of the qualifying rows, then
# subset the table once with them.
# The timer is called start_time (not t) so it does not mask base::t().
start_time <- proc.time()
# .I holds row numbers within populationDT; the unnamed j result is returned
# in a column called V1, one row per qualifying observation.
keep_rows <- populationDT[
  , .I[population < median(population)],
  by = "country"
]$V1
tmp2 <- populationDT[keep_rows]
# Elapsed time for this method (single run, so only a rough indication).
proc.time() - start_time
## user system elapsed
## 0.009 0.000 0.008
The dplyr Method
# dplyr method: group by country, keep rows below the group median, then drop
# the grouping so later steps see a plain tibble.
# The timer is called start_time (not t) so it does not mask base::t().
start_time <- proc.time()
tmp3 <- population %>%
  group_by(country) %>%
  # Inside filter(), `population` resolves to the column of the data, not to
  # the tibble object of the same name.
  filter(population < median(population)) %>%
  ungroup()
# Elapsed time for this method (single run, so only a rough indication).
proc.time() - start_time
## user system elapsed
## 0.053 0.000 0.053
Are the Final Data Identical?
# tmp1 and tmp2 are both data.tables, so they can be compared exactly.
identical(tmp1, tmp2)
## [1] TRUE
# tmp3 is a tibble, so it is compared with tmp2 by rows rather than with
# identical(). An empty result here means every row of tmp2 is also in tmp3.
setdiff(tmp2, tmp3)
## Empty data.table (0 rows) of 3 cols: country,year,population
# setdiff() is one-directional: it would also be empty if tmp3 had extra rows.
# setequal() checks both directions and should return TRUE when the two
# results hold the same set of rows.
setequal(tmp2, tmp3)
I got the same rows with all three methods. In terms of speed, the `.I` method in data.table is the fastest.