Mahbubul Majumder, PhD
Sep 23, 2014
It takes time to read the data
How can we handle this?
Two convenient packages, extremely fast. Faster than base R
data.table
by Matt Dowledplyr
by Hadley Wickhamdata.table
uses []
for all data manipulation
[]
it is sometimes not very convenient to readdplyr
is based on a consistent grammar very easy to understand
dplyr
provides database functionality as welldplyr
works with data.table
install.packages("data.table")
library(data.table)
fread()
, its faster than read.table()
but works similarly
nrows = 0
to see the column names of a massive data setdata.table()
data.table
also works as a data.frame
, so other functions can work with it
All syntax must be inside []
DT[row, column, by]
DT[where, select|update, by]
library(data.table)
DT = data.table(x = rep(
c("a","b","c"), each=3),
y = c(1,3,6), v = 1:9)
DT
x y v
1: a 1 1
2: a 3 2
3: a 6 3
4: b 1 4
5: b 3 5
6: b 6 6
7: c 1 7
8: c 3 8
9: c 6 9
DT[2] = ?
DT[x == "a", ]
x y v
1: a 1 1
2: a 3 2
3: a 6 3
DT[ , sum(v), by=x]
x V1
1: a 6
2: b 15
3: c 24
Same as tapply(DT$v,DT$x,sum)
Much faster than tapply()
DT[ , z := y%%2, by=x]
x y v z
1: a 1 1 1
2: a 3 2 1
3: a 6 3 0
4: b 1 4 1
5: b 3 5 1
6: b 6 6 0
7: c 1 7 1
8: c 3 8 1
9: c 6 9 0
DT[ , sum(v), by = list(x,z)]
x z V1
1: a 1 3
2: a 0 3
3: b 1 9
4: b 0 6
5: c 1 15
6: c 0 9
DT[,':='(w=sum(v),ww=mean(v)),
by=x]
x y v z w ww
1: a 1 1 1 6 2
2: a 3 2 1 6 2
3: a 6 3 0 6 2
4: b 1 4 1 15 5
5: b 3 5 1 15 5
6: b 6 6 0 15 5
7: c 1 7 1 24 8
8: c 3 8 1 24 8
9: c 6 9 0 24 8
DT[,list(var1=sum(v),
var2=min(v),
var3=max(v)),
by = list(x,z)]
setkey(DT,x)
DT['a']
x y v z w ww
1: a 1 1 1 6 2
2: a 3 2 1 6 2
3: a 6 3 0 6 2
DT[x=='a']
W <- data.table(x=c("b","c","d"), foo=c(4,2,9))
W
x foo
1: b 4
2: c 2
3: d 9
DT[W]
x y v z w ww foo
1: b 1 4 1 15 5 4
2: b 3 5 1 15 5 4
3: b 6 6 0 15 5 4
4: c 1 7 1 24 8 2
5: c 3 8 1 24 8 2
6: c 6 9 0 24 8 2
7: d NA NA NA NA NA 9
DT[W, sum(v)*foo]
x V1
1: b 60
2: c 48
3: d NA
Common tasks we need to do with data
Five verbs in data manipulation
install.packages("dplyr")
library(dplyr)
filter()
select()
arrange()
mutate()
summarize()
groub_by()
summarise_each()
R
approachtips[tips$size==3 & tips$tip > 4, ]
total_bill tip sex smoker day time size
40 31.27 5.00 Male No Sat Dinner 3
171 50.81 10.00 Male Yes Sat Dinner 3
215 28.17 6.50 Female Yes Sat Dinner 3
239 35.83 4.67 Female No Sat Dinner 3
240 29.03 5.92 Male No Sat Dinner 3
dplyr
approachfilter(tips, size==3, tip > 4)
total_bill tip sex smoker day time size
1 31.27 5.00 Male No Sat Dinner 3
2 50.81 10.00 Male Yes Sat Dinner 3
3 28.17 6.50 Female Yes Sat Dinner 3
4 35.83 4.67 Female No Sat Dinner 3
5 29.03 5.92 Male No Sat Dinner 3
df <- head(tips,n=2)
df
total_bill tip sex smoker day time size
1 16.99 1.01 Female No Sun Dinner 2
2 10.34 1.66 Male No Sun Dinner 3
select(df,tip,sex,size)
tip sex size
1 1.01 Female 2
2 1.66 Male 3
select(df,sex:size)
sex smoker day time size
1 Female No Sun Dinner 2
2 Male No Sun Dinner 3
Variables can be selected based on search
starts_with ends_with contains matches num_range
Variables can be dropped and renamed
select(df, contains('i'))
total_bill tip time size
1 16.99 1.01 Dinner 2
2 10.34 1.66 Dinner 3
select(df, ends_with('e'))
time size
1 Dinner 2
2 Dinner 3
select(df, matches('.a.'))
total_bill day
1 16.99 Sun
2 10.34 Sun
select(df, -matches('.i.'))
sex smoker day
1 Female No Sun
2 Male No Sun
df <- arrange(tips, total_bill)
head(df, n=5)
total_bill tip sex smoker day time size
1 3.07 1.00 Female Yes Sat Dinner 1
2 5.75 1.00 Female Yes Fri Dinner 2
3 7.25 1.00 Female No Sat Dinner 1
4 7.25 5.15 Male Yes Sun Dinner 2
5 7.51 2.00 Male No Thur Lunch 2
df <- arrange(tips, total_bill, desc(tip))
head(df, n=5)
total_bill tip sex smoker day time size
1 3.07 1.00 Female Yes Sat Dinner 1
2 5.75 1.00 Female Yes Fri Dinner 2
3 7.25 5.15 Male Yes Sun Dinner 2
4 7.25 1.00 Female No Sat Dinner 1
5 7.51 2.00 Male No Thur Lunch 2
tiprate
df <- mutate(tips, tiprate = tip/total_bill)
head(df)
total_bill tip sex smoker day time size tiprate
1 16.99 1.01 Female No Sun Dinner 2 0.05945
2 10.34 1.66 Male No Sun Dinner 3 0.16054
3 21.01 3.50 Male No Sun Dinner 3 0.16659
4 23.68 3.31 Male No Sun Dinner 2 0.13978
5 24.59 3.61 Female No Sun Dinner 4 0.14681
6 25.29 4.71 Male No Sun Dinner 4 0.18624
R
approach tips$tiprate <- with(tips, tip/total_bill)
%>%
like UNIX pipe |
tips %>%
filter(size > 5) %>%
select(total_bill, tip, sex, day, size) %>%
arrange(tip) %>%
mutate(tiprate = tip/total_bill)
total_bill tip sex day size tiprate
1 29.80 4.2 Female Thur 6 0.1409
2 27.05 5.0 Female Thur 6 0.1848
3 48.17 5.0 Male Sun 6 0.1038
4 34.30 6.7 Male Thur 6 0.1953
df
x y z
1 c 1 6
2 a 1 7
3 a 2 8
4 b 1 5
5 b 2 3
6 c 2 9
7 c 3 2
grp_df <- group_by(df,x)
summarize(grp_df, w=mean(z))
Source: local data frame [3 x 2]
x w
1 a 7.500
2 b 4.000
3 c 5.667
df %>%
group_by(x) %>%
summarize(w = mean(z))
Source: local data frame [3 x 2]
x w
1 a 7.500
2 b 4.000
3 c 5.667
df %>%
group_by(x,y) %>%
summarize(w = mean(z))
tips %>%
filter(size > 3) %>%
select(total_bill, tip, sex, day, size) %>%
arrange(tip) %>%
mutate(tiprate=tip/total_bill) %>%
select(total_bill, tip, sex, day, tiprate) %>%
group_by(sex, day) %>%
summarise_each(funs(mean)) %>%
filter(tiprate > 0.12)
Source: local data frame [5 x 5]
Groups: sex
sex day total_bill tip tiprate
1 Female Sun 29.90 4.540 0.1520
2 Female Thur 31.77 4.458 0.1423
3 Male Sat 30.48 4.256 0.1403
4 Male Sun 27.06 4.001 0.1518
5 Male Thur 30.80 4.925 0.1647
dplyr
joining functions
DT
x y v z w ww
1: a 1 1 1 6 2
2: a 3 2 1 6 2
3: a 6 3 0 6 2
4: b 1 4 1 15 5
5: b 3 5 1 15 5
6: b 6 6 0 15 5
7: c 1 7 1 24 8
8: c 3 8 1 24 8
9: c 6 9 0 24 8
W
with common column x
DT
and W
only with matching x
. W
x foo
1: b 4
2: c 2
3: d 9
inner_join(DT,W)
x y v z w ww foo
1 b 1 4 1 15 5 4
2 b 3 5 1 15 5 4
3 b 6 6 0 15 5 4
4 c 1 7 1 24 8 2
5 c 3 8 1 24 8 2
6 c 6 9 0 24 8 2
DT
and W
are not returned.left_join(DT,W)
x y v z w ww foo
1 a 1 1 1 6 2 NA
2 a 3 2 1 6 2 NA
3 a 6 3 0 6 2 NA
4 b 1 4 1 15 5 4
5 b 3 5 1 15 5 4
6 b 6 6 0 15 5 4
7 c 1 7 1 24 8 2
8 c 3 8 1 24 8 2
9 c 6 9 0 24 8 2
anti_join(DT,W)
x y v z w ww
1 a 1 1 1 6 2
2 a 3 2 1 6 2
3 a 6 3 0 6 2
semi_join(DT,W)
x y v z w ww
1 b 1 4 1 15 5
2 b 3 5 1 15 5
3 b 6 6 0 15 5
4 c 1 7 1 24 8
5 c 3 8 1 24 8
6 c 6 9 0 24 8
anti_join()
and semi_join()
are just subsetting DTouter_join()
?
merge()
df <- data.frame(
x = c(1:3,2,4),
y=c("A","B",NA,"B","D"))
df
x y
1 1 A
2 2 B
3 3 <NA>
4 2 B
5 4 D
df %>% filter(
complete.cases(df))
x y
1 1 A
2 2 B
3 2 B
4 4 D
df %>% filter(
complete.cases(df)) %>%
filter(
!duplicated(cbind(x,y)))
x y
1 1 A
2 2 B
3 4 D
distinct()
? duplicated
? complete.cases
Website for data.table
http://datatable.r-forge.r-project.org/
Github repository of data.table
https://github.com/Rdatatable/data.table/
Documentation on dplyr from RStudio
http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html