https://www.rstudio.com/resources/webinars/data-wrangling-with-r-and-rstudio/ https://github.com/rstudio/webinars/blob/master/05-Data-Wrangling-with-R-and-RStudio/wrangling-webinar.pdf
tidyr and functions ?gather() ?spread() ?seperate() ?unite()
dplyr and functions ?select ?flter ?arrange ?mutate ?summarise ?group_by **********************************************************************************************************
#install.packages('dplyr')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages('tidyr')
library(tidyr)
#install.packages('devtools')
#devtools::install_github("rstudio/EDAWR")
library(EDAWR)
##
## Attaching package: 'EDAWR'
## The following objects are masked from 'package:tidyr':
##
## population, who
# Datasets for practice
#?strom
#?pollution
#?cases
#?songs
#?artists
#install.packages('ggplot2')
library(ggplot2)
data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
#This command is usefull
tbl_df(diamonds)
## # A tibble: 53,940 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows
#View(diamonds)
select(diamonds,cut)
## # A tibble: 53,940 × 1
## cut
## <ord>
## 1 Ideal
## 2 Premium
## 3 Good
## 4 Premium
## 5 Good
## 6 Very Good
## 7 Very Good
## 8 Very Good
## 9 Fair
## 10 Very Good
## # ... with 53,930 more rows
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
select(diamonds,cut:depth)
## # A tibble: 53,940 × 4
## cut color clarity depth
## <ord> <ord> <ord> <dbl>
## 1 Ideal E SI2 61.5
## 2 Premium E SI1 59.8
## 3 Good E VS1 56.9
## 4 Premium I VS2 62.4
## 5 Good J SI2 63.3
## 6 Very Good J VVS2 62.8
## 7 Very Good I VVS1 62.3
## 8 Very Good H SI1 61.9
## 9 Fair E VS2 65.1
## 10 Very Good H VS1 59.4
## # ... with 53,930 more rows
also written the below way
diamonds%>% select(cut:depth)
## # A tibble: 53,940 × 4
## cut color clarity depth
## <ord> <ord> <ord> <dbl>
## 1 Ideal E SI2 61.5
## 2 Premium E SI1 59.8
## 3 Good E VS1 56.9
## 4 Premium I VS2 62.4
## 5 Good J SI2 63.3
## 6 Very Good J VVS2 62.8
## 7 Very Good I VVS1 62.3
## 8 Very Good H SI1 61.9
## 9 Fair E VS2 65.1
## 10 Very Good H VS1 59.4
## # ... with 53,930 more rows
Selecting observations
diamonds%>% filter(cut=='Ideal')
## # A tibble: 21,551 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
## 7 0.33 Ideal J SI1 61.1 56 403 4.49 4.55 2.76
## 8 0.23 Ideal G VS1 61.9 54 404 3.93 3.95 2.44
## 9 0.32 Ideal I SI1 60.9 55 404 4.45 4.48 2.72
## 10 0.30 Ideal I SI2 61.0 59 405 4.30 4.33 2.63
## # ... with 21,541 more rows
diamonds$x %>%
round(2) %>%
mean()
## [1] 5.731157
library(tidyr)
cases
## country 2011 2012 2013
## 1 FR 7000 6900 7000
## 2 DE 5800 6000 6200
## 3 US 15000 14000 13000
gather(cases,"year","n",2:4)
## country year n
## 1 FR 2011 7000
## 2 DE 2011 5800
## 3 US 2011 15000
## 4 FR 2012 6900
## 5 DE 2012 6000
## 6 US 2012 14000
## 7 FR 2013 7000
## 8 DE 2013 6200
## 9 US 2013 13000
pollution
## city size amount
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
spread(pollution,city,amount)
## size Beijing London New York
## 1 large 121 22 23
## 2 small 56 16 14
spread(pollution,size,amount)
## city large small
## 1 Beijing 121 56
## 2 London 22 16
## 3 New York 23 14
diamonds
## # A tibble: 53,940 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows
diamonds$row<-1:nrow(diamonds) #To add explicit row numbers
diamonds_d<-na.omit(diamonds)%>%
select(row,color,price)%>%
arrange(color)
spread(diamonds_d,color,price)
## # A tibble: 53,940 × 8
## row D E F G H I J
## * <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 NA 326 NA NA NA NA NA
## 2 2 NA 326 NA NA NA NA NA
## 3 3 NA 327 NA NA NA NA NA
## 4 4 NA NA NA NA NA 334 NA
## 5 5 NA NA NA NA NA NA 335
## 6 6 NA NA NA NA NA NA 336
## 7 7 NA NA NA NA NA 336 NA
## 8 8 NA NA NA NA 337 NA NA
## 9 9 NA 337 NA NA NA NA NA
## 10 10 NA NA NA NA 338 NA NA
## # ... with 53,930 more rows
storms
## # A tibble: 6 × 4
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Alberto 110 1007 2000-08-03
## 2 Alex 45 1009 1998-07-27
## 3 Allison 65 1005 1995-06-03
## 4 Ana 40 1013 1997-06-30
## 5 Arlene 50 1010 1999-06-11
## 6 Arthur 45 1010 1996-06-17
storms_sepdate<-separate(storms,date,c("year","month","day"),sep="-")
storms_sepdate
## # A tibble: 6 × 6
## storm wind pressure year month day
## * <chr> <int> <int> <chr> <chr> <chr>
## 1 Alberto 110 1007 2000 08 03
## 2 Alex 45 1009 1998 07 27
## 3 Allison 65 1005 1995 06 03
## 4 Ana 40 1013 1997 06 30
## 5 Arlene 50 1010 1999 06 11
## 6 Arthur 45 1010 1996 06 17
storms_unite<-unite(storms_sepdate,"date",year,month,day,sep="-")
Ways to access information 1. Extract existing variables select() 2. Extract existing observatios filter() 3. Derive new variables mutuate() 4. Change unit of variance summarise()
storms%>% select(pressure,date)%>% filter(pressure>1000)
## # A tibble: 6 × 2
## pressure date
## <int> <date>
## 1 1007 2000-08-03
## 2 1009 1998-07-27
## 3 1005 1995-06-03
## 4 1013 1997-06-30
## 5 1010 1999-06-11
## 6 1010 1996-06-17
select(storms,pressure,date)
## # A tibble: 6 × 2
## pressure date
## <int> <date>
## 1 1007 2000-08-03
## 2 1009 1998-07-27
## 3 1005 1995-06-03
## 4 1013 1997-06-30
## 5 1010 1999-06-11
## 6 1010 1996-06-17
filter(storms,pressure>1000 & wind==110)
## # A tibble: 1 × 4
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Alberto 110 1007 2000-08-03
mutate(storms,ratio=pressure/wind,inverse=ratio**-1)
## # A tibble: 6 × 6
## storm wind pressure date ratio inverse
## <chr> <int> <int> <date> <dbl> <dbl>
## 1 Alberto 110 1007 2000-08-03 9.154545 0.10923535
## 2 Alex 45 1009 1998-07-27 22.422222 0.04459861
## 3 Allison 65 1005 1995-06-03 15.461538 0.06467662
## 4 Ana 40 1013 1997-06-30 25.325000 0.03948667
## 5 Arlene 50 1010 1999-06-11 20.200000 0.04950495
## 6 Arthur 45 1010 1996-06-17 22.444444 0.04455446
storms%>% summarise(mean=mean(pressure),variance=var(pressure),sd=sd(pressure))
## # A tibble: 1 × 3
## mean variance sd
## <dbl> <dbl> <dbl>
## 1 1009 7.6 2.75681
storms%>%
mutate(ratio=pressure/wind)%>%
select(storm,ratio)
## # A tibble: 6 × 2
## storm ratio
## <chr> <dbl>
## 1 Alberto 9.154545
## 2 Alex 22.422222
## 3 Allison 15.461538
## 4 Ana 25.325000
## 5 Arlene 20.200000
## 6 Arthur 22.444444
arrange(diamonds,cut,color)
## # A tibble: 53,940 × 11
## carat cut color clarity depth table price x y z row
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
## 1 0.75 Fair D SI2 64.6 57 2848 5.74 5.72 3.70 677
## 2 0.71 Fair D VS2 56.9 65 2858 5.89 5.84 3.34 772
## 3 0.90 Fair D SI2 66.9 57 2885 6.02 5.90 3.99 940
## 4 1.00 Fair D SI2 69.3 58 2974 5.96 5.87 4.10 1439
## 5 1.01 Fair D SI2 64.6 56 3003 6.31 6.24 4.05 1555
## 6 0.73 Fair D VS1 66.0 54 3047 5.56 5.66 3.70 1769
## 7 0.71 Fair D VS2 64.7 58 3077 5.61 5.58 3.62 1897
## 8 0.91 Fair D SI2 62.5 66 3079 6.08 6.01 3.78 1904
## 9 0.90 Fair D SI2 65.9 59 3205 6.00 5.95 3.94 2534
## 10 0.90 Fair D SI2 66.0 58 3205 6.00 5.97 3.95 2535
## # ... with 53,930 more rows
arrange(storms,wind)
## # A tibble: 6 × 4
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Ana 40 1013 1997-06-30
## 2 Alex 45 1009 1998-07-27
## 3 Arthur 45 1010 1996-06-17
## 4 Arlene 50 1010 1999-06-11
## 5 Allison 65 1005 1995-06-03
## 6 Alberto 110 1007 2000-08-03
arrange(storms,wind,pressure)
## # A tibble: 6 × 4
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Ana 40 1013 1997-06-30
## 2 Alex 45 1009 1998-07-27
## 3 Arthur 45 1010 1996-06-17
## 4 Arlene 50 1010 1999-06-11
## 5 Allison 65 1005 1995-06-03
## 6 Alberto 110 1007 2000-08-03
pollution
## city size amount
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
pollution %>%
group_by(city) %>%
summarise(mean=mean(amount),sum=sum(amount),n=n())
## # A tibble: 3 × 4
## city mean sum n
## <chr> <dbl> <dbl> <int>
## 1 Beijing 88.5 177 2
## 2 London 19.0 38 2
## 3 New York 18.5 37 2
pollution %>%
summarise(mean=mean(amount),sum=sum(amount),n=n())
## mean sum n
## 1 42 252 6
tb
## # A tibble: 3,800 × 6
## country year sex child adult elderly
## * <chr> <int> <chr> <int> <int> <int>
## 1 Afghanistan 1995 female NA NA NA
## 2 Afghanistan 1995 male NA NA NA
## 3 Afghanistan 1996 female NA NA NA
## 4 Afghanistan 1996 male NA NA NA
## 5 Afghanistan 1997 female 5 96 1
## 6 Afghanistan 1997 male 0 26 0
## 7 Afghanistan 1998 female 45 1142 20
## 8 Afghanistan 1998 male 30 500 41
## 9 Afghanistan 1999 female 25 484 8
## 10 Afghanistan 1999 male 8 212 8
## # ... with 3,790 more rows
tb %>%
group_by(country,year) %>%
summarize(child_sum=sum(child),adult_sum=sum(adult))
## Source: local data frame [1,900 x 4]
## Groups: country [?]
##
## country year child_sum adult_sum
## <chr> <int> <int> <int>
## 1 Afghanistan 1995 NA NA
## 2 Afghanistan 1996 NA NA
## 3 Afghanistan 1997 5 122
## 4 Afghanistan 1998 75 1642
## 5 Afghanistan 1999 33 696
## 6 Afghanistan 2000 145 2405
## 7 Afghanistan 2001 275 4172
## 8 Afghanistan 2002 282 5820
## 9 Afghanistan 2003 372 5795
## 10 Afghanistan 2004 395 7284
## # ... with 1,890 more rows
na.omit(tb) %>%
group_by(country) %>%
summarize(child_sum=sum(child),adult_sum=sum(adult))
## # A tibble: 100 × 3
## country child_sum adult_sum
## <chr> <int> <int>
## 1 Afghanistan 10710 116676
## 2 Algeria 2734 113899
## 3 Angola 22565 275101
## 4 Argentina 9070 94516
## 5 Azerbaijan 1548 27314
## 6 Bangladesh 39014 1312443
## 7 Belarus 234 31071
## 8 Benin 962 45507
## 9 Bolivia (Plurinational State of) 6792 102444
## 10 Botswana 4285 63599
## # ... with 90 more rows
na.omit(tb) %>%
group_by(country,year) %>%
summarize(child=sum(child))
## Source: local data frame [1,691 x 3]
## Groups: country [?]
##
## country year child
## <chr> <int> <int>
## 1 Afghanistan 1997 5
## 2 Afghanistan 1998 75
## 3 Afghanistan 1999 33
## 4 Afghanistan 2000 145
## 5 Afghanistan 2001 275
## 6 Afghanistan 2002 282
## 7 Afghanistan 2003 372
## 8 Afghanistan 2004 395
## 9 Afghanistan 2005 471
## 10 Afghanistan 2006 635
## # ... with 1,681 more rows
na.omit(tb) %>%
group_by(country,year) %>%
summarize(child=sum(child)) %>%
summarize(child=sum(child)) %>%
summarize(child=sum(child))
## # A tibble: 1 × 1
## child
## <int>
## 1 1597557
st1<-storms
st2<-storms
bind_rows(st1,st2)
## # A tibble: 12 × 4
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Alberto 110 1007 2000-08-03
## 2 Alex 45 1009 1998-07-27
## 3 Allison 65 1005 1995-06-03
## 4 Ana 40 1013 1997-06-30
## 5 Arlene 50 1010 1999-06-11
## 6 Arthur 45 1010 1996-06-17
## 7 Alberto 110 1007 2000-08-03
## 8 Alex 45 1009 1998-07-27
## 9 Allison 65 1005 1995-06-03
## 10 Ana 40 1013 1997-06-30
## 11 Arlene 50 1010 1999-06-11
## 12 Arthur 45 1010 1996-06-17
bind_cols(st1,st2)
## # A tibble: 6 × 8
## storm wind pressure date storm wind pressure date
## <chr> <int> <int> <date> <chr> <int> <int> <date>
## 1 Alberto 110 1007 2000-08-03 Alberto 110 1007 2000-08-03
## 2 Alex 45 1009 1998-07-27 Alex 45 1009 1998-07-27
## 3 Allison 65 1005 1995-06-03 Allison 65 1005 1995-06-03
## 4 Ana 40 1013 1997-06-30 Ana 40 1013 1997-06-30
## 5 Arlene 50 1010 1999-06-11 Arlene 50 1010 1999-06-11
## 6 Arthur 45 1010 1996-06-17 Arthur 45 1010 1996-06-17
union(st1,st2)
## # A tibble: 6 × 4
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Arthur 45 1010 1996-06-17
## 2 Arlene 50 1010 1999-06-11
## 3 Ana 40 1013 1997-06-30
## 4 Allison 65 1005 1995-06-03
## 5 Alex 45 1009 1998-07-27
## 6 Alberto 110 1007 2000-08-03
intersect(st1,st2)
## # A tibble: 6 × 4
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Alberto 110 1007 2000-08-03
## 2 Alex 45 1009 1998-07-27
## 3 Allison 65 1005 1995-06-03
## 4 Ana 40 1013 1997-06-30
## 5 Arlene 50 1010 1999-06-11
## 6 Arthur 45 1010 1996-06-17
setdiff(st1,st2)
## # A tibble: 0 × 4
## # ... with 4 variables: storm <chr>, wind <int>, pressure <int>,
## # date <date>
data(songs)
data(artists)
songs
## song name
## 1 Across the Universe John
## 2 Come Together John
## 3 Hello, Goodbye Paul
## 4 Peggy Sue Buddy
artists
## name plays
## 1 George sitar
## 2 John guitar
## 3 Paul bass
## 4 Ringo drums
left_join(songs,artists,by='name')
## song name plays
## 1 Across the Universe John guitar
## 2 Come Together John guitar
## 3 Hello, Goodbye Paul bass
## 4 Peggy Sue Buddy <NA>
right_join(songs,artists,by='name')
## song name plays
## 1 <NA> George sitar
## 2 Across the Universe John guitar
## 3 Come Together John guitar
## 4 Hello, Goodbye Paul bass
## 5 <NA> Ringo drums
inner_join(songs,artists,by='name')
## song name plays
## 1 Across the Universe John guitar
## 2 Come Together John guitar
## 3 Hello, Goodbye Paul bass
semi_join(songs,artists,by='name') # Not actuallt a join but a lookup kind of thing. After lookup Return observations which are found
## song name
## 1 Across the Universe John
## 2 Come Together John
## 3 Hello, Goodbye Paul
anti_join(songs,artists,by='name') # After lookup returns not found observations
## song name
## 1 Peggy Sue Buddy
#install.packages('nycflights13')
library(nycflights13)