Resource:

https://www.rstudio.com/resources/webinars/data-wrangling-with-r-and-rstudio/ https://github.com/rstudio/webinars/blob/master/05-Data-Wrangling-with-R-and-RStudio/wrangling-webinar.pdf

tidyr and functions ?gather() ?spread() ?seperate() ?unite()

dplyr and functions ?select ?flter ?arrange ?mutate ?summarise ?group_by **********************************************************************************************************

pratice libraries and datasets

#install.packages('dplyr')
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#install.packages('tidyr')
library(tidyr)

#install.packages('devtools')
#devtools::install_github("rstudio/EDAWR")
library(EDAWR)

## 
## Attaching package: 'EDAWR'

## The following objects are masked from 'package:tidyr':
## 
##     population, who

# Datasets for practice
#?strom
#?pollution
#?cases
#?songs
#?artists

#install.packages('ggplot2')
library(ggplot2)

data(diamonds)
str(diamonds)

## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

When we have number of columns. The extra columns which will not fit on the screen will be displayed at the bottom. Very convenient to view large datasets.

#This command is usefull
tbl_df(diamonds)

## # A tibble: 53,940 × 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1   0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2   0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3   0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4   0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5   0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6   0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
## 7   0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
## 8   0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
## 9   0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows

Just to view the diamonds dataset

#View(diamonds)

selecting the columns

select(diamonds,cut)

## # A tibble: 53,940 × 1
##          cut
##        <ord>
## 1      Ideal
## 2    Premium
## 3       Good
## 4    Premium
## 5       Good
## 6  Very Good
## 7  Very Good
## 8  Very Good
## 9       Fair
## 10 Very Good
## # ... with 53,930 more rows

names(diamonds)

##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"

select(diamonds,cut:depth)

## # A tibble: 53,940 × 4
##          cut color clarity depth
##        <ord> <ord>   <ord> <dbl>
## 1      Ideal     E     SI2  61.5
## 2    Premium     E     SI1  59.8
## 3       Good     E     VS1  56.9
## 4    Premium     I     VS2  62.4
## 5       Good     J     SI2  63.3
## 6  Very Good     J    VVS2  62.8
## 7  Very Good     I    VVS1  62.3
## 8  Very Good     H     SI1  61.9
## 9       Fair     E     VS2  65.1
## 10 Very Good     H     VS1  59.4
## # ... with 53,930 more rows

also written the below way

diamonds%>% select(cut:depth)

## # A tibble: 53,940 × 4
##          cut color clarity depth
##        <ord> <ord>   <ord> <dbl>
## 1      Ideal     E     SI2  61.5
## 2    Premium     E     SI1  59.8
## 3       Good     E     VS1  56.9
## 4    Premium     I     VS2  62.4
## 5       Good     J     SI2  63.3
## 6  Very Good     J    VVS2  62.8
## 7  Very Good     I    VVS1  62.3
## 8  Very Good     H     SI1  61.9
## 9       Fair     E     VS2  65.1
## 10 Very Good     H     VS1  59.4
## # ... with 53,930 more rows

Selecting observations

diamonds%>% filter(cut=='Ideal')

## # A tibble: 21,551 × 10
##    carat   cut color clarity depth table price     x     y     z
##    <dbl> <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1   0.23 Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2   0.23 Ideal     J     VS1  62.8    56   340  3.93  3.90  2.46
## 3   0.31 Ideal     J     SI2  62.2    54   344  4.35  4.37  2.71
## 4   0.30 Ideal     I     SI2  62.0    54   348  4.31  4.34  2.68
## 5   0.33 Ideal     I     SI2  61.8    55   403  4.49  4.51  2.78
## 6   0.33 Ideal     I     SI2  61.2    56   403  4.49  4.50  2.75
## 7   0.33 Ideal     J     SI1  61.1    56   403  4.49  4.55  2.76
## 8   0.23 Ideal     G     VS1  61.9    54   404  3.93  3.95  2.44
## 9   0.32 Ideal     I     SI1  60.9    55   404  4.45  4.48  2.72
## 10  0.30 Ideal     I     SI2  61.0    59   405  4.30  4.33  2.63
## # ... with 21,541 more rows

diamonds$x %>%
  round(2) %>%
  mean()

## [1] 5.731157

tidyr package functions

library(tidyr)
cases

##   country  2011  2012  2013
## 1      FR  7000  6900  7000
## 2      DE  5800  6000  6200
## 3      US 15000 14000 13000

gather(cases,"year","n",2:4)

##   country year     n
## 1      FR 2011  7000
## 2      DE 2011  5800
## 3      US 2011 15000
## 4      FR 2012  6900
## 5      DE 2012  6000
## 6      US 2012 14000
## 7      FR 2013  7000
## 8      DE 2013  6200
## 9      US 2013 13000

pollution

##       city  size amount
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
## 6  Beijing small     56

spread(pollution,city,amount)

##    size Beijing London New York
## 1 large     121     22       23
## 2 small      56     16       14

spread(pollution,size,amount)

##       city large small
## 1  Beijing   121    56
## 2   London    22    16
## 3 New York    23    14

spread diamonds dataset

diamonds

## # A tibble: 53,940 × 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1   0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2   0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3   0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4   0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5   0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6   0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
## 7   0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
## 8   0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
## 9   0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows

diamonds$row<-1:nrow(diamonds)   #To add explicit row numbers

diamonds_d<-na.omit(diamonds)%>% 
  select(row,color,price)%>% 
  arrange(color)

spread(diamonds_d,color,price)

## # A tibble: 53,940 × 8
##      row     D     E     F     G     H     I     J
## *  <int> <int> <int> <int> <int> <int> <int> <int>
## 1      1    NA   326    NA    NA    NA    NA    NA
## 2      2    NA   326    NA    NA    NA    NA    NA
## 3      3    NA   327    NA    NA    NA    NA    NA
## 4      4    NA    NA    NA    NA    NA   334    NA
## 5      5    NA    NA    NA    NA    NA    NA   335
## 6      6    NA    NA    NA    NA    NA    NA   336
## 7      7    NA    NA    NA    NA    NA   336    NA
## 8      8    NA    NA    NA    NA   337    NA    NA
## 9      9    NA   337    NA    NA    NA    NA    NA
## 10    10    NA    NA    NA    NA   338    NA    NA
## # ... with 53,930 more rows

unite and seperate

storms

## # A tibble: 6 × 4
##     storm  wind pressure       date
##     <chr> <int>    <int>     <date>
## 1 Alberto   110     1007 2000-08-03
## 2    Alex    45     1009 1998-07-27
## 3 Allison    65     1005 1995-06-03
## 4     Ana    40     1013 1997-06-30
## 5  Arlene    50     1010 1999-06-11
## 6  Arthur    45     1010 1996-06-17

storms_sepdate<-separate(storms,date,c("year","month","day"),sep="-")

storms_sepdate

## # A tibble: 6 × 6
##     storm  wind pressure  year month   day
## *   <chr> <int>    <int> <chr> <chr> <chr>
## 1 Alberto   110     1007  2000    08    03
## 2    Alex    45     1009  1998    07    27
## 3 Allison    65     1005  1995    06    03
## 4     Ana    40     1013  1997    06    30
## 5  Arlene    50     1010  1999    06    11
## 6  Arthur    45     1010  1996    06    17

storms_unite<-unite(storms_sepdate,"date",year,month,day,sep="-")

dplyr

Ways to access information 1. Extract existing variables select() 2. Extract existing observatios filter() 3. Derive new variables mutuate() 4. Change unit of variance summarise()

storms%>% select(pressure,date)%>% filter(pressure>1000)

## # A tibble: 6 × 2
##   pressure       date
##      <int>     <date>
## 1     1007 2000-08-03
## 2     1009 1998-07-27
## 3     1005 1995-06-03
## 4     1013 1997-06-30
## 5     1010 1999-06-11
## 6     1010 1996-06-17

select(storms,pressure,date)

## # A tibble: 6 × 2
##   pressure       date
##      <int>     <date>
## 1     1007 2000-08-03
## 2     1009 1998-07-27
## 3     1005 1995-06-03
## 4     1013 1997-06-30
## 5     1010 1999-06-11
## 6     1010 1996-06-17

filter(storms,pressure>1000 & wind==110)

## # A tibble: 1 × 4
##     storm  wind pressure       date
##     <chr> <int>    <int>     <date>
## 1 Alberto   110     1007 2000-08-03

mutate(storms,ratio=pressure/wind,inverse=ratio**-1)

## # A tibble: 6 × 6
##     storm  wind pressure       date     ratio    inverse
##     <chr> <int>    <int>     <date>     <dbl>      <dbl>
## 1 Alberto   110     1007 2000-08-03  9.154545 0.10923535
## 2    Alex    45     1009 1998-07-27 22.422222 0.04459861
## 3 Allison    65     1005 1995-06-03 15.461538 0.06467662
## 4     Ana    40     1013 1997-06-30 25.325000 0.03948667
## 5  Arlene    50     1010 1999-06-11 20.200000 0.04950495
## 6  Arthur    45     1010 1996-06-17 22.444444 0.04455446

storms%>% summarise(mean=mean(pressure),variance=var(pressure),sd=sd(pressure))

## # A tibble: 1 × 3
##    mean variance      sd
##   <dbl>    <dbl>   <dbl>
## 1  1009      7.6 2.75681

storms%>% 
  mutate(ratio=pressure/wind)%>% 
  select(storm,ratio)

## # A tibble: 6 × 2
##     storm     ratio
##     <chr>     <dbl>
## 1 Alberto  9.154545
## 2    Alex 22.422222
## 3 Allison 15.461538
## 4     Ana 25.325000
## 5  Arlene 20.200000
## 6  Arthur 22.444444

arrange(diamonds,cut,color)

## # A tibble: 53,940 × 11
##    carat   cut color clarity depth table price     x     y     z   row
##    <dbl> <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
## 1   0.75  Fair     D     SI2  64.6    57  2848  5.74  5.72  3.70   677
## 2   0.71  Fair     D     VS2  56.9    65  2858  5.89  5.84  3.34   772
## 3   0.90  Fair     D     SI2  66.9    57  2885  6.02  5.90  3.99   940
## 4   1.00  Fair     D     SI2  69.3    58  2974  5.96  5.87  4.10  1439
## 5   1.01  Fair     D     SI2  64.6    56  3003  6.31  6.24  4.05  1555
## 6   0.73  Fair     D     VS1  66.0    54  3047  5.56  5.66  3.70  1769
## 7   0.71  Fair     D     VS2  64.7    58  3077  5.61  5.58  3.62  1897
## 8   0.91  Fair     D     SI2  62.5    66  3079  6.08  6.01  3.78  1904
## 9   0.90  Fair     D     SI2  65.9    59  3205  6.00  5.95  3.94  2534
## 10  0.90  Fair     D     SI2  66.0    58  3205  6.00  5.97  3.95  2535
## # ... with 53,930 more rows

arrange(storms,wind)

## # A tibble: 6 × 4
##     storm  wind pressure       date
##     <chr> <int>    <int>     <date>
## 1     Ana    40     1013 1997-06-30
## 2    Alex    45     1009 1998-07-27
## 3  Arthur    45     1010 1996-06-17
## 4  Arlene    50     1010 1999-06-11
## 5 Allison    65     1005 1995-06-03
## 6 Alberto   110     1007 2000-08-03

arrange(storms,wind,pressure)

## # A tibble: 6 × 4
##     storm  wind pressure       date
##     <chr> <int>    <int>     <date>
## 1     Ana    40     1013 1997-06-30
## 2    Alex    45     1009 1998-07-27
## 3  Arthur    45     1010 1996-06-17
## 4  Arlene    50     1010 1999-06-11
## 5 Allison    65     1005 1995-06-03
## 6 Alberto   110     1007 2000-08-03

changing unit of variance group_y()+summarise() . Changing the unit of analysis

pollution

##       city  size amount
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
## 6  Beijing small     56

pollution %>% 
  group_by(city) %>% 
  summarise(mean=mean(amount),sum=sum(amount),n=n())

## # A tibble: 3 × 4
##       city  mean   sum     n
##      <chr> <dbl> <dbl> <int>
## 1  Beijing  88.5   177     2
## 2   London  19.0    38     2
## 3 New York  18.5    37     2

pollution %>% 
    summarise(mean=mean(amount),sum=sum(amount),n=n())

##   mean sum n
## 1   42 252 6

tb

## # A tibble: 3,800 × 6
##        country  year    sex child adult elderly
## *        <chr> <int>  <chr> <int> <int>   <int>
## 1  Afghanistan  1995 female    NA    NA      NA
## 2  Afghanistan  1995   male    NA    NA      NA
## 3  Afghanistan  1996 female    NA    NA      NA
## 4  Afghanistan  1996   male    NA    NA      NA
## 5  Afghanistan  1997 female     5    96       1
## 6  Afghanistan  1997   male     0    26       0
## 7  Afghanistan  1998 female    45  1142      20
## 8  Afghanistan  1998   male    30   500      41
## 9  Afghanistan  1999 female    25   484       8
## 10 Afghanistan  1999   male     8   212       8
## # ... with 3,790 more rows

tb %>% 
  group_by(country,year) %>% 
  summarize(child_sum=sum(child),adult_sum=sum(adult))

## Source: local data frame [1,900 x 4]
## Groups: country [?]
## 
##        country  year child_sum adult_sum
##          <chr> <int>     <int>     <int>
## 1  Afghanistan  1995        NA        NA
## 2  Afghanistan  1996        NA        NA
## 3  Afghanistan  1997         5       122
## 4  Afghanistan  1998        75      1642
## 5  Afghanistan  1999        33       696
## 6  Afghanistan  2000       145      2405
## 7  Afghanistan  2001       275      4172
## 8  Afghanistan  2002       282      5820
## 9  Afghanistan  2003       372      5795
## 10 Afghanistan  2004       395      7284
## # ... with 1,890 more rows

na.omit(tb) %>% 
  group_by(country) %>% 
  summarize(child_sum=sum(child),adult_sum=sum(adult))

## # A tibble: 100 × 3
##                             country child_sum adult_sum
##                               <chr>     <int>     <int>
## 1                       Afghanistan     10710    116676
## 2                           Algeria      2734    113899
## 3                            Angola     22565    275101
## 4                         Argentina      9070     94516
## 5                        Azerbaijan      1548     27314
## 6                        Bangladesh     39014   1312443
## 7                           Belarus       234     31071
## 8                             Benin       962     45507
## 9  Bolivia (Plurinational State of)      6792    102444
## 10                         Botswana      4285     63599
## # ... with 90 more rows

na.omit(tb) %>% 
  group_by(country,year) %>% 
  summarize(child=sum(child))

## Source: local data frame [1,691 x 3]
## Groups: country [?]
## 
##        country  year child
##          <chr> <int> <int>
## 1  Afghanistan  1997     5
## 2  Afghanistan  1998    75
## 3  Afghanistan  1999    33
## 4  Afghanistan  2000   145
## 5  Afghanistan  2001   275
## 6  Afghanistan  2002   282
## 7  Afghanistan  2003   372
## 8  Afghanistan  2004   395
## 9  Afghanistan  2005   471
## 10 Afghanistan  2006   635
## # ... with 1,681 more rows

na.omit(tb) %>% 
  group_by(country,year) %>% 
  summarize(child=sum(child)) %>% 
  summarize(child=sum(child)) %>% 
  summarize(child=sum(child))

## # A tibble: 1 × 1
##     child
##     <int>
## 1 1597557

Joining data sets

st1<-storms
st2<-storms
bind_rows(st1,st2)

## # A tibble: 12 × 4
##      storm  wind pressure       date
##      <chr> <int>    <int>     <date>
## 1  Alberto   110     1007 2000-08-03
## 2     Alex    45     1009 1998-07-27
## 3  Allison    65     1005 1995-06-03
## 4      Ana    40     1013 1997-06-30
## 5   Arlene    50     1010 1999-06-11
## 6   Arthur    45     1010 1996-06-17
## 7  Alberto   110     1007 2000-08-03
## 8     Alex    45     1009 1998-07-27
## 9  Allison    65     1005 1995-06-03
## 10     Ana    40     1013 1997-06-30
## 11  Arlene    50     1010 1999-06-11
## 12  Arthur    45     1010 1996-06-17

bind_cols(st1,st2)

## # A tibble: 6 × 8
##     storm  wind pressure       date   storm  wind pressure       date
##     <chr> <int>    <int>     <date>   <chr> <int>    <int>     <date>
## 1 Alberto   110     1007 2000-08-03 Alberto   110     1007 2000-08-03
## 2    Alex    45     1009 1998-07-27    Alex    45     1009 1998-07-27
## 3 Allison    65     1005 1995-06-03 Allison    65     1005 1995-06-03
## 4     Ana    40     1013 1997-06-30     Ana    40     1013 1997-06-30
## 5  Arlene    50     1010 1999-06-11  Arlene    50     1010 1999-06-11
## 6  Arthur    45     1010 1996-06-17  Arthur    45     1010 1996-06-17

union(st1,st2)

## # A tibble: 6 × 4
##     storm  wind pressure       date
##     <chr> <int>    <int>     <date>
## 1  Arthur    45     1010 1996-06-17
## 2  Arlene    50     1010 1999-06-11
## 3     Ana    40     1013 1997-06-30
## 4 Allison    65     1005 1995-06-03
## 5    Alex    45     1009 1998-07-27
## 6 Alberto   110     1007 2000-08-03

intersect(st1,st2)

## # A tibble: 6 × 4
##     storm  wind pressure       date
##     <chr> <int>    <int>     <date>
## 1 Alberto   110     1007 2000-08-03
## 2    Alex    45     1009 1998-07-27
## 3 Allison    65     1005 1995-06-03
## 4     Ana    40     1013 1997-06-30
## 5  Arlene    50     1010 1999-06-11
## 6  Arthur    45     1010 1996-06-17

setdiff(st1,st2)

## # A tibble: 0 × 4
## # ... with 4 variables: storm <chr>, wind <int>, pressure <int>,
## #   date <date>

data(songs)
data(artists)
songs

##                  song  name
## 1 Across the Universe  John
## 2       Come Together  John
## 3      Hello, Goodbye  Paul
## 4           Peggy Sue Buddy

artists

##     name  plays
## 1 George  sitar
## 2   John guitar
## 3   Paul   bass
## 4  Ringo  drums

left_join(songs,artists,by='name')

##                  song  name  plays
## 1 Across the Universe  John guitar
## 2       Come Together  John guitar
## 3      Hello, Goodbye  Paul   bass
## 4           Peggy Sue Buddy   <NA>

right_join(songs,artists,by='name')

##                  song   name  plays
## 1                <NA> George  sitar
## 2 Across the Universe   John guitar
## 3       Come Together   John guitar
## 4      Hello, Goodbye   Paul   bass
## 5                <NA>  Ringo  drums

inner_join(songs,artists,by='name')

##                  song name  plays
## 1 Across the Universe John guitar
## 2       Come Together John guitar
## 3      Hello, Goodbye Paul   bass

semi_join(songs,artists,by='name')    # Not actuallt a join but a lookup kind of thing. After lookup Return observations which are found

##                  song name
## 1 Across the Universe John
## 2       Come Together John
## 3      Hello, Goodbye Paul

anti_join(songs,artists,by='name')    # After lookup returns not found observations

##        song  name
## 1 Peggy Sue Buddy

#install.packages('nycflights13')
library(nycflights13)

Wrangling

V M KISHORE

2/9/2017