Managing Data Frames with dplyr

##Setup: load dplyr and read the chicago data

#load the package, ignore warnings
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# set the maximum number of columns per line used when printing
options(width = 105)

# read the data set stored in the RDS file
chicago <- readRDS(file = "./data/chicago.rds")

# inspect the data: number of rows and columns
dim(chicago)

## [1] 6940    8

#[1] 6940    8

# show the structure: each column's name, type and first values
str(chicago)

## 'data.frame':    6940 obs. of  8 variables:
##  $ city      : chr  "chic" "chic" "chic" "chic" ...
##  $ tmpd      : num  31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
##  $ dptp      : num  31.5 29.9 27.4 28.6 28.9 ...
##  $ date      : Date, format: "1987-01-01" "1987-01-02" "1987-01-03" ...
##  $ pm25tmean2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pm10tmean2: num  34 NA 34.2 47 NA ...
##  $ o3tmean2  : num  4.25 3.3 3.33 4.38 4.75 ...
##  $ no2tmean2 : num  20 23.2 23.8 30.4 30.3 ...

# list the column names
names(chicago)

## [1] "city"       "tmpd"       "dptp"       "date"       "pm25tmean2" "pm10tmean2" "o3tmean2"  
## [8] "no2tmean2"

##select() function

# select() a contiguous range of columns with `:` (city through dptp)
head(select(chicago, city:dptp))

##   city tmpd   dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125

# exclude a range of columns by negating the selection with `-`
head(select(chicago, -(city:dptp)))

##         date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 1987-01-02         NA         NA 3.304348  23.19099
## 3 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 1987-01-05         NA         NA 4.750000  30.33333
## 6 1987-01-06         NA   48.00000 5.833333  25.77233

##filter() function

# filter() keeps only the rows that satisfy a logical condition
# (rows where the condition evaluates to NA are dropped as well)
chic.f <- filter(chicago, pm25tmean2 > 30)
head(chic.f, n = 5)

##   city tmpd dptp       date pm25tmean2 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   23 21.9 1998-01-17      38.10   32.46154  3.180556  25.30000
## 2 chic   28 25.8 1998-01-23      33.95   38.69231  1.750000  29.37630
## 3 chic   55 51.3 1998-04-30      39.40   34.00000 10.786232  25.31310
## 4 chic   59 53.7 1998-05-01      35.40   28.50000 14.295125  31.42905
## 5 chic   57 52.0 1998-05-02      33.30   35.00000 20.662879  26.79861

# a more complex example: combine two conditions with the and operator `&`
chic.f <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
head(chic.f, n = 5)

##   city tmpd dptp       date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic   81 71.2 1998-08-23    39.6000       59.0 45.86364  14.32639
## 2 chic   81 70.4 1998-09-06    31.5000       50.5 50.66250  20.31250
## 3 chic   82 72.2 2001-07-20    32.3000       58.5 33.00380  33.67500
## 4 chic   84 72.9 2001-08-01    43.7000       81.5 45.17736  27.44239
## 5 chic   85 72.6 2001-08-08    38.8375       70.0 37.98047  27.62743

##arrange() function

# arrange() reorders the rows by the values of a column (ascending by default)
chicago <- arrange(chicago, date)
head(chicago)

##   city tmpd   dptp       date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 chic 33.0 29.875 1987-01-02         NA         NA 3.304348  23.19099
## 3 chic 33.0 27.375 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 chic 29.0 28.625 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 chic 32.0 28.875 1987-01-05         NA         NA 4.750000  30.33333
## 6 chic 40.0 35.125 1987-01-06         NA   48.00000 5.833333  25.77233

# arrange in descending order by wrapping the column in desc()
chicago <- arrange(chicago, desc(date))

##rename() function

# rename() changes column names using new_name = old_name pairs
chicago <- rename(
  chicago,
  pm25 = pm25tmean2,
  dewpoint = dptp
)
head(chicago)

##   city tmpd dewpoint       date     pm25 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5  2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2  3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5  6.794837  19.97222
## 4 chic   37     34.5 2005-12-28 17.75000       27.5  3.260417  19.28563
## 5 chic   40     33.6 2005-12-27 23.56000       27.0  4.468750  23.50000
## 6 chic   35     29.6 2005-12-26  8.40000        8.5 14.041667  16.81944

##mutate() function

# mutate() transforms existing columns or creates new ones;
# here pm25 is centered by subtracting its mean (missing values ignored)
chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(select(chicago, pm25, pm25detrend))

##       pm25 pm25detrend
## 1 15.00000   -1.230958
## 2 15.05714   -1.173815
## 3  7.45000   -8.780958
## 4 17.75000    1.519042
## 5 23.56000    7.329042
## 6  8.40000   -7.830958

##group_by() function

# group_by() splits a data frame by categorical variables
# first let's create a new temperature category variable 'temcat'
# to see if the temperature was hot or cold on a particular day, depending
# on whether the temperature was > 80°F or not.
# The levels are declared explicitly so the "cold"/"hot" labels always map to
# FALSE/TRUE, even if the data contained only one of the two cases;
# days with a missing temperature stay NA.
chicago <- mutate(
  chicago,
  temcat = factor(
    tmpd > 80,
    levels = c(FALSE, TRUE),
    labels = c("cold", "hot")
  )
)
# use group_by() to create a grouped data frame
hotcold <- group_by(chicago, temcat)
str(hotcold)

## tibble [6,940 x 10] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ city       : chr [1:6940] "chic" "chic" "chic" "chic" ...
##  $ tmpd       : num [1:6940] 35 36 35 37 40 35 35 37 41 22 ...
##  $ dewpoint   : num [1:6940] 30.1 31 29.4 34.5 33.6 29.6 32.1 35.2 32.6 23.3 ...
##  $ date       : Date[1:6940], format: "2005-12-31" "2005-12-30" "2005-12-29" ...
##  $ pm25       : num [1:6940] 15 15.06 7.45 17.75 23.56 ...
##  $ pm10tmean2 : num [1:6940] 23.5 19.2 23.5 27.5 27 8.5 8 25.2 34.5 42.5 ...
##  $ o3tmean2   : num [1:6940] 2.53 3.03 6.79 3.26 4.47 ...
##  $ no2tmean2  : num [1:6940] 13.2 22.8 20 19.3 23.5 ...
##  $ pm25detrend: num [1:6940] -1.23 -1.17 -8.78 1.52 7.33 ...
##  $ temcat     : Factor w/ 2 levels "cold","hot": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "groups")= tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ temcat: Factor w/ 2 levels "cold","hot": 1 2 NA
##   ..$ .rows : list<int> [1:3] 
##   .. ..$ : int [1:6737] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ : int [1:202] 110 150 151 153 160 161 167 168 169 185 ...
##   .. ..$ : int 1037
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

# with no summary expressions, summarise() returns one row per group
summarise(hotcold)

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 1
##   temcat
##   <fct> 
## 1 cold  
## 2 hot   
## 3 <NA>

#  temcat
#  <fct> 
#1 cold  
#2 hot   
#3 NA  
# the data frame has NA values, so every aggregate is given na.rm = TRUE
summarise(
  hotcold,
  pm25 = mean(pm25, na.rm = TRUE),
  o3 = max(o3tmean2, na.rm = TRUE),
  no2 = median(no2tmean2, na.rm = TRUE)
)

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 4
##   temcat  pm25    o3   no2
##   <fct>  <dbl> <dbl> <dbl>
## 1 cold    16.0 66.6   24.5
## 2 hot     26.5 63.0   24.9
## 3 <NA>    47.7  9.42  37.4

##group_by() with years example

# create a new variable, 'year', extracted from the date
# (POSIXlt stores the year as an offset from 1900, hence the + 1900)
chicago <- mutate(chicago, year = as.POSIXlt(date)$year + 1900)
years <- group_by(chicago, year)

# summarise the pollutants per year, ignoring missing values in every aggregate;
# years without any pm25 measurement yield NaN for the mean
summarise(
  years,
  pm25 = mean(pm25, na.rm = TRUE),
  o3 = max(o3tmean2, na.rm = TRUE),
  no2 = median(no2tmean2, na.rm = TRUE)
)

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 19 x 4
##     year  pm25    o3   no2
##    <dbl> <dbl> <dbl> <dbl>
##  1  1987 NaN    63.0  23.5
##  2  1988 NaN    61.7  24.5
##  3  1989 NaN    59.7  26.1
##  4  1990 NaN    52.2  22.6
##  5  1991 NaN    63.1  21.4
##  6  1992 NaN    50.8  24.8
##  7  1993 NaN    44.3  25.8
##  8  1994 NaN    52.2  28.5
##  9  1995 NaN    66.6  27.3
## 10  1996 NaN    58.4  26.4
## 11  1997 NaN    56.5  25.5
## 12  1998  18.3  50.7  24.6
## 13  1999  18.5  57.5  24.7
## 14  2000  16.9  55.8  23.5
## 15  2001  16.9  51.8  25.1
## 16  2002  15.3  54.9  22.7
## 17  2003  15.2  56.2  24.6
## 18  2004  14.6  44.5  23.4
## 19  2005  16.2  58.8  22.6

##Using a pipeline %>%

# A pipe allows you to chain operations in a readable and powerful way:
# the result of each step becomes the first argument of the next call.
# dplyr can also work with other data frame "backends",
# e.g. data.table for large, fast tables
# and a SQL interface through the DBI package.

# POSIXlt months are numbered from 0, hence the + 1
chicago %>%
  mutate(month = as.POSIXlt(date)$mon + 1) %>%
  group_by(month) %>%
  summarise(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2, na.rm = TRUE),
    no2 = median(no2tmean2, na.rm = TRUE)
  )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 12 x 4
##    month  pm25    o3   no2
##    <dbl> <dbl> <dbl> <dbl>
##  1     1  17.8  28.2  25.4
##  2     2  20.4  37.4  26.8
##  3     3  17.4  39.0  26.8
##  4     4  13.9  47.9  25.0
##  5     5  14.1  52.8  24.2
##  6     6  15.9  66.6  25.0
##  7     7  16.6  59.5  22.4
##  8     8  16.9  54.0  23.0
##  9     9  15.9  57.5  24.5
## 10    10  14.2  47.1  24.2
## 11    11  15.2  29.5  23.6
## 12    12  17.5  27.7  24.5

This is an R Markdown document; see the blog post for finer details.

Managing Data Frames with dplyr

Linda Angulo Lopez

03/07/2020

using a pipeline %>%