# Source: Exploratory Data Analysis with R by Roger D. Peng
# library(tidyverse)
# objects(grep("tidyverse",search()))
# dplyr Grammar
# select()
# filter()
# arrange()
# rename()
# mutate()
# summarise / summarize
# %>% the pipe operator
# 1. The first argument is a data frame
# 2. The subsequent arguments describe what to do with the
# data frame specified in the first argument
# 3. The return result of a function is a new data frame
# 4. Data frames must be properly formatted (tidy)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nycflights13)
# Load the Chicago data set from an RDS file in the working directory
# and preview its first rows.
# NOTE(review): the working directory is a hard-coded, machine-specific
# path; the script will only run as-is where this directory exists.
setwd("C:\\Users\\Cruz\\OneDrive\\Documents\\R-Classes\\classes_R")
chicago <- readRDS(file = "chicago.rds")
head(chicago, n = 6L)
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 chic 33.0 29.875 1987-01-02 NA NA 3.304348 23.19099
## 3 chic 33.0 27.375 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 chic 29.0 28.625 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 chic 32.0 28.875 1987-01-05 NA NA 4.750000 30.33333
## 6 chic 40.0 35.125 1987-01-06 NA 48.00000 5.833333 25.77233
# Compact overview of the column names, types and first values.
utils::str(chicago)
## 'data.frame': 6940 obs. of 8 variables:
## $ city : chr "chic" "chic" "chic" "chic" ...
## $ tmpd : num 31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
## $ dptp : num 31.5 29.9 27.4 28.6 28.9 ...
## $ date : Date, format: "1987-01-01" "1987-01-02" ...
## $ pm25tmean2: num NA NA NA NA NA NA NA NA NA NA ...
## $ pm10tmean2: num 34 NA 34.2 47 NA ...
## $ o3tmean2 : num 4.25 3.3 3.33 4.38 4.75 ...
## $ no2tmean2 : num 20 23.2 23.8 30.4 30.3 ...
# Per-column summary statistics, including NA counts where present.
summary(object = chicago)
## city tmpd dptp date
## Length:6940 Min. :-16.00 Min. :-25.62 Min. :1987-01-01
## Class :character 1st Qu.: 35.00 1st Qu.: 27.00 1st Qu.:1991-10-01
## Mode :character Median : 51.00 Median : 39.88 Median :1996-07-01
## Mean : 50.31 Mean : 40.34 Mean :1996-07-01
## 3rd Qu.: 67.00 3rd Qu.: 55.75 3rd Qu.:2001-04-01
## Max. : 92.00 Max. : 78.25 Max. :2005-12-31
## NA's :1 NA's :2
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## Min. : 1.70 Min. : 2.00 Min. : 0.1528 Min. : 6.158
## 1st Qu.: 9.70 1st Qu.: 21.50 1st Qu.:10.0729 1st Qu.:19.654
## Median :14.66 Median : 30.28 Median :18.5218 Median :24.556
## Mean :16.23 Mean : 33.90 Mean :19.4355 Mean :25.232
## 3rd Qu.:20.60 3rd Qu.: 42.00 3rd Qu.:27.0010 3rd Qu.:30.139
## Max. :61.50 Max. :365.00 Max. :66.5875 Max. :62.480
## NA's :4447 NA's :242
# Number of rows and columns in the data frame.
dim(x = chicago)
## [1] 6940 8
# select() keeps a chosen set of columns; `city:dptp` picks the
# contiguous run of columns from city through dptp.
# NOTE(review): the variable name `subset` is the same as base::subset().
subset <- chicago %>% select(city:dptp)
subset %>% head()
## city tmpd dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125
# filter() extracts the rows of a data frame that satisfy a logical
# condition; here, the rows whose pm25tmean2 value exceeds 30.
chic.f <- chicago %>% filter(pm25tmean2 > 30)
chic.f %>% head()
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 23 21.9 1998-01-17 38.10 32.46154 3.180556 25.30000
## 2 chic 28 25.8 1998-01-23 33.95 38.69231 1.750000 29.37630
## 3 chic 55 51.3 1998-04-30 39.40 34.00000 10.786232 25.31310
## 4 chic 59 53.7 1998-05-01 35.40 28.50000 14.295125 31.42905
## 5 chic 57 52.0 1998-05-02 33.30 35.00000 20.662879 26.79861
## 6 chic 57 56.0 1998-05-07 32.10 34.50000 24.270422 33.99167
# Structure of the filtered data frame (same columns, fewer rows).
utils::str(chic.f)
## 'data.frame': 194 obs. of 8 variables:
## $ city : chr "chic" "chic" "chic" "chic" ...
## $ tmpd : num 23 28 55 59 57 57 75 61 73 78 ...
## $ dptp : num 21.9 25.8 51.3 53.7 52 56 65.8 59 60.3 67.1 ...
## $ date : Date, format: "1998-01-17" "1998-01-23" ...
## $ pm25tmean2: num 38.1 34 39.4 35.4 33.3 ...
## $ pm10tmean2: num 32.5 38.7 34 28.5 35 ...
## $ o3tmean2 : num 3.18 1.75 10.79 14.3 20.66 ...
## $ no2tmean2 : num 25.3 29.4 25.3 31.4 26.8 ...
# Quartiles, mean and range of the pm25tmean2 values kept by the filter.
summary(chic.f[["pm25tmean2"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.05 32.12 35.04 36.63 39.53 61.50
# The argument is written as `chic.f$pm25tmean2` on purpose: hist()
# derives its default title and x-axis label from that expression.
hist(x = chic.f$pm25tmean2)

# Several conditions passed to filter() are combined with a logical AND:
# keep rows with pm25tmean2 above 30 and tmpd above 80.
chic.f1 <- chicago %>% filter(pm25tmean2 > 30, tmpd > 80)
chic.f1 %>% select(date, tmpd, pm25tmean2)
## date tmpd pm25tmean2
## 1 1998-08-23 81 39.60000
## 2 1998-09-06 81 31.50000
## 3 2001-07-20 82 32.30000
## 4 2001-08-01 84 43.70000
## 5 2001-08-08 85 38.83750
## 6 2001-08-09 84 38.20000
## 7 2002-06-20 82 33.00000
## 8 2002-06-23 82 42.50000
## 9 2002-07-08 81 33.10000
## 10 2002-07-18 82 38.85000
## 11 2003-06-25 82 33.90000
## 12 2003-07-04 84 32.90000
## 13 2005-06-24 86 31.85714
## 14 2005-06-27 82 51.53750
## 15 2005-06-28 85 31.20000
## 16 2005-07-17 84 32.70000
## 17 2005-08-03 84 37.90000
# arrange() reorders the rows of a data frame by the values of one or
# more columns; here the rows are sorted by ascending date.
chicago <- chicago %>% arrange(date)
# First three rows of the date and pm25tmean2 columns.
chicago %>%
  select(date, pm25tmean2) %>%
  head(n = 3)
## date pm25tmean2
## 1 1987-01-01 NA
## 2 1987-01-02 NA
## 3 1987-01-03 NA
# Last three rows of the same two columns.
chicago %>%
  select(date, pm25tmean2) %>%
  tail(n = 3)
## date pm25tmean2
## 6938 2005-12-29 7.45000
## 6939 2005-12-30 15.05714
## 6940 2005-12-31 15.00000
# Sort the rows from newest to oldest date with desc(), then rename two
# columns; rename() takes pairs of the form new_name = old_name.
chicago <- chicago %>%
  arrange(desc(date)) %>%
  rename(dewpoint = dptp, pm25 = pm25tmean2)
# First three rows of the first five columns, showing the new names.
head(chicago[1:5], n = 3)
## city tmpd dewpoint date pm25
## 1 chic 35 30.1 2005-12-31 15.00000
## 2 chic 36 31.0 2005-12-30 15.05714
## 3 chic 35 29.4 2005-12-29 7.45000
# The mutate() function exists to compute transformations of variables
# in a data frame.
# For example, with air pollution data we often want to detrend the
# data by subtracting the mean from it. That way we can see whether a
# given day's air pollution level is higher or lower than the average
# (as opposed to looking at its absolute level).
# Here we create a pm25detrend variable that subtracts the mean of pm25
# (ignoring missing values) from the pm25 variable.
chicago <- chicago %>%
  mutate(pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
chicago %>% head()
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## 4 chic 37 34.5 2005-12-28 17.75000 27.5 3.260417 19.28563
## 5 chic 40 33.6 2005-12-27 23.56000 27.0 4.468750 23.50000
## 6 chic 35 29.6 2005-12-26 8.40000 8.5 14.041667 16.81944
## pm25detrend
## 1 -1.230958
## 2 -1.173815
## 3 -8.780958
## 4 1.519042
## 5 7.329042
## 6 -7.830958
# There is also the related transmute() function, which does the same
# thing as mutate() but then drops all non-transformed variables, so
# only the two detrended columns remain in the result.
chicago %>%
  transmute(
    pm10detrend = pm10tmean2 - mean(pm10tmean2, na.rm = TRUE),
    o3detrend = o3tmean2 - mean(o3tmean2, na.rm = TRUE)
  ) %>%
  head()
## pm10detrend o3detrend
## 1 -10.395206 -16.904263
## 2 -14.695206 -16.401093
## 3 -10.395206 -12.640676
## 4 -6.395206 -16.175096
## 5 -6.895206 -14.966763
## 6 -25.395206 -5.393846
# group_by()
# the group_by() function defines the groups over which summary
# statistics are then computed
# see page 13 for details
# the general operation here is a combination of splitting a data
# frame into separate pieces defined by a variable or group of
# variables (group_by()), and then applying a summary function
# across those subsets (summarize()).
# First, derive a year column with as.POSIXlt(): its $year component
# counts years since 1900, so 1900 is added back.
chicago <- chicago %>%
  mutate(year = 1900 + as.POSIXlt(date)$year)
# Next, build a grouped copy of the data frame with one group per year.
years <- chicago %>% group_by(year)
# Finally, collapse each year group to one row with summarise(): the
# mean of pm25, the maximum of o3tmean2 and the median of no2tmean2,
# each computed with missing values removed.
years %>%
  summarise(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2, na.rm = TRUE),
    no2 = median(no2tmean2, na.rm = TRUE)
  )
## # A tibble: 19 x 4
## year pm25 o3 no2
## <dbl> <dbl> <dbl> <dbl>
## 1 1987 NaN 62.96966 23.49369
## 2 1988 NaN 61.67708 24.52296
## 3 1989 NaN 59.72727 26.14062
## 4 1990 NaN 52.22917 22.59583
## 5 1991 NaN 63.10417 21.38194
## 6 1992 NaN 50.82870 24.78921
## 7 1993 NaN 44.30093 25.76993
## 8 1994 NaN 52.17844 28.47500
## 9 1995 NaN 66.58750 27.26042
## 10 1996 NaN 58.39583 26.38715
## 11 1997 NaN 56.54167 25.48143
## 12 1998 18.26467 50.66250 24.58649
## 13 1999 18.49646 57.48864 24.66667
## 14 2000 16.93806 55.76103 23.46082
## 15 2001 16.92632 51.81984 25.06522
## 16 2002 15.27335 54.88043 22.73750
## 17 2003 15.23183 56.16608 24.62500
## 18 2004 14.62864 44.48240 23.39130
## 19 2005 16.18556 58.84126 22.62387