setwd('/Users/nohaelprince/Google Drive/DataScience/Rtut/EDAScripts')
chicago <- readRDS("data/chicago.rds")
dim(chicago)
## [1] 6940 8
str(chicago)
## 'data.frame': 6940 obs. of 8 variables:
## $ city : chr "chic" "chic" "chic" "chic" ...
## $ tmpd : num 31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
## $ dptp : num 31.5 29.9 27.4 28.6 28.9 ...
## $ date : Date, format: "1987-01-01" "1987-01-02" ...
## $ pm25tmean2: num NA NA NA NA NA NA NA NA NA NA ...
## $ pm10tmean2: num 34 NA 34.2 47 NA ...
## $ o3tmean2 : num 4.25 3.3 3.33 4.38 4.75 ...
## $ no2tmean2 : num 20 23.2 23.8 30.4 30.3 ...
names(chicago[1:3])
## [1] "city" "tmpd" "dptp"
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
subset <- select(chicago, city:dptp)
head(subset)
## city tmpd dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125
# omit some variables
s1 <- select(chicago, -(city:dptp))
head(s1)
## date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 1987-01-02 NA NA 3.304348 23.19099
## 3 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 1987-01-05 NA NA 4.750000 30.33333
## 6 1987-01-06 NA 48.00000 5.833333 25.77233
# keep variables that start with a "d"
names(chicago)
## [1] "city" "tmpd" "dptp" "date" "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2" "no2tmean2"
s2 <- select(chicago, starts_with("d"))
head(s2)
## dptp date
## 1 31.500 1987-01-01
## 2 29.875 1987-01-02
## 3 27.375 1987-01-03
## 4 28.625 1987-01-04
## 5 28.875 1987-01-05
## 6 35.125 1987-01-06
# keep variables that ends with 2
s3 <- select(chicago, ends_with("2"))
names(s3)
## [1] "pm25tmean2" "pm10tmean2" "o3tmean2" "no2tmean2"
# The filter() function is used to extract subsets of rows from a data frame. This function is similar to the existing subset() function in R but is quite a bit faster
chic.f <- filter(chicago, pm25tmean2 > 30)
str(chic.f)
## 'data.frame': 194 obs. of 8 variables:
## $ city : chr "chic" "chic" "chic" "chic" ...
## $ tmpd : num 23 28 55 59 57 57 75 61 73 78 ...
## $ dptp : num 21.9 25.8 51.3 53.7 52 56 65.8 59 60.3 67.1 ...
## $ date : Date, format: "1998-01-17" "1998-01-23" ...
## $ pm25tmean2: num 38.1 34 39.4 35.4 33.3 ...
## $ pm10tmean2: num 32.5 38.7 34 28.5 35 ...
## $ o3tmean2 : num 3.18 1.75 10.79 14.3 20.66 ...
## $ no2tmean2 : num 25.3 29.4 25.3 31.4 26.8 ...
# there are only 194 rows in the dataframe and the distribution of pm25tmean2 is:
summary(chic.f$pm25tmean2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.05 32.12 35.04 36.63 39.53 61.50
boxplot(chic.f$pm25tmean2, col=c("cyan")) # data is right skewed
# order rows by date (asc)
chicago <- arrange(chicago, date)
head(chicago)
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 chic 33.0 29.875 1987-01-02 NA NA 3.304348 23.19099
## 3 chic 33.0 27.375 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 chic 29.0 28.625 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 chic 32.0 28.875 1987-01-05 NA NA 4.750000 30.33333
## 6 chic 40.0 35.125 1987-01-06 NA 48.00000 5.833333 25.77233
# order rows by date(desc)
chicago <- arrange(chicago, desc(date))
head(chicago)
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## 4 chic 37 34.5 2005-12-28 17.75000 27.5 3.260417 19.28563
## 5 chic 40 33.6 2005-12-27 23.56000 27.0 4.468750 23.50000
## 6 chic 35 29.6 2005-12-26 8.40000 8.5 14.041667 16.81944
# rename columns dptp to dewpoint and pm25tmean2 to pm25
chicago <- rename(chicago, dewpoint = dptp, pm25 = pm25tmean2)
head(chicago)
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## 4 chic 37 34.5 2005-12-28 17.75000 27.5 3.260417 19.28563
## 5 chic 40 33.6 2005-12-27 23.56000 27.0 4.468750 23.50000
## 6 chic 35 29.6 2005-12-26 8.40000 8.5 14.041667 16.81944
# we often want to detrend the data by subtracting the mean from the data. That way we can look at whether a given day’s air pollution level is higher than or less than average (as opposed to looking at its absolute level). Here we create a pm25detrend variable that subtracts the mean from the pm25 variable.
chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(chicago)
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## 4 chic 37 34.5 2005-12-28 17.75000 27.5 3.260417 19.28563
## 5 chic 40 33.6 2005-12-27 23.56000 27.0 4.468750 23.50000
## 6 chic 35 29.6 2005-12-26 8.40000 8.5 14.041667 16.81944
## pm25detrend
## 1 -1.230958
## 2 -1.173815
## 3 -8.780958
## 4 1.519042
## 5 7.329042
## 6 -7.830958
# mutate is rather faster than transform
# transmute() function, which does the same thing as mutate() but then drops all non-transformed variables.
head(transmute(chicago, pm10detrend = pm10tmean2 - mean(pm10tmean2, na.rm = TRUE), o3detrend = o3tmean2 - mean(o3tmean2, na.rm = TRUE)))
## pm10detrend o3detrend
## 1 -10.395206 -16.904263
## 2 -14.695206 -16.401093
## 3 -10.395206 -12.640676
## 4 -6.395206 -16.175096
## 5 -6.895206 -14.966763
## 6 -25.395206 -5.393846
# Note that there are only two columns in the transmuted data frame.
# in this air pollution dataset, you might want to know what the average annual level of PM2.5 is. So the stratum is the year.
#The general operation here is a combination of splitting a data frame into separate pieces defined by a variable or group of variables (group_by()), and then applying a summary function across those subsets (summarize()).
chicago <- mutate(chicago, year = as.POSIXlt(date)$year + 1900)
head(chicago)
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## 4 chic 37 34.5 2005-12-28 17.75000 27.5 3.260417 19.28563
## 5 chic 40 33.6 2005-12-27 23.56000 27.0 4.468750 23.50000
## 6 chic 35 29.6 2005-12-26 8.40000 8.5 14.041667 16.81944
## pm25detrend year
## 1 -1.230958 2005
## 2 -1.173815 2005
## 3 -8.780958 2005
## 4 1.519042 2005
## 5 7.329042 2005
## 6 -7.830958 2005
# Now we can create a separate data frame that splits the original data frame by year.
years <- group_by(chicago, year)
# Finally, we compute summary statistics for each year in the data frame with the summarize() function.
summarize(years, pm25 = mean(pm25, na.rm = TRUE),
o3 = max(o3tmean2, na.rm = TRUE),
no2 = median(no2tmean2, na.rm = TRUE))
## Source: local data frame [19 x 4]
##
## year pm25 o3 no2
## (dbl) (dbl) (dbl) (dbl)
## 1 1987 NaN 62.96966 23.49369
## 2 1988 NaN 61.67708 24.52296
## 3 1989 NaN 59.72727 26.14062
## 4 1990 NaN 52.22917 22.59583
## 5 1991 NaN 63.10417 21.38194
## 6 1992 NaN 50.82870 24.78921
## 7 1993 NaN 44.30093 25.76993
## 8 1994 NaN 52.17844 28.47500
## 9 1995 NaN 66.58750 27.26042
## 10 1996 NaN 58.39583 26.38715
## 11 1997 NaN 56.54167 25.48143
## 12 1998 18.26467 50.66250 24.58649
## 13 1999 18.49646 57.48864 24.66667
## 14 2000 16.93806 55.76103 23.46082
## 15 2001 16.92632 51.81984 25.06522
## 16 2002 15.27335 54.88043 22.73750
## 17 2003 15.23183 56.16608 24.62500
## 18 2004 14.62864 44.48240 23.39130
## 19 2005 16.18556 58.84126 22.62387
# summarize() returns a data frame with year as the first column, and then the annual averages of pm25, o3, and no2.
# The pipeline operater %>% is very handy for stringing together multiple dplyr functions in a sequence of operations
# the above commands can be written as:
#1. create a new variable pm25.quint
#2. split the data frame by that new variable
#3. compute the mean of o3 and no2 in the sub-groups defined by pm25.quint
#mutate(chicago, pm25.quint = cut(pm25, qq)) %>%
# group_by(pm25.quint) %>%
# summarize(o3 = mean(o3tmean2, na.rm = TRUE),
# no2 = mean(no2tmean2, na.rm = TRUE))
# Another example might be computing the average pollutant level by month. This could be useful to see if there are any seasonal trends in the data.
mutate(chicago, month = as.POSIXlt(date)$mon + 1) %>%
group_by(month) %>%
summarize(pm25 = mean(pm25, na.rm = TRUE),
o3 = max(o3tmean2, na.rm = TRUE),
no2 = median(no2tmean2, na.rm = TRUE))
## Source: local data frame [12 x 4]
##
## month pm25 o3 no2
## (dbl) (dbl) (dbl) (dbl)
## 1 1 17.76996 28.22222 25.35417
## 2 2 20.37513 37.37500 26.78034
## 3 3 17.40818 39.05000 26.76984
## 4 4 13.85879 47.94907 25.03125
## 5 5 14.07420 52.75000 24.22222
## 6 6 15.86461 66.58750 25.01140
## 7 7 16.57087 59.54167 22.38442
## 8 8 16.93380 53.96701 22.98333
## 9 9 15.91279 57.48864 24.47917
## 10 10 14.23557 47.09275 24.15217
## 11 11 15.15794 29.45833 23.56537
## 12 12 17.52221 27.70833 24.45773
Here we can see that o3 tends to be low in the winter months and high in the summer while no2 is higher in the winter and lower in the summer.