Analysis01

Import data (Chicago air pollution) & summarize

setwd('/Users/nohaelprince/Google Drive/DataScience/Rtut/EDAScripts')
chicago <- readRDS("data/chicago.rds") 
dim(chicago)

## [1] 6940    8

str(chicago)

## 'data.frame':    6940 obs. of  8 variables:
##  $ city      : chr  "chic" "chic" "chic" "chic" ...
##  $ tmpd      : num  31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
##  $ dptp      : num  31.5 29.9 27.4 28.6 28.9 ...
##  $ date      : Date, format: "1987-01-01" "1987-01-02" ...
##  $ pm25tmean2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pm10tmean2: num  34 NA 34.2 47 NA ...
##  $ o3tmean2  : num  4.25 3.3 3.33 4.38 4.75 ...
##  $ no2tmean2 : num  20 23.2 23.8 30.4 30.3 ...

exploration using dplyr

names(chicago[1:3])

## [1] "city" "tmpd" "dptp"

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

subset <- select(chicago, city:dptp)
head(subset)

##   city tmpd   dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125

# omit some variables
s1 <- select(chicago, -(city:dptp))
head(s1)

##         date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 1987-01-02         NA         NA 3.304348  23.19099
## 3 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 1987-01-05         NA         NA 4.750000  30.33333
## 6 1987-01-06         NA   48.00000 5.833333  25.77233

# keep variables that start with a "d"
names(chicago)

## [1] "city"       "tmpd"       "dptp"       "date"       "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2"   "no2tmean2"

s2 <- select(chicago, starts_with("d"))
head(s2)

##     dptp       date
## 1 31.500 1987-01-01
## 2 29.875 1987-01-02
## 3 27.375 1987-01-03
## 4 28.625 1987-01-04
## 5 28.875 1987-01-05
## 6 35.125 1987-01-06

# keep variables that ends with 2
s3 <- select(chicago, ends_with("2"))
names(s3)

## [1] "pm25tmean2" "pm10tmean2" "o3tmean2"   "no2tmean2"

# The filter() function is used to extract subsets of rows from a data frame. This function is similar to the existing subset() function in R but is quite a bit faster 

chic.f <- filter(chicago, pm25tmean2 > 30)
str(chic.f)

## 'data.frame':    194 obs. of  8 variables:
##  $ city      : chr  "chic" "chic" "chic" "chic" ...
##  $ tmpd      : num  23 28 55 59 57 57 75 61 73 78 ...
##  $ dptp      : num  21.9 25.8 51.3 53.7 52 56 65.8 59 60.3 67.1 ...
##  $ date      : Date, format: "1998-01-17" "1998-01-23" ...
##  $ pm25tmean2: num  38.1 34 39.4 35.4 33.3 ...
##  $ pm10tmean2: num  32.5 38.7 34 28.5 35 ...
##  $ o3tmean2  : num  3.18 1.75 10.79 14.3 20.66 ...
##  $ no2tmean2 : num  25.3 29.4 25.3 31.4 26.8 ...

# there are only 194 rows in the dataframe and the distribution of pm25tmean2 is:
summary(chic.f$pm25tmean2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30.05   32.12   35.04   36.63   39.53   61.50

boxplot(chic.f$pm25tmean2, col=c("cyan")) # data is right skewed

# order rows by date (asc)
chicago <- arrange(chicago, date) 
head(chicago)

##   city tmpd   dptp       date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 chic 33.0 29.875 1987-01-02         NA         NA 3.304348  23.19099
## 3 chic 33.0 27.375 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 chic 29.0 28.625 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 chic 32.0 28.875 1987-01-05         NA         NA 4.750000  30.33333
## 6 chic 40.0 35.125 1987-01-06         NA   48.00000 5.833333  25.77233

# order rows by date(desc)
chicago <- arrange(chicago, desc(date))
head(chicago)

##   city tmpd dptp       date pm25tmean2 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   35 30.1 2005-12-31   15.00000       23.5  2.531250  13.25000
## 2 chic   36 31.0 2005-12-30   15.05714       19.2  3.034420  22.80556
## 3 chic   35 29.4 2005-12-29    7.45000       23.5  6.794837  19.97222
## 4 chic   37 34.5 2005-12-28   17.75000       27.5  3.260417  19.28563
## 5 chic   40 33.6 2005-12-27   23.56000       27.0  4.468750  23.50000
## 6 chic   35 29.6 2005-12-26    8.40000        8.5 14.041667  16.81944

# rename columns dptp to dewpoint and pm25tmean2 to pm25
chicago <- rename(chicago, dewpoint = dptp, pm25 = pm25tmean2)
head(chicago)

##   city tmpd dewpoint       date     pm25 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5  2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2  3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5  6.794837  19.97222
## 4 chic   37     34.5 2005-12-28 17.75000       27.5  3.260417  19.28563
## 5 chic   40     33.6 2005-12-27 23.56000       27.0  4.468750  23.50000
## 6 chic   35     29.6 2005-12-26  8.40000        8.5 14.041667  16.81944

data transformations

# we often want to detrend the data by subtracting the mean from the data. That way we can look at whether a given day’s air pollution level is higher than or less than average (as opposed to looking at its absolute level). Here we create a pm25detrend variable that subtracts the mean from the pm25 variable.

chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(chicago)

##   city tmpd dewpoint       date     pm25 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5  2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2  3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5  6.794837  19.97222
## 4 chic   37     34.5 2005-12-28 17.75000       27.5  3.260417  19.28563
## 5 chic   40     33.6 2005-12-27 23.56000       27.0  4.468750  23.50000
## 6 chic   35     29.6 2005-12-26  8.40000        8.5 14.041667  16.81944
##   pm25detrend
## 1   -1.230958
## 2   -1.173815
## 3   -8.780958
## 4    1.519042
## 5    7.329042
## 6   -7.830958

# mutate is rather faster than transform
# transmute() function, which does the same thing as mutate() but then drops all non-transformed variables.
head(transmute(chicago, pm10detrend = pm10tmean2 - mean(pm10tmean2, na.rm = TRUE), o3detrend = o3tmean2 - mean(o3tmean2, na.rm = TRUE)))

##   pm10detrend  o3detrend
## 1  -10.395206 -16.904263
## 2  -14.695206 -16.401093
## 3  -10.395206 -12.640676
## 4   -6.395206 -16.175096
## 5   -6.895206 -14.966763
## 6  -25.395206  -5.393846

# Note that there are only two columns in the transmuted data frame.

generate summary statistics

# in this air pollution dataset, you might want to know what the average annual level of PM2.5 is. So the stratum is the year.
#The general operation here is a combination of splitting a data frame into separate pieces defined by a variable or group of variables (group_by()), and then applying a summary function across those subsets (summarize()).

chicago <- mutate(chicago, year = as.POSIXlt(date)$year + 1900)
head(chicago)

##   city tmpd dewpoint       date     pm25 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5  2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2  3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5  6.794837  19.97222
## 4 chic   37     34.5 2005-12-28 17.75000       27.5  3.260417  19.28563
## 5 chic   40     33.6 2005-12-27 23.56000       27.0  4.468750  23.50000
## 6 chic   35     29.6 2005-12-26  8.40000        8.5 14.041667  16.81944
##   pm25detrend year
## 1   -1.230958 2005
## 2   -1.173815 2005
## 3   -8.780958 2005
## 4    1.519042 2005
## 5    7.329042 2005
## 6   -7.830958 2005

# Now we can create a separate data frame that splits the original data frame by year.
years <- group_by(chicago, year)
# Finally, we compute summary statistics for each year in the data frame with the summarize() function.
summarize(years, pm25 = mean(pm25, na.rm = TRUE),
                 o3 = max(o3tmean2, na.rm = TRUE),
                 no2 = median(no2tmean2, na.rm = TRUE))

## Source: local data frame [19 x 4]
## 
##     year     pm25       o3      no2
##    (dbl)    (dbl)    (dbl)    (dbl)
## 1   1987      NaN 62.96966 23.49369
## 2   1988      NaN 61.67708 24.52296
## 3   1989      NaN 59.72727 26.14062
## 4   1990      NaN 52.22917 22.59583
## 5   1991      NaN 63.10417 21.38194
## 6   1992      NaN 50.82870 24.78921
## 7   1993      NaN 44.30093 25.76993
## 8   1994      NaN 52.17844 28.47500
## 9   1995      NaN 66.58750 27.26042
## 10  1996      NaN 58.39583 26.38715
## 11  1997      NaN 56.54167 25.48143
## 12  1998 18.26467 50.66250 24.58649
## 13  1999 18.49646 57.48864 24.66667
## 14  2000 16.93806 55.76103 23.46082
## 15  2001 16.92632 51.81984 25.06522
## 16  2002 15.27335 54.88043 22.73750
## 17  2003 15.23183 56.16608 24.62500
## 18  2004 14.62864 44.48240 23.39130
## 19  2005 16.18556 58.84126 22.62387

# summarize() returns a data frame with year as the first column, and then the annual averages of pm25, o3, and no2.

# The pipeline operater %>% is very handy for stringing together multiple dplyr functions in a sequence of operations
# the above commands can be written as:
#1. create a new variable pm25.quint
#2. split the data frame by that new variable
#3. compute the mean of o3 and no2 in the sub-groups defined by pm25.quint

#mutate(chicago, pm25.quint = cut(pm25, qq)) %>%
 #     group_by(pm25.quint) %>%
 #     summarize(o3 = mean(o3tmean2, na.rm = TRUE),
 #               no2 = mean(no2tmean2, na.rm = TRUE))


# Another example might be computing the average pollutant level by month. This could be useful to see if there are any seasonal trends in the data.
mutate(chicago, month = as.POSIXlt(date)$mon + 1) %>%
                group_by(month) %>%
                summarize(pm25 = mean(pm25, na.rm = TRUE),
                          o3 = max(o3tmean2, na.rm = TRUE),
                          no2 = median(no2tmean2, na.rm = TRUE))

## Source: local data frame [12 x 4]
## 
##    month     pm25       o3      no2
##    (dbl)    (dbl)    (dbl)    (dbl)
## 1      1 17.76996 28.22222 25.35417
## 2      2 20.37513 37.37500 26.78034
## 3      3 17.40818 39.05000 26.76984
## 4      4 13.85879 47.94907 25.03125
## 5      5 14.07420 52.75000 24.22222
## 6      6 15.86461 66.58750 25.01140
## 7      7 16.57087 59.54167 22.38442
## 8      8 16.93380 53.96701 22.98333
## 9      9 15.91279 57.48864 24.47917
## 10    10 14.23557 47.09275 24.15217
## 11    11 15.15794 29.45833 23.56537
## 12    12 17.52221 27.70833 24.45773

Here we can see that o3 tends to be low in the winter months and high in the summer while no2 is higher in the winter and lower in the summer.

Note: dplyr can work with other data frame “backends” such as SQL databases. There is an SQL interface for relational databases via the DBI package
dplyr can be integrated with the data.table package for large fast tables

Analysis01

Noha Elprince

February 15, 2016

Import data (Chicago air pollution) & summarize

exploration using dplyr

data transformations

generate summary statistics