# exploratory_data_analysis.R

# Source: "Exploratory Data Analysis with R" by Roger D. Peng
# library(tidyverse)
# objects(grep("tidyverse",search()))

# dplyr Grammar
# select()
# filter()
# arrange()
# rename()
# mutate()
# summarise / summarize
# %>% the pipe operator
# 1. The first argument is a data frame
# 2. The subsequent arguments describe what to do with the
# data frame specified in the first argument
# 3. The return result of a function is a new data frame
# 4. Data frames must be properly formatted (tidy) 
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(nycflights13)
# Load the Chicago data set without changing the session's working
# directory: setwd() is a global side effect and the hard-coded path only
# exists on one machine. The folder defaults to the original location and
# can be overridden with the CHICAGO_DATA_DIR environment variable.
data_dir <- Sys.getenv(
  "CHICAGO_DATA_DIR",
  unset = "C:\\Users\\Cruz\\OneDrive\\Documents\\R-Classes\\classes_R"
)
chicago_path <- file.path(data_dir, "chicago.rds")
# Fail early with a message that names the file that was looked for.
if (!file.exists(chicago_path)) {
  stop("Cannot find data file: ", chicago_path, call. = FALSE)
}
chicago <- readRDS(chicago_path)
# Preview the first rows of the data frame.
head(chicago)

##   city tmpd   dptp       date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 chic 33.0 29.875 1987-01-02         NA         NA 3.304348  23.19099
## 3 chic 33.0 27.375 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 chic 29.0 28.625 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 chic 32.0 28.875 1987-01-05         NA         NA 4.750000  30.33333
## 6 chic 40.0 35.125 1987-01-06         NA   48.00000 5.833333  25.77233

# Show each column's type together with its first few values.
chicago %>% str()

## 'data.frame':    6940 obs. of  8 variables:
##  $ city      : chr  "chic" "chic" "chic" "chic" ...
##  $ tmpd      : num  31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
##  $ dptp      : num  31.5 29.9 27.4 28.6 28.9 ...
##  $ date      : Date, format: "1987-01-01" "1987-01-02" ...
##  $ pm25tmean2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pm10tmean2: num  34 NA 34.2 47 NA ...
##  $ o3tmean2  : num  4.25 3.3 3.33 4.38 4.75 ...
##  $ no2tmean2 : num  20 23.2 23.8 30.4 30.3 ...

# Per-column summary statistics, including the number of NA values.
chicago %>% summary()

##      city                tmpd             dptp             date           
##  Length:6940        Min.   :-16.00   Min.   :-25.62   Min.   :1987-01-01  
##  Class :character   1st Qu.: 35.00   1st Qu.: 27.00   1st Qu.:1991-10-01  
##  Mode  :character   Median : 51.00   Median : 39.88   Median :1996-07-01  
##                     Mean   : 50.31   Mean   : 40.34   Mean   :1996-07-01  
##                     3rd Qu.: 67.00   3rd Qu.: 55.75   3rd Qu.:2001-04-01  
##                     Max.   : 92.00   Max.   : 78.25   Max.   :2005-12-31  
##                     NA's   :1        NA's   :2                            
##    pm25tmean2      pm10tmean2        o3tmean2         no2tmean2     
##  Min.   : 1.70   Min.   :  2.00   Min.   : 0.1528   Min.   : 6.158  
##  1st Qu.: 9.70   1st Qu.: 21.50   1st Qu.:10.0729   1st Qu.:19.654  
##  Median :14.66   Median : 30.28   Median :18.5218   Median :24.556  
##  Mean   :16.23   Mean   : 33.90   Mean   :19.4355   Mean   :25.232  
##  3rd Qu.:20.60   3rd Qu.: 42.00   3rd Qu.:27.0010   3rd Qu.:30.139  
##  Max.   :61.50   Max.   :365.00   Max.   :66.5875   Max.   :62.480  
##  NA's   :4447    NA's   :242

# Number of rows and columns in the data frame.
chicago %>% dim()

## [1] 6940    8

# select() keeps only the requested columns of a data frame; `city:dptp`
# is a range covering every column from city through dptp.
# NOTE(review): the name `subset` shadows base::subset(); consider a more
# specific name if nothing else depends on it.
subset <- chicago %>%
  select(city:dptp)
subset %>% head()

##   city tmpd   dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125

# filter() extracts the rows of a data frame that satisfy a condition;
# here, the rows whose pm25tmean2 value is above 30.
chic.f <- chicago %>%
  filter(pm25tmean2 > 30)
chic.f %>% head()

##   city tmpd dptp       date pm25tmean2 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   23 21.9 1998-01-17      38.10   32.46154  3.180556  25.30000
## 2 chic   28 25.8 1998-01-23      33.95   38.69231  1.750000  29.37630
## 3 chic   55 51.3 1998-04-30      39.40   34.00000 10.786232  25.31310
## 4 chic   59 53.7 1998-05-01      35.40   28.50000 14.295125  31.42905
## 5 chic   57 52.0 1998-05-02      33.30   35.00000 20.662879  26.79861
## 6 chic   57 56.0 1998-05-07      32.10   34.50000 24.270422  33.99167

# Structure of the filtered data frame.
chic.f %>% str()

## 'data.frame':    194 obs. of  8 variables:
##  $ city      : chr  "chic" "chic" "chic" "chic" ...
##  $ tmpd      : num  23 28 55 59 57 57 75 61 73 78 ...
##  $ dptp      : num  21.9 25.8 51.3 53.7 52 56 65.8 59 60.3 67.1 ...
##  $ date      : Date, format: "1998-01-17" "1998-01-23" ...
##  $ pm25tmean2: num  38.1 34 39.4 35.4 33.3 ...
##  $ pm10tmean2: num  32.5 38.7 34 28.5 35 ...
##  $ o3tmean2  : num  3.18 1.75 10.79 14.3 20.66 ...
##  $ no2tmean2 : num  25.3 29.4 25.3 31.4 26.8 ...

# Five-number summary (plus mean) of the filtered pm25tmean2 values.
summary(chic.f[["pm25tmean2"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30.05   32.12   35.04   36.63   39.53   61.50

# Histogram of the filtered values. The call is kept in its plain form
# because hist() builds its default title from the argument expression.
hist(chic.f$pm25tmean2)

# Several conditions can be combined in one filter(): keep the rows with
# pm25tmean2 above 30 and tmpd above 80, then show three columns of them.
chic.f1 <- chicago %>%
  filter(pm25tmean2 > 30, tmpd > 80)
chic.f1 %>%
  select(date, tmpd, pm25tmean2)

##          date tmpd pm25tmean2
## 1  1998-08-23   81   39.60000
## 2  1998-09-06   81   31.50000
## 3  2001-07-20   82   32.30000
## 4  2001-08-01   84   43.70000
## 5  2001-08-08   85   38.83750
## 6  2001-08-09   84   38.20000
## 7  2002-06-20   82   33.00000
## 8  2002-06-23   82   42.50000
## 9  2002-07-08   81   33.10000
## 10 2002-07-18   82   38.85000
## 11 2003-06-25   82   33.90000
## 12 2003-07-04   84   32.90000
## 13 2005-06-24   86   31.85714
## 14 2005-06-27   82   51.53750
## 15 2005-06-28   85   31.20000
## 16 2005-07-17   84   32.70000
## 17 2005-08-03   84   37.90000

# arrange() reorders the rows of a data frame according to one of its
# variables/columns; sorting by date puts the earliest rows first.
chicago <- chicago %>%
  arrange(date)
chicago %>%
  select(date, pm25tmean2) %>%
  head(3)

##         date pm25tmean2
## 1 1987-01-01         NA
## 2 1987-01-02         NA
## 3 1987-01-03         NA

# The last three rows after sorting hold the latest dates.
chicago %>%
  select(date, pm25tmean2) %>%
  tail(3)

##            date pm25tmean2
## 6938 2005-12-29    7.45000
## 6939 2005-12-30   15.05714
## 6940 2005-12-31   15.00000

# desc() reverses the sort order, so the latest dates now come first.
# rename() then renames variables using new_name = old_name pairs.
chicago <- chicago %>%
  arrange(desc(date)) %>%
  rename(dewpoint = dptp, pm25 = pm25tmean2)
# First five columns of the first three rows.
chicago %>%
  select(1:5) %>%
  head(3)

##   city tmpd dewpoint       date     pm25
## 1 chic   35     30.1 2005-12-31 15.00000
## 2 chic   36     31.0 2005-12-30 15.05714
## 3 chic   35     29.4 2005-12-29  7.45000

# The mutate() function exists to compute transformations of variables
# in a data frame.
# For example, with air pollution data we often want to detrend the data
# by subtracting the mean from the data. That way we can look at whether a
# given day's air pollution level is higher or lower than the average
# (as opposed to looking at its absolute level).
# Here we create a pm25detrend variable that subtracts the mean from the
# pm25 variable; NA values are ignored when computing the mean.
chicago <- chicago %>%
  mutate(pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
chicago %>% head()

##   city tmpd dewpoint       date     pm25 pm10tmean2  o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5  2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2  3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5  6.794837  19.97222
## 4 chic   37     34.5 2005-12-28 17.75000       27.5  3.260417  19.28563
## 5 chic   40     33.6 2005-12-27 23.56000       27.0  4.468750  23.50000
## 6 chic   35     29.6 2005-12-26  8.40000        8.5 14.041667  16.81944
##   pm25detrend
## 1   -1.230958
## 2   -1.173815
## 3   -8.780958
## 4    1.519042
## 5    7.329042
## 6   -7.830958

# There is also the related transmute() function, which does the same
# thing as mutate() but then drops all non-transformed variables.
chicago %>%
  transmute(
    pm10detrend = pm10tmean2 - mean(pm10tmean2, na.rm = TRUE),
    o3detrend = o3tmean2 - mean(o3tmean2, na.rm = TRUE)
  ) %>%
  head()

##   pm10detrend  o3detrend
## 1  -10.395206 -16.904263
## 2  -14.695206 -16.401093
## 3  -10.395206 -12.640676
## 4   -6.395206 -16.175096
## 5   -6.895206 -14.966763
## 6  -25.395206  -5.393846

# group_by()
# The group_by() function is used to generate summary statistics
# (see page 13 for details).
# The general operation is a combination of splitting a data frame into
# separate pieces defined by a variable or group of variables
# (group_by()), and then applying a summary function across those
# subsets (summarize()).
# First, create a year variable using as.POSIXlt(); its `year` field is
# stored as an offset from 1900, hence the addition.
chicago <- chicago %>%
  mutate(year = 1900 + as.POSIXlt(date)$year)
# Next, create a separate data frame that splits the original data frame
# by year.
years <- chicago %>%
  group_by(year)
# Finally, compute summary statistics for each year with summarise().
years %>%
  summarise(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2, na.rm = TRUE),
    no2 = median(no2tmean2, na.rm = TRUE)
  )

## # A tibble: 19 x 4
##     year     pm25       o3      no2
##    <dbl>    <dbl>    <dbl>    <dbl>
##  1  1987      NaN 62.96966 23.49369
##  2  1988      NaN 61.67708 24.52296
##  3  1989      NaN 59.72727 26.14062
##  4  1990      NaN 52.22917 22.59583
##  5  1991      NaN 63.10417 21.38194
##  6  1992      NaN 50.82870 24.78921
##  7  1993      NaN 44.30093 25.76993
##  8  1994      NaN 52.17844 28.47500
##  9  1995      NaN 66.58750 27.26042
## 10  1996      NaN 58.39583 26.38715
## 11  1997      NaN 56.54167 25.48143
## 12  1998 18.26467 50.66250 24.58649
## 13  1999 18.49646 57.48864 24.66667
## 14  2000 16.93806 55.76103 23.46082
## 15  2001 16.92632 51.81984 25.06522
## 16  2002 15.27335 54.88043 22.73750
## 17  2003 15.23183 56.16608 24.62500
## 18  2004 14.62864 44.48240 23.39130
## 19  2005 16.18556 58.84126 22.62387

# exploratory_data_analysis.R

# Cruz

# Thu Sep 28 21:11:36 2017