##Select(), download chicago data
#load the package, ignore warnings
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Set the print width to 105 characters so the wide data-frame printouts
# below fit on a single line.
options(width = 105)
# Load the Chicago data set from its serialized RDS file.
chicago <- readRDS(file = "./data/chicago.rds")
# First look at the data: dimensions (rows, columns) ...
dim(chicago)
## [1] 6940 8
#[1] 6940 8
# ... then the type and leading values of every column ...
str(chicago)
## 'data.frame': 6940 obs. of 8 variables:
## $ city : chr "chic" "chic" "chic" "chic" ...
## $ tmpd : num 31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
## $ dptp : num 31.5 29.9 27.4 28.6 28.9 ...
## $ date : Date, format: "1987-01-01" "1987-01-02" "1987-01-03" ...
## $ pm25tmean2: num NA NA NA NA NA NA NA NA NA NA ...
## $ pm10tmean2: num 34 NA 34.2 47 NA ...
## $ o3tmean2 : num 4.25 3.3 3.33 4.38 4.75 ...
## $ no2tmean2 : num 20 23.2 23.8 30.4 30.3 ...
# ... and finally the column names.
colnames(chicago)
## [1] "city" "tmpd" "dptp" "date" "pm25tmean2" "pm10tmean2" "o3tmean2"
## [8] "no2tmean2"
## select() function
# Keep the contiguous run of columns from city through dptp and preview it.
chicago %>%
  select(city:dptp) %>%
  head()
## city tmpd dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125
# Negate a column range to drop those columns and keep everything else.
chicago %>%
  select(-(city:dptp)) %>%
  head()
## date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 1987-01-02 NA NA 3.304348 23.19099
## 3 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 1987-01-05 NA NA 4.750000 30.33333
## 6 1987-01-06 NA 48.00000 5.833333 25.77233
## filter() function
# filter() keeps only the rows that satisfy a logical condition.
chic.f <- chicago %>%
  filter(pm25tmean2 > 30)
head(chic.f, n = 5)
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 23 21.9 1998-01-17 38.10 32.46154 3.180556 25.30000
## 2 chic 28 25.8 1998-01-23 33.95 38.69231 1.750000 29.37630
## 3 chic 55 51.3 1998-04-30 39.40 34.00000 10.786232 25.31310
## 4 chic 59 53.7 1998-05-01 35.40 28.50000 14.295125 31.42905
## 5 chic 57 52.0 1998-05-02 33.30 35.00000 20.662879 26.79861
# Conditions can be combined; with the "and" operator (&) both must hold.
chic.f <- chicago %>%
  filter(pm25tmean2 > 30 & tmpd > 80)
head(chic.f, n = 5)
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 81 71.2 1998-08-23 39.6000 59.0 45.86364 14.32639
## 2 chic 81 70.4 1998-09-06 31.5000 50.5 50.66250 20.31250
## 3 chic 82 72.2 2001-07-20 32.3000 58.5 33.00380 33.67500
## 4 chic 84 72.9 2001-08-01 43.7000 81.5 45.17736 27.44239
## 5 chic 85 72.6 2001-08-08 38.8375 70.0 37.98047 27.62743
## arrange() function
# arrange() reorders the rows by the values of a column (ascending by default).
chicago <- chicago %>%
  arrange(date)
head(chicago)
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 chic 33.0 29.875 1987-01-02 NA NA 3.304348 23.19099
## 3 chic 33.0 27.375 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 chic 29.0 28.625 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 chic 32.0 28.875 1987-01-05 NA NA 4.750000 30.33333
## 6 chic 40.0 35.125 1987-01-06 NA 48.00000 5.833333 25.77233
# Wrap the column in desc() to sort the rows in descending order instead.
chicago <- chicago %>%
  arrange(desc(date))
## rename() function
# rename() takes new_name = old_name pairs; all other columns are left as is.
chicago <- chicago %>%
  rename(pm25 = pm25tmean2, dewpoint = dptp)
head(chicago)
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## 4 chic 37 34.5 2005-12-28 17.75000 27.5 3.260417 19.28563
## 5 chic 40 33.6 2005-12-27 23.56000 27.0 4.468750 23.50000
## 6 chic 35 29.6 2005-12-26 8.40000 8.5 14.041667 16.81944
## mutate() function
# mutate() adds new columns or transforms existing ones.
# Here pm25 is centered by subtracting its mean (NAs are ignored in the mean).
chicago <- chicago %>%
  mutate(pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
chicago %>%
  select(pm25, pm25detrend) %>%
  head()
## pm25 pm25detrend
## 1 15.00000 -1.230958
## 2 15.05714 -1.173815
## 3 7.45000 -8.780958
## 4 17.75000 1.519042
## 5 23.56000 7.329042
## 6 8.40000 -7.830958
## group_by() function
# group_by() splits a data frame into groups defined by categorical variables.
# First create a temperature category variable, 'temcat', that marks each day
# as "hot" (tmpd > 80°F) or "cold" (otherwise).
# The levels are spelled out so that the labels always line up with
# FALSE/TRUE; relying on the observed values alone would fail with
# "invalid 'labels'" if the data contained only hot or only cold days.
# Days with a missing tmpd get an NA category.
chicago <- mutate(
  chicago,
  temcat = factor(
    tmpd > 80,
    levels = c(FALSE, TRUE),
    labels = c("cold", "hot")
  )
)
# Use group_by() to create a new, grouped data frame.
hotcold <- group_by(chicago, temcat)
str(hotcold)
## tibble [6,940 x 10] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ city : chr [1:6940] "chic" "chic" "chic" "chic" ...
## $ tmpd : num [1:6940] 35 36 35 37 40 35 35 37 41 22 ...
## $ dewpoint : num [1:6940] 30.1 31 29.4 34.5 33.6 29.6 32.1 35.2 32.6 23.3 ...
## $ date : Date[1:6940], format: "2005-12-31" "2005-12-30" "2005-12-29" ...
## $ pm25 : num [1:6940] 15 15.06 7.45 17.75 23.56 ...
## $ pm10tmean2 : num [1:6940] 23.5 19.2 23.5 27.5 27 8.5 8 25.2 34.5 42.5 ...
## $ o3tmean2 : num [1:6940] 2.53 3.03 6.79 3.26 4.47 ...
## $ no2tmean2 : num [1:6940] 13.2 22.8 20 19.3 23.5 ...
## $ pm25detrend: num [1:6940] -1.23 -1.17 -8.78 1.52 7.33 ...
## $ temcat : Factor w/ 2 levels "cold","hot": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "groups")= tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ temcat: Factor w/ 2 levels "cold","hot": 1 2 NA
## ..$ .rows : list<int> [1:3]
## .. ..$ : int [1:6737] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ : int [1:202] 110 150 151 153 160 161 167 168 169 185 ...
## .. ..$ : int 1037
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# With no summary expressions, summarise() simply lists the groups.
hotcold %>%
  summarise()
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 1
## temcat
## <fct>
## 1 cold
## 2 hot
## 3 <NA>
# temcat
# <fct>
#1 cold
#2 hot
#3 NA
# The data frame has NA values, so every summary statistic is computed with
# na.rm = TRUE; otherwise a single NA would turn a whole group's result into NA.
summarise(
  hotcold,
  pm25 = mean(pm25, na.rm = TRUE),
  o3 = max(o3tmean2, na.rm = TRUE),
  no2 = median(no2tmean2, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 4
## temcat pm25 o3 no2
## <fct> <dbl> <dbl> <dbl>
## 1 cold 16.0 66.6 24.5
## 2 hot 26.5 63.0 24.9
## 3 <NA> 47.7 9.42 37.4
## group_by() with years example
# Create a new variable, 'year'. POSIXlt stores the year as an offset from
# 1900, hence the + 1900.
chicago <- mutate(chicago, year = as.POSIXlt(date)$year + 1900)
years <- group_by(chicago, year)
# Per-year summaries. na.rm = TRUE is applied to every statistic so that a
# stray NA cannot wipe out a year's result; years with no pm25 readings at
# all still come out as NaN.
summarise(
  years,
  pm25 = mean(pm25, na.rm = TRUE),
  o3 = max(o3tmean2, na.rm = TRUE),
  no2 = median(no2tmean2, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 19 x 4
## year pm25 o3 no2
## <dbl> <dbl> <dbl> <dbl>
## 1 1987 NaN 63.0 23.5
## 2 1988 NaN 61.7 24.5
## 3 1989 NaN 59.7 26.1
## 4 1990 NaN 52.2 22.6
## 5 1991 NaN 63.1 21.4
## 6 1992 NaN 50.8 24.8
## 7 1993 NaN 44.3 25.8
## 8 1994 NaN 52.2 28.5
## 9 1995 NaN 66.6 27.3
## 10 1996 NaN 58.4 26.4
## 11 1997 NaN 56.5 25.5
## 12 1998 18.3 50.7 24.6
## 13 1999 18.5 57.5 24.7
## 14 2000 16.9 55.8 23.5
## 15 2001 16.9 51.8 25.1
## 16 2002 15.3 54.9 22.7
## 17 2003 15.2 56.2 24.6
## 18 2004 14.6 44.5 23.4
## 19 2005 16.2 58.8 22.6
# A pipe lets you chain operations in a readable and powerful way.
# dplyr can also work with other data frame "backends":
# use data.table for large, fast tables,
# and a SQL interface through the DBI package.
# Below: derive the month (POSIXlt months are 0-based, hence the + 1), group
# by it, and summarise each month. na.rm = TRUE is applied to every statistic,
# matching the earlier summaries.
chicago %>%
  mutate(month = as.POSIXlt(date)$mon + 1) %>%
  group_by(month) %>%
  summarise(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2, na.rm = TRUE),
    no2 = median(no2tmean2, na.rm = TRUE)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 4
## month pm25 o3 no2
## <dbl> <dbl> <dbl> <dbl>
## 1 1 17.8 28.2 25.4
## 2 2 20.4 37.4 26.8
## 3 3 17.4 39.0 26.8
## 4 4 13.9 47.9 25.0
## 5 5 14.1 52.8 24.2
## 6 6 15.9 66.6 25.0
## 7 7 16.6 59.5 22.4
## 8 8 16.9 54.0 23.0
## 9 9 15.9 57.5 24.5
## 10 10 14.2 47.1 24.2
## 11 11 15.2 29.5 23.6
## 12 12 17.5 27.7 24.5
# This is an R Markdown document; see the blog post for finer details.