tidyr & lubridate & stringr & dplyr & data.table & some tricks
#setwd("/home/creatrol/ws/R/Tutorials")
#library(devtools)
#devtools::install_github("hadley/tibble")
#devtools::install_github("hadley/dplyr")
#devtools::install_github("hadley/tidyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
wide_df <- data.frame(col = c("X_a","y_b"),
A = c(1,2), b = c(2,5), C= c(3,6))
wide_df
## col A b C
## 1 X_a 1 2 3
## 2 y_b 2 5 6
(long_df <- gather(wide_df, my_key, my_val, - col))
## col my_key my_val
## 1 X_a A 1
## 2 y_b A 2
## 3 X_a b 2
## 4 y_b b 5
## 5 X_a C 3
## 6 y_b C 6
spread(long_df, my_key, my_val)
## col A b C
## 1 X_a 1 2 3
## 2 y_b 2 5 6
(separate_df <- separate(wide_df, col, c("upper", "lower")))
## upper lower A b C
## 1 X a 1 2 3
## 2 y b 2 5 6
(unite_df <- unite(separate_df, col, upper, lower, sep = "-"))
## col A b C
## 1 X-a 1 2 3
## 2 y-b 2 5 6
arrange(separate_df, lower)
## upper lower A b C
## 1 X a 1 2 3
## 2 y b 2 5 6
arrange(separate_df, desc(lower))
## upper lower A b C
## 1 y b 2 5 6
## 2 X a 1 2 3
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
ymd("2016-08-31")
## [1] "2016-08-31"
ymd("2016 August 31")
## [1] "2016-08-31"
mdy("August 31, 2016")
## [1] "2016-08-31"
hms("13:30:09")
## [1] "13H 30M 9S"
ymd_hms("2016/08/31 13.30.09")
## [1] "2016-08-31 13:30:09 UTC"
library(stringr)
# Trim leading and trailing white space
str_trim(" this is a test ")
## [1] "this is a test"
# Pad string with x
str_pad("22343", width = 7, side = "left", pad = "x")
## [1] "xx22343"
# Create character vector of names
friend <- c("Sarah", "Tom", "Tony")
# Search for string in vector
str_detect(friend, "Tony")
## [1] FALSE FALSE TRUE
# Replace string in vector
str_replace(friend, "Tony", "David")
## [1] "Sarah" "Tom" "David"
tolower("I am Not TALKING BOY!!")
## [1] "i am not talking boy!!"
toupper("i am a good one..")
## [1] "I AM A GOOD ONE.."
#setwd("/home/creatrol/ws/R/Tutorials")
library(dplyr)
chicago <- readRDS("chicago.rds")
# To show all columns
options(dplyr.width = Inf)
chicagoTable <- tbl_df(chicago)
class(chicago)
## [1] "data.frame"
class(chicagoTable)
## [1] "tbl_df" "tbl" "data.frame"
names(chicago)
## [1] "city" "tmpd" "dptp" "date" "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2" "no2tmean2"
names(chicagoTable)
## [1] "city" "tmpd" "dptp" "date" "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2" "no2tmean2"
cities <- chicagoTable$city
head(cities)
## [1] "chic" "chic" "chic" "chic" "chic" "chic"
summary(chicago)
## city tmpd dptp date
## Length:6940 Min. :-16.00 Min. :-25.62 Min. :1987-01-01
## Class :character 1st Qu.: 35.00 1st Qu.: 27.00 1st Qu.:1991-10-01
## Mode :character Median : 51.00 Median : 39.88 Median :1996-07-01
## Mean : 50.31 Mean : 40.34 Mean :1996-07-01
## 3rd Qu.: 67.00 3rd Qu.: 55.75 3rd Qu.:2001-04-01
## Max. : 92.00 Max. : 78.25 Max. :2005-12-31
## NA's :1 NA's :2
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## Min. : 1.70 Min. : 2.00 Min. : 0.1528 Min. : 6.158
## 1st Qu.: 9.70 1st Qu.: 21.50 1st Qu.:10.0729 1st Qu.:19.654
## Median :14.66 Median : 30.28 Median :18.5218 Median :24.556
## Mean :16.23 Mean : 33.90 Mean :19.4355 Mean :25.232
## 3rd Qu.:20.60 3rd Qu.: 42.00 3rd Qu.:27.0010 3rd Qu.:30.139
## Max. :61.50 Max. :365.00 Max. :66.5875 Max. :62.480
## NA's :4447 NA's :242
summary(chicagoTable)
## city tmpd dptp date
## Length:6940 Min. :-16.00 Min. :-25.62 Min. :1987-01-01
## Class :character 1st Qu.: 35.00 1st Qu.: 27.00 1st Qu.:1991-10-01
## Mode :character Median : 51.00 Median : 39.88 Median :1996-07-01
## Mean : 50.31 Mean : 40.34 Mean :1996-07-01
## 3rd Qu.: 67.00 3rd Qu.: 55.75 3rd Qu.:2001-04-01
## Max. : 92.00 Max. : 78.25 Max. :2005-12-31
## NA's :1 NA's :2
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## Min. : 1.70 Min. : 2.00 Min. : 0.1528 Min. : 6.158
## 1st Qu.: 9.70 1st Qu.: 21.50 1st Qu.:10.0729 1st Qu.:19.654
## Median :14.66 Median : 30.28 Median :18.5218 Median :24.556
## Mean :16.23 Mean : 33.90 Mean :19.4355 Mean :25.232
## 3rd Qu.:20.60 3rd Qu.: 42.00 3rd Qu.:27.0010 3rd Qu.:30.139
## Max. :61.50 Max. :365.00 Max. :66.5875 Max. :62.480
## NA's :4447 NA's :242
chicagoTable
## # A tibble: 6,940 × 8
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## * <chr> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
## 1 chic 31.5 31.500 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 chic 33.0 29.875 1987-01-02 NA NA 3.304348 23.19099
## 3 chic 33.0 27.375 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 chic 29.0 28.625 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 chic 32.0 28.875 1987-01-05 NA NA 4.750000 30.33333
## 6 chic 40.0 35.125 1987-01-06 NA 48.00000 5.833333 25.77233
## 7 chic 34.5 26.750 1987-01-07 NA 41.00000 9.291667 20.58171
## 8 chic 29.0 22.000 1987-01-08 NA 36.00000 11.291667 17.03723
## 9 chic 26.5 29.000 1987-01-09 NA 33.28571 4.500000 23.38889
## 10 chic 32.5 27.750 1987-01-10 NA NA 4.958333 19.54167
## # ... with 6,930 more rows
str(chicago)
## 'data.frame': 6940 obs. of 8 variables:
## $ city : chr "chic" "chic" "chic" "chic" ...
## $ tmpd : num 31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
## $ dptp : num 31.5 29.9 27.4 28.6 28.9 ...
## $ date : Date, format: "1987-01-01" "1987-01-02" ...
## $ pm25tmean2: num NA NA NA NA NA NA NA NA NA NA ...
## $ pm10tmean2: num 34 NA 34.2 47 NA ...
## $ o3tmean2 : num 4.25 3.3 3.33 4.38 4.75 ...
## $ no2tmean2 : num 20 23.2 23.8 30.4 30.3 ...
glimpse(chicagoTable)
## Observations: 6,940
## Variables: 8
## $ city <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2 <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2 <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
unique(chicagoTable$city)
## [1] "chic"
chicagoTable[1,1]
## # A tibble: 1 × 1
## city
## <chr>
## 1 chic
changeName <- c("chic" = "chicago", "ny" = "newYork")
chicagoTable$cityName <- changeName[chicagoTable$city]
glimpse(chicagoTable)
## Observations: 6,940
## Variables: 9
## $ city <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2 <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2 <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
## $ cityName <chr> "chicago", "chicago", "chicago", "chicago", "chicag...
chicago$cityName <- changeName[chicago$city]
glimpse(chicago)
## Observations: 6,940
## Variables: 9
## $ city <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2 <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2 <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
## $ cityName <chr> "chicago", "chicago", "chicago", "chicago", "chicag...
chicago <- readRDS("chicago.rds")
dim(chicago)
## [1] 6940 8
head(select(chicago,1:5))
## city tmpd dptp date pm25tmean2
## 1 chic 31.5 31.500 1987-01-01 NA
## 2 chic 33.0 29.875 1987-01-02 NA
## 3 chic 33.0 27.375 1987-01-03 NA
## 4 chic 29.0 28.625 1987-01-04 NA
## 5 chic 32.0 28.875 1987-01-05 NA
## 6 chic 40.0 35.125 1987-01-06 NA
names(chicago)[1:3]
## [1] "city" "tmpd" "dptp"
head(select(chicago,city:dptp))
## city tmpd dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125
head(select(chicagoTable,city:dptp))
## # A tibble: 6 × 3
## city tmpd dptp
## <chr> <dbl> <dbl>
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125
head(select(chicago,1:3, pm25tmean2),10)
## city tmpd dptp pm25tmean2
## 1 chic 31.5 31.500 NA
## 2 chic 33.0 29.875 NA
## 3 chic 33.0 27.375 NA
## 4 chic 29.0 28.625 NA
## 5 chic 32.0 28.875 NA
## 6 chic 40.0 35.125 NA
## 7 chic 34.5 26.750 NA
## 8 chic 29.0 22.000 NA
## 9 chic 26.5 29.000 NA
## 10 chic 32.5 27.750 NA
head(select(chicago, -(city:dptp)))
## date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 1987-01-02 NA NA 3.304348 23.19099
## 3 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 1987-01-05 NA NA 4.750000 30.33333
## 6 1987-01-06 NA 48.00000 5.833333 25.77233
#################################### Equivalent base R
i <- match("city", names(chicago))
j <- match("dptp", names(chicago))
head(chicago[,-(i:j)])
## date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 1987-01-02 NA NA 3.304348 23.19099
## 3 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 1987-01-05 NA NA 4.750000 30.33333
## 6 1987-01-06 NA 48.00000 5.833333 25.77233
####################################
head(select(chicago, starts_with("d")))
## dptp date
## 1 31.500 1987-01-01
## 2 29.875 1987-01-02
## 3 27.375 1987-01-03
## 4 28.625 1987-01-04
## 5 28.875 1987-01-05
## 6 35.125 1987-01-06
head(select(chicago, ends_with("2")))
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 NA 34.00000 4.250000 19.98810
## 2 NA NA 3.304348 23.19099
## 3 NA 34.16667 3.333333 23.81548
## 4 NA 47.00000 4.375000 30.43452
## 5 NA NA 4.750000 30.33333
## 6 NA 48.00000 5.833333 25.77233
head(select(chicago, matches("mean")))
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 NA 34.00000 4.250000 19.98810
## 2 NA NA 3.304348 23.19099
## 3 NA 34.16667 3.333333 23.81548
## 4 NA 47.00000 4.375000 30.43452
## 5 NA NA 4.750000 30.33333
## 6 NA 48.00000 5.833333 25.77233
head(select(chicago, contains("mean")))
## pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 NA 34.00000 4.250000 19.98810
## 2 NA NA 3.304348 23.19099
## 3 NA 34.16667 3.333333 23.81548
## 4 NA 47.00000 4.375000 30.43452
## 5 NA NA 4.750000 30.33333
## 6 NA 48.00000 5.833333 25.77233
chi.f <- filter(chicago, pm25tmean2 > 30)
head(select(chi.f, 1:3, pm25tmean2), 10)
## city tmpd dptp pm25tmean2
## 1 chic 23 21.9 38.10
## 2 chic 28 25.8 33.95
## 3 chic 55 51.3 39.40
## 4 chic 59 53.7 35.40
## 5 chic 57 52.0 33.30
## 6 chic 57 56.0 32.10
## 7 chic 75 65.8 56.50
## 8 chic 61 59.0 33.80
## 9 chic 73 60.3 30.30
## 10 chic 78 67.1 41.40
chi.f <- filter(chicagoTable, pm25tmean2 > 30)
head(select(chi.f, 1:3, pm25tmean2), 10)
## # A tibble: 10 × 4
## city tmpd dptp pm25tmean2
## <chr> <dbl> <dbl> <dbl>
## 1 chic 23 21.9 38.10
## 2 chic 28 25.8 33.95
## 3 chic 55 51.3 39.40
## 4 chic 59 53.7 35.40
## 5 chic 57 52.0 33.30
## 6 chic 57 56.0 32.10
## 7 chic 75 65.8 56.50
## 8 chic 61 59.0 33.80
## 9 chic 73 60.3 30.30
## 10 chic 78 67.1 41.40
chi.f <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
head(select(chi.f, 1:3, pm25tmean2, tmpd), 10)
## city tmpd dptp pm25tmean2
## 1 chic 81 71.2 39.6000
## 2 chic 81 70.4 31.5000
## 3 chic 82 72.2 32.3000
## 4 chic 84 72.9 43.7000
## 5 chic 85 72.6 38.8375
## 6 chic 84 72.6 38.2000
## 7 chic 82 67.4 33.0000
## 8 chic 82 63.5 42.5000
## 9 chic 81 70.4 33.1000
## 10 chic 82 66.2 38.8500
chicago <- arrange(chicago,date)
head(select(chicago,date,pm25tmean2),3)
## date pm25tmean2
## 1 1987-01-01 NA
## 2 1987-01-02 NA
## 3 1987-01-03 NA
tail(select(chicago,date,pm25tmean2),3)
## date pm25tmean2
## 6938 2005-12-29 7.45000
## 6939 2005-12-30 15.05714
## 6940 2005-12-31 15.00000
chicagoTable <- arrange(chicagoTable,date)
head(select(chicagoTable,date,pm25tmean2),3)
## # A tibble: 3 × 2
## date pm25tmean2
## <date> <dbl>
## 1 1987-01-01 NA
## 2 1987-01-02 NA
## 3 1987-01-03 NA
head(select(chicagoTable,date,pm25tmean2),3)
## # A tibble: 3 × 2
## date pm25tmean2
## <date> <dbl>
## 1 1987-01-01 NA
## 2 1987-01-02 NA
## 3 1987-01-03 NA
chicago <- arrange(chicago, desc(date))
head(select(chicago,date,pm25tmean2),3)
## date pm25tmean2
## 1 2005-12-31 15.00000
## 2 2005-12-30 15.05714
## 3 2005-12-29 7.45000
tail(select(chicago,date,pm25tmean2),3)
## date pm25tmean2
## 6938 1987-01-03 NA
## 6939 1987-01-02 NA
## 6940 1987-01-01 NA
head(chicago[,1:5],3)
## city tmpd dptp date pm25tmean2
## 1 chic 35 30.1 2005-12-31 15.00000
## 2 chic 36 31.0 2005-12-30 15.05714
## 3 chic 35 29.4 2005-12-29 7.45000
chicago <- rename(chicago, dewpoint = dptp,
pm25 = pm25tmean2)
head(chicago[,1:5],3)
## city tmpd dewpoint date pm25
## 1 chic 35 30.1 2005-12-31 15.00000
## 2 chic 36 31.0 2005-12-30 15.05714
## 3 chic 35 29.4 2005-12-29 7.45000
chicago <- mutate(chicago,
pm25detrend = pm25 - mean(pm25, na.rm = T))
head(select(chicago, pm25, pm25detrend))
## pm25 pm25detrend
## 1 15.00000 -1.230958
## 2 15.05714 -1.173815
## 3 7.45000 -8.780958
## 4 17.75000 1.519042
## 5 23.56000 7.329042
## 6 8.40000 -7.830958
tail(select(chicago, pm25, pm25detrend))
## pm25 pm25detrend
## 6935 NA NA
## 6936 NA NA
## 6937 NA NA
## 6938 NA NA
## 6939 NA NA
## 6940 NA NA
chicago <- mutate(chicago,
tempcatifelse = ifelse(tmpd <= 80, "cold", "hot"))
head(chicago,3)
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## pm25detrend tempcatifelse
## 1 -1.230958 cold
## 2 -1.173815 cold
## 3 -8.780958 cold
head(chicago,3)
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## pm25detrend tempcatifelse
## 1 -1.230958 cold
## 2 -1.173815 cold
## 3 -8.780958 cold
chicago <- mutate(chicago,
tempcat = factor(1 * (tmpd > 80),
labels = c("cold","hot")))
head(chicago,3)
## city tmpd dewpoint date pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 35 30.1 2005-12-31 15.00000 23.5 2.531250 13.25000
## 2 chic 36 31.0 2005-12-30 15.05714 19.2 3.034420 22.80556
## 3 chic 35 29.4 2005-12-29 7.45000 23.5 6.794837 19.97222
## pm25detrend tempcatifelse tempcat
## 1 -1.230958 cold cold
## 2 -1.173815 cold cold
## 3 -8.780958 cold cold
hotcold <- group_by(chicago, tempcat)
class(hotcold)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
summarize(hotcold, pm25 = mean(pm25, na.rm = T),
o3 = max(o3tmean2),
no2 = median(no2tmean2))
## # A tibble: 3 × 4
## tempcat pm25 o3 no2
## <fctr> <dbl> <dbl> <dbl>
## 1 cold 15.97807 66.587500 24.54924
## 2 hot 26.48118 62.969656 24.93870
## 3 NA 47.73750 9.416667 37.44444
chicago <- readRDS("chicago.rds")
head(chicago)
## city tmpd dptp date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01 NA 34.00000 4.250000 19.98810
## 2 chic 33.0 29.875 1987-01-02 NA NA 3.304348 23.19099
## 3 chic 33.0 27.375 1987-01-03 NA 34.16667 3.333333 23.81548
## 4 chic 29.0 28.625 1987-01-04 NA 47.00000 4.375000 30.43452
## 5 chic 32.0 28.875 1987-01-05 NA NA 4.750000 30.33333
## 6 chic 40.0 35.125 1987-01-06 NA 48.00000 5.833333 25.77233
summarise(chicago, num = nth(tmpd, 2))
## num
## 1 33
summarise(chicago, uniqueNumber = n_distinct(city))
## uniqueNumber
## 1 1
chicago %>% mutate(month = as.POSIXlt(date)$mon + 1) %>% group_by(month) %>% summarize(pm25 = mean(pm25tmean2, na.rm = T),
o3 = max(o3tmean2, na.rm = T),
no2 = median(no2tmean2, na.rm = T))
## # A tibble: 12 × 4
## month pm25 o3 no2
## <dbl> <dbl> <dbl> <dbl>
## 1 1 17.76996 28.22222 25.35417
## 2 2 20.37513 37.37500 26.78034
## 3 3 17.40818 39.05000 26.76984
## 4 4 13.85879 47.94907 25.03125
## 5 5 14.07420 52.75000 24.22222
## 6 6 15.86461 66.58750 25.01140
## 7 7 16.57087 59.54167 22.38442
## 8 8 16.93380 53.96701 22.98333
## 9 9 15.91279 57.48864 24.47917
## 10 10 14.23557 47.09275 24.15217
## 11 11 15.15794 29.45833 23.56537
## 12 12 17.52221 27.70833 24.45773
# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Reference a table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, mday, month, quarter, wday, week, yday, year
## The following objects are masked from 'package:dplyr':
##
## between, last
DT <- data.table(x = c("A", "B", "C"), y = 1:9, z = (2:10) *2)
DT[3:4] # no need ","
## x y z
## 1: C 3 8
## 2: A 4 10
DT[.N] # the last row
## x y z
## 1: C 9 20
# select and mutate in dplyr (a little diff)
DT[,.(x, y, diff = z-y, Ymean = mean(y))]
## x y diff Ymean
## 1: A 1 3 5
## 2: B 2 4 5
## 3: C 3 5 5
## 4: A 4 6 5
## 5: B 5 7 5
## 6: C 6 8 5
## 7: A 7 9 5
## 8: B 8 10 5
## 9: C 9 11 5
# summarize in dplyr
DT[,.(Zmean = mean(z), Yvar = var(y))]
## Zmean Yvar
## 1: 12 7.5
# Insert functions
DT[, plot(y,z)]
## NULL
DT[, {print(x)
hist(z)
NULL }]
## [1] "A" "B" "C" "A" "B" "C" "A" "B" "C"
## NULL
DT[, .N, by = .(x)] # the number of rows in this group
## x N
## 1: A 3
## 2: B 3
## 3: C 3
DT[,.(Ymean = mean(y)), by = .(x)]
## x Ymean
## 1: A 4
## 2: B 5
## 3: C 6
DT[, .(y, Zmean = mean(z)), by = .(Grp = y %% 2)]
## Grp y Zmean
## 1: 1 1 12
## 2: 1 3 12
## 3: 1 5 12
## 4: 1 7 12
## 5: 1 9 12
## 6: 0 2 12
## 7: 0 4 12
## 8: 0 6 12
## 9: 0 8 12
DT[, .(Zmean = mean(z)), by = .(Grp = y %% 2)]
## Grp Zmean
## 1: 1 12
## 2: 0 12
DT[, mean(z), by = y %% 2]
## y V1
## 1: 1 12
## 2: 0 12
DT[2:7, mean(z), by = y %% 2]
## y V1
## 1: 0 10
## 2: 1 12
ans <- DT[2:9, mean(z), by = x]
ans[order(x)]
## x V1
## 1: A 13
## 2: B 12
## 3: C 14
####################################### by Chaining
ans <- DT[2:9, mean(z), by = x][order(x)]
ans
## x V1
## 1: A 13
## 2: B 12
## 3: C 14
DT[, lapply(.SD, median), by = x]
## x y z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14
DT[, lapply(.SD, median), .SDcols = 2:3]
## y z
## 1: 5 12
DT[, lapply(.SD, median), .SDcols = 2:3, by = x]
## x y z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14
DT[, lapply(.SD, median), by = x, .SDcols = c("y","z")]
## x y z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14
DT[, .(lapply(.SD, median)), by = x, .SDcols = c("y","z")]
## x V1
## 1: A 4
## 2: A 10
## 3: B 5
## 4: B 12
## 5: C 6
## 6: C 14
DT[, .(lapply(.SD, median), n = .N), by = .(x, y>5), .SDcols = c("y","z")]
## x y V1 n
## 1: A FALSE 2.5 2
## 2: A FALSE 7 2
## 3: B FALSE 3.5 2
## 4: B FALSE 9 2
## 5: C FALSE 3 1
## 6: C FALSE 8 1
## 7: C TRUE 7.5 2
## 8: C TRUE 17 2
## 9: A TRUE 7 1
## 10: A TRUE 16 1
## 11: B TRUE 8 1
## 12: B TRUE 18 1
# Add and update columns (mutate in dplyr)
DT[, c("j","k") := .(rev(y),10:2)];DT
## x y z j k
## 1: A 1 4 9 10
## 2: B 2 6 8 9
## 3: C 3 8 7 8
## 4: A 4 10 6 7
## 5: B 5 12 5 6
## 6: C 6 14 4 5
## 7: A 7 16 3 4
## 8: B 8 18 2 3
## 9: C 9 20 1 2
# Remove the columns
DT[, j := NULL];DT
## x y z k
## 1: A 1 4 10
## 2: B 2 6 9
## 3: C 3 8 8
## 4: A 4 10 7
## 5: B 5 12 6
## 6: C 6 14 5
## 7: A 7 16 4
## 8: B 8 18 3
## 9: C 9 20 2
DT[, c("k") := NULL];DT
## x y z
## 1: A 1 4
## 2: B 2 6
## 3: C 3 8
## 4: A 4 10
## 5: B 5 12
## 6: C 6 14
## 7: A 7 16
## 8: B 8 18
## 9: C 9 20
# Other way to remove
DT[, c("j","k") := .(rev(y),10:2)]
Mycols <- c("j", "k")
DT[, (Mycols) := NULL];DT
## x y z
## 1: A 1 4
## 2: B 2 6
## 3: C 3 8
## 4: A 4 10
## 5: B 5 12
## 6: C 6 14
## 7: A 7 16
## 8: B 8 18
## 9: C 9 20
# Functional
DT[, `:=` (j = rev(y), k = 10:2)];DT
## x y z j k
## 1: A 1 4 9 10
## 2: B 2 6 8 9
## 3: C 3 8 7 8
## 4: A 4 10 6 7
## 5: B 5 12 5 6
## 6: C 6 14 4 5
## 7: A 7 16 3 4
## 8: B 8 18 2 3
## 9: C 9 20 1 2
# by Group
DT[, c := sum(y), by = x];DT
## x y z j k c
## 1: A 1 4 9 10 12
## 2: B 2 6 8 9 15
## 3: C 3 8 7 8 18
## 4: A 4 10 6 7 12
## 5: B 5 12 5 6 15
## 6: C 6 14 4 5 18
## 7: A 7 16 3 4 12
## 8: B 8 18 2 3 15
## 9: C 9 20 1 2 18
for (i in 1:9) DT[, set := i+1]
DT
###################### by set(DT, index, column, value)
for (i in 1:9) set(DT, i , 7L, i + 1)
DT
## x y z j k c set
## 1: A 1 4 9 10 12 2
## 2: B 2 6 8 9 15 3
## 3: C 3 8 7 8 18 4
## 4: A 4 10 6 7 12 5
## 5: B 5 12 5 6 15 6
## 6: C 6 14 4 5 18 7
## 7: A 7 16 3 4 12 8
## 8: B 8 18 2 3 15 9
## 9: C 9 20 1 2 18 10
# rename column
setnames(DT, "y", "yrename")
names(DT)
## [1] "x" "yrename" "z" "j" "k" "c" "set"
# order columns
setcolorder(DT, c("set", "x", "yrename", "k", "j", "c", "z"))
names(DT)
## [1] "set" "x" "yrename" "k" "j" "c" "z"
DT[x == "A"]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
DT[x %in% c("A", "B")]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 3 B 2 9 8 15 6
## 3: 5 A 4 7 6 12 10
## 4: 6 B 5 6 5 15 12
## 5: 8 A 7 4 3 12 16
## 6: 9 B 8 3 2 15 18
# Another way
(w <- DT[, x == "A"])
## [1] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
DT[w]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
# create and use
setkey(DT,x)
DT
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
## 4: 3 B 2 9 8 15 6
## 5: 6 B 5 6 5 15 12
## 6: 9 B 8 3 2 15 18
## 7: 4 C 3 8 7 18 8
## 8: 7 C 6 5 4 18 14
## 9: 10 C 9 2 1 18 20
DT["A"]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
# mult
DT["A", mult = "first"]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
DT["A", mult = "last"]
## set x yrename k j c z
## 1: 8 A 7 4 3 12 16
# nomatch
DT[c("A", "D")]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
## 4: NA D NA NA NA NA NA
DT[c("A", "D"), nomatch = NA] # default
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
## 4: NA D NA NA NA NA NA
DT[c("A", "D"), nomatch = 0]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
# a two-column key
setkey(DT, x, z)
DT["A"]
## set x yrename k j c z
## 1: 2 A 1 10 9 12 4
## 2: 5 A 4 7 6 12 10
## 3: 8 A 7 4 3 12 16
DT[.("A", 10)]
## set x yrename k j c z
## 1: 5 A 4 7 6 12 10
DT <- DT[, .(x,k,z)]
DT
## x k z
## 1: A 10 4
## 2: A 7 10
## 3: A 4 16
## 4: B 9 6
## 5: B 6 12
## 6: B 3 18
## 7: C 8 8
## 8: C 5 14
## 9: C 2 20
setkey(DT, x, k)
DT[.("B")]
## x k z
## 1: B 3 18
## 2: B 6 12
## 3: B 9 6
DT[.("B", 5)]
## x k z
## 1: B 5 NA
# Ordered joins
DT[.("B", 5), roll = TRUE] # forwards by the second key
## x k z
## 1: B 5 18
DT[.("B", 5), roll = "nearest"] # nearest by the second key
## x k z
## 1: B 5 12
# Forwards and backwards
DT[.("B", 5), roll = +Inf] # always forward
## x k z
## 1: B 5 18
DT[.("B", 5), roll = -Inf] # always backword
## x k z
## 1: B 5 12
# Limited saleness
DT[.("B", 8), roll = 1] # forward with limited steps
## x k z
## 1: B 8 NA
DT[.("B", 8), roll = 2]
## x k z
## 1: B 8 12
DT[.("B", 7), roll = -2] # backward with limited steps
## x k z
## 1: B 7 6
DT[.("B", 7), roll = -1]
## x k z
## 1: B 7 NA
# End control
DT[.("B", 11:12)]
## x k z
## 1: B 11 NA
## 2: B 12 NA
DT[.("B", 11:12), roll = TRUE]
## x k z
## 1: B 11 6
## 2: B 12 6
DT[.("B", 11:12), roll = TRUE, rollends = FALSE]
## x k z
## 1: B 11 NA
## 2: B 12 NA
df <- data.frame(A = c(1,NA,8,NA),
B = c(3,NA,88,23),
C = c(2,45,3,1))
is.na(df)
## A B C
## [1,] FALSE FALSE FALSE
## [2,] TRUE TRUE FALSE
## [3,] FALSE FALSE FALSE
## [4,] TRUE FALSE FALSE
# Check if there is a NA
any(is.na(df))
## [1] TRUE
# Count the # of NA
sum(is.na(df))
## [1] 3
# summary
summary(df)
## A B C
## Min. :1.00 Min. : 3.0 Min. : 1.00
## 1st Qu.:2.75 1st Qu.:13.0 1st Qu.: 1.75
## Median :4.50 Median :23.0 Median : 2.50
## Mean :4.50 Mean :38.0 Mean :12.75
## 3rd Qu.:6.25 3rd Qu.:55.5 3rd Qu.:13.50
## Max. :8.00 Max. :88.0 Max. :45.00
## NA's :2 NA's :1
# Find rows with no NA
complete.cases(df)
## [1] TRUE FALSE TRUE FALSE
# Another way to remove rows with NAs
na.omit(df)
## A B C
## 1 1 3 2
## 3 8 88 3
set.seed(10)
x <- c(rnorm(30, mean = 15, sd = 5), -5, 28, 35)
# View a boxplot
boxplot(x, horizontal = TRUE)
# Create another dataset
df2 <- data.frame(A = rnorm(100,50,10),
B = c(rnorm(99,50,10), 500),
C = c(rnorm(99,50,10), -1))
# View summary
summary(df2)
## A B C
## Min. :31.46 Min. : 26.79 Min. :-1.00
## 1st Qu.:42.21 1st Qu.: 41.35 1st Qu.:45.29
## Median :50.20 Median : 50.67 Median :51.06
## Mean :49.70 Mean : 53.62 Mean :50.88
## 3rd Qu.:57.12 3rd Qu.: 56.57 3rd Qu.:58.13
## Max. :72.21 Max. :500.00 Max. :76.44
# View histogram
hist(df2$B, breaks = 20)
# View boxplot
boxplot(df2)