tidyr & lubridate & stringr & dplyr & data.table & some tricks

tidyr

#setwd("/home/creatrol/ws/R/Tutorials")
#library(devtools)
#devtools::install_github("hadley/tibble")
#devtools::install_github("hadley/dplyr")
#devtools::install_github("hadley/tidyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Gather columns into key-value pairs

wide_df <- data.frame(col = c("X_a","y_b"),
                      A = c(1,2), b = c(2,5), C= c(3,6))
wide_df
##   col A b C
## 1 X_a 1 2 3
## 2 y_b 2 5 6
(long_df <- gather(wide_df, my_key, my_val, - col))
##   col my_key my_val
## 1 X_a      A      1
## 2 y_b      A      2
## 3 X_a      b      2
## 4 y_b      b      5
## 5 X_a      C      3
## 6 y_b      C      6

Spread key-value pairs into columns

spread(long_df, my_key, my_val)
##   col A b C
## 1 X_a 1 2 3
## 2 y_b 2 5 6

Separate one column intp multiple

  • 4th arguement (… ,sep = “_“)
(separate_df <- separate(wide_df, col, c("upper", "lower")))
##   upper lower A b C
## 1     X     a 1 2 3
## 2     y     b 2 5 6

Unite multiple columns into one

  • 4th arguement (… ,sep = “_“)
(unite_df <- unite(separate_df, col, upper, lower, sep = "-"))
##   col A b C
## 1 X-a 1 2 3
## 2 y-b 2 5 6

arrange

arrange(separate_df, lower)
##   upper lower A b C
## 1     X     a 1 2 3
## 2     y     b 2 5 6
arrange(separate_df, desc(lower))
##   upper lower A b C
## 1     y     b 2 5 6
## 2     X     a 1 2 3

lubridate

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

Basic functions

ymd("2016-08-31")
## [1] "2016-08-31"
ymd("2016 August 31")
## [1] "2016-08-31"
mdy("August 31, 2016")
## [1] "2016-08-31"
hms("13:30:09")
## [1] "13H 30M 9S"
ymd_hms("2016/08/31 13.30.09")
## [1] "2016-08-31 13:30:09 UTC"

stringr

library(stringr)

key functions

# Trim leading and trailing white space
str_trim("    this is a    test   ")
## [1] "this is a    test"
# Pad string with x
str_pad("22343", width = 7, side = "left", pad = "x")
## [1] "xx22343"
# Create character vector of names
friend <- c("Sarah", "Tom", "Tony")
# Search for string in vector
str_detect(friend, "Tony")
## [1] FALSE FALSE  TRUE
# Replace string in vector
str_replace(friend, "Tony", "David")
## [1] "Sarah" "Tom"   "David"
tolower("I am Not TALKING BOY!!")
## [1] "i am not talking boy!!"
toupper("i am a good one..")
## [1] "I AM A GOOD ONE.."

dplyr

#setwd("/home/creatrol/ws/R/Tutorials")
library(dplyr)
chicago <- readRDS("chicago.rds")
# To show all columns
options(dplyr.width = Inf)

tbl

  • Convert data.frame to table
chicagoTable <- tbl_df(chicago)
class(chicago)
## [1] "data.frame"
class(chicagoTable)
## [1] "tbl_df"     "tbl"        "data.frame"
names(chicago)
## [1] "city"       "tmpd"       "dptp"       "date"       "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2"   "no2tmean2"
names(chicagoTable)
## [1] "city"       "tmpd"       "dptp"       "date"       "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2"   "no2tmean2"
cities <- chicagoTable$city
head(cities)
## [1] "chic" "chic" "chic" "chic" "chic" "chic"
summary(chicago)
##      city                tmpd             dptp             date           
##  Length:6940        Min.   :-16.00   Min.   :-25.62   Min.   :1987-01-01  
##  Class :character   1st Qu.: 35.00   1st Qu.: 27.00   1st Qu.:1991-10-01  
##  Mode  :character   Median : 51.00   Median : 39.88   Median :1996-07-01  
##                     Mean   : 50.31   Mean   : 40.34   Mean   :1996-07-01  
##                     3rd Qu.: 67.00   3rd Qu.: 55.75   3rd Qu.:2001-04-01  
##                     Max.   : 92.00   Max.   : 78.25   Max.   :2005-12-31  
##                     NA's   :1        NA's   :2                            
##    pm25tmean2      pm10tmean2        o3tmean2         no2tmean2     
##  Min.   : 1.70   Min.   :  2.00   Min.   : 0.1528   Min.   : 6.158  
##  1st Qu.: 9.70   1st Qu.: 21.50   1st Qu.:10.0729   1st Qu.:19.654  
##  Median :14.66   Median : 30.28   Median :18.5218   Median :24.556  
##  Mean   :16.23   Mean   : 33.90   Mean   :19.4355   Mean   :25.232  
##  3rd Qu.:20.60   3rd Qu.: 42.00   3rd Qu.:27.0010   3rd Qu.:30.139  
##  Max.   :61.50   Max.   :365.00   Max.   :66.5875   Max.   :62.480  
##  NA's   :4447    NA's   :242
summary(chicagoTable)
##      city                tmpd             dptp             date           
##  Length:6940        Min.   :-16.00   Min.   :-25.62   Min.   :1987-01-01  
##  Class :character   1st Qu.: 35.00   1st Qu.: 27.00   1st Qu.:1991-10-01  
##  Mode  :character   Median : 51.00   Median : 39.88   Median :1996-07-01  
##                     Mean   : 50.31   Mean   : 40.34   Mean   :1996-07-01  
##                     3rd Qu.: 67.00   3rd Qu.: 55.75   3rd Qu.:2001-04-01  
##                     Max.   : 92.00   Max.   : 78.25   Max.   :2005-12-31  
##                     NA's   :1        NA's   :2                            
##    pm25tmean2      pm10tmean2        o3tmean2         no2tmean2     
##  Min.   : 1.70   Min.   :  2.00   Min.   : 0.1528   Min.   : 6.158  
##  1st Qu.: 9.70   1st Qu.: 21.50   1st Qu.:10.0729   1st Qu.:19.654  
##  Median :14.66   Median : 30.28   Median :18.5218   Median :24.556  
##  Mean   :16.23   Mean   : 33.90   Mean   :19.4355   Mean   :25.232  
##  3rd Qu.:20.60   3rd Qu.: 42.00   3rd Qu.:27.0010   3rd Qu.:30.139  
##  Max.   :61.50   Max.   :365.00   Max.   :66.5875   Max.   :62.480  
##  NA's   :4447    NA's   :242
chicagoTable
## # A tibble: 6,940 × 8
##     city  tmpd   dptp       date pm25tmean2 pm10tmean2  o3tmean2 no2tmean2
## *  <chr> <dbl>  <dbl>     <date>      <dbl>      <dbl>     <dbl>     <dbl>
## 1   chic  31.5 31.500 1987-01-01         NA   34.00000  4.250000  19.98810
## 2   chic  33.0 29.875 1987-01-02         NA         NA  3.304348  23.19099
## 3   chic  33.0 27.375 1987-01-03         NA   34.16667  3.333333  23.81548
## 4   chic  29.0 28.625 1987-01-04         NA   47.00000  4.375000  30.43452
## 5   chic  32.0 28.875 1987-01-05         NA         NA  4.750000  30.33333
## 6   chic  40.0 35.125 1987-01-06         NA   48.00000  5.833333  25.77233
## 7   chic  34.5 26.750 1987-01-07         NA   41.00000  9.291667  20.58171
## 8   chic  29.0 22.000 1987-01-08         NA   36.00000 11.291667  17.03723
## 9   chic  26.5 29.000 1987-01-09         NA   33.28571  4.500000  23.38889
## 10  chic  32.5 27.750 1987-01-10         NA         NA  4.958333  19.54167
## # ... with 6,930 more rows
str(chicago)
## 'data.frame':    6940 obs. of  8 variables:
##  $ city      : chr  "chic" "chic" "chic" "chic" ...
##  $ tmpd      : num  31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
##  $ dptp      : num  31.5 29.9 27.4 28.6 28.9 ...
##  $ date      : Date, format: "1987-01-01" "1987-01-02" ...
##  $ pm25tmean2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pm10tmean2: num  34 NA 34.2 47 NA ...
##  $ o3tmean2  : num  4.25 3.3 3.33 4.38 4.75 ...
##  $ no2tmean2 : num  20 23.2 23.8 30.4 30.3 ...
glimpse(chicagoTable)
## Observations: 6,940
## Variables: 8
## $ city       <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd       <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp       <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date       <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2   <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2  <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
unique(chicagoTable$city)
## [1] "chic"
chicagoTable[1,1]
## # A tibble: 1 × 1
##    city
##   <chr>
## 1  chic
  • change labels
changeName <- c("chic" = "chicago", "ny" = "newYork")
chicagoTable$cityName <- changeName[chicagoTable$city]
glimpse(chicagoTable)
## Observations: 6,940
## Variables: 9
## $ city       <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd       <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp       <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date       <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2   <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2  <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
## $ cityName   <chr> "chicago", "chicago", "chicago", "chicago", "chicag...
chicago$cityName <- changeName[chicago$city]
glimpse(chicago)
## Observations: 6,940
## Variables: 9
## $ city       <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd       <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp       <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date       <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2   <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2  <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
## $ cityName   <chr> "chicago", "chicago", "chicago", "chicago", "chicag...

Select (column)

  • starts_with(“X”): every name that starts with “X”,
  • ends_with(“X”): every name that ends with “X”,
  • contains(“X”): every name that contains “X”,
  • matches(“X”): every name that matches “X”, where “X” can be a regular expression,
  • num_range(“x”, 1:5): the variables named x01, x02, x03, x04 and x05,
  • one_of(x): every name that appears in x, which should be a character vector.
chicago <- readRDS("chicago.rds")
dim(chicago)
## [1] 6940    8
head(select(chicago,1:5))
##   city tmpd   dptp       date pm25tmean2
## 1 chic 31.5 31.500 1987-01-01         NA
## 2 chic 33.0 29.875 1987-01-02         NA
## 3 chic 33.0 27.375 1987-01-03         NA
## 4 chic 29.0 28.625 1987-01-04         NA
## 5 chic 32.0 28.875 1987-01-05         NA
## 6 chic 40.0 35.125 1987-01-06         NA
names(chicago)[1:3]
## [1] "city" "tmpd" "dptp"
head(select(chicago,city:dptp))
##   city tmpd   dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125
head(select(chicagoTable,city:dptp))
## # A tibble: 6 × 3
##    city  tmpd   dptp
##   <chr> <dbl>  <dbl>
## 1  chic  31.5 31.500
## 2  chic  33.0 29.875
## 3  chic  33.0 27.375
## 4  chic  29.0 28.625
## 5  chic  32.0 28.875
## 6  chic  40.0 35.125
head(select(chicago,1:3, pm25tmean2),10)
##    city tmpd   dptp pm25tmean2
## 1  chic 31.5 31.500         NA
## 2  chic 33.0 29.875         NA
## 3  chic 33.0 27.375         NA
## 4  chic 29.0 28.625         NA
## 5  chic 32.0 28.875         NA
## 6  chic 40.0 35.125         NA
## 7  chic 34.5 26.750         NA
## 8  chic 29.0 22.000         NA
## 9  chic 26.5 29.000         NA
## 10 chic 32.5 27.750         NA
head(select(chicago, -(city:dptp)))
##         date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 1987-01-02         NA         NA 3.304348  23.19099
## 3 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 1987-01-05         NA         NA 4.750000  30.33333
## 6 1987-01-06         NA   48.00000 5.833333  25.77233
#################################### Equivalent base R
i <- match("city", names(chicago))
j <- match("dptp", names(chicago))
head(chicago[,-(i:j)])
##         date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 1987-01-02         NA         NA 3.304348  23.19099
## 3 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 1987-01-05         NA         NA 4.750000  30.33333
## 6 1987-01-06         NA   48.00000 5.833333  25.77233
####################################
head(select(chicago, starts_with("d")))
##     dptp       date
## 1 31.500 1987-01-01
## 2 29.875 1987-01-02
## 3 27.375 1987-01-03
## 4 28.625 1987-01-04
## 5 28.875 1987-01-05
## 6 35.125 1987-01-06
head(select(chicago, ends_with("2")))
##   pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1         NA   34.00000 4.250000  19.98810
## 2         NA         NA 3.304348  23.19099
## 3         NA   34.16667 3.333333  23.81548
## 4         NA   47.00000 4.375000  30.43452
## 5         NA         NA 4.750000  30.33333
## 6         NA   48.00000 5.833333  25.77233
head(select(chicago, matches("mean")))
##   pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1         NA   34.00000 4.250000  19.98810
## 2         NA         NA 3.304348  23.19099
## 3         NA   34.16667 3.333333  23.81548
## 4         NA   47.00000 4.375000  30.43452
## 5         NA         NA 4.750000  30.33333
## 6         NA   48.00000 5.833333  25.77233
head(select(chicago, contains("mean")))
##   pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1         NA   34.00000 4.250000  19.98810
## 2         NA         NA 3.304348  23.19099
## 3         NA   34.16667 3.333333  23.81548
## 4         NA   47.00000 4.375000  30.43452
## 5         NA         NA 4.750000  30.33333
## 6         NA   48.00000 5.833333  25.77233

Filter (row)

chi.f <- filter(chicago, pm25tmean2 > 30)
head(select(chi.f, 1:3, pm25tmean2), 10)
##    city tmpd dptp pm25tmean2
## 1  chic   23 21.9      38.10
## 2  chic   28 25.8      33.95
## 3  chic   55 51.3      39.40
## 4  chic   59 53.7      35.40
## 5  chic   57 52.0      33.30
## 6  chic   57 56.0      32.10
## 7  chic   75 65.8      56.50
## 8  chic   61 59.0      33.80
## 9  chic   73 60.3      30.30
## 10 chic   78 67.1      41.40
chi.f <- filter(chicagoTable, pm25tmean2 > 30)
head(select(chi.f, 1:3, pm25tmean2), 10)
## # A tibble: 10 × 4
##     city  tmpd  dptp pm25tmean2
##    <chr> <dbl> <dbl>      <dbl>
## 1   chic    23  21.9      38.10
## 2   chic    28  25.8      33.95
## 3   chic    55  51.3      39.40
## 4   chic    59  53.7      35.40
## 5   chic    57  52.0      33.30
## 6   chic    57  56.0      32.10
## 7   chic    75  65.8      56.50
## 8   chic    61  59.0      33.80
## 9   chic    73  60.3      30.30
## 10  chic    78  67.1      41.40
chi.f <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
head(select(chi.f, 1:3, pm25tmean2, tmpd), 10)
##    city tmpd dptp pm25tmean2
## 1  chic   81 71.2    39.6000
## 2  chic   81 70.4    31.5000
## 3  chic   82 72.2    32.3000
## 4  chic   84 72.9    43.7000
## 5  chic   85 72.6    38.8375
## 6  chic   84 72.6    38.2000
## 7  chic   82 67.4    33.0000
## 8  chic   82 63.5    42.5000
## 9  chic   81 70.4    33.1000
## 10 chic   82 66.2    38.8500

Arrange (reorder row)

  • Ascending
chicago <- arrange(chicago,date)
head(select(chicago,date,pm25tmean2),3)
##         date pm25tmean2
## 1 1987-01-01         NA
## 2 1987-01-02         NA
## 3 1987-01-03         NA
tail(select(chicago,date,pm25tmean2),3)
##            date pm25tmean2
## 6938 2005-12-29    7.45000
## 6939 2005-12-30   15.05714
## 6940 2005-12-31   15.00000
chicagoTable <- arrange(chicagoTable,date)
head(select(chicagoTable,date,pm25tmean2),3)
## # A tibble: 3 × 2
##         date pm25tmean2
##       <date>      <dbl>
## 1 1987-01-01         NA
## 2 1987-01-02         NA
## 3 1987-01-03         NA
head(select(chicagoTable,date,pm25tmean2),3)
## # A tibble: 3 × 2
##         date pm25tmean2
##       <date>      <dbl>
## 1 1987-01-01         NA
## 2 1987-01-02         NA
## 3 1987-01-03         NA
  • Descending
chicago <- arrange(chicago, desc(date))
head(select(chicago,date,pm25tmean2),3)
##         date pm25tmean2
## 1 2005-12-31   15.00000
## 2 2005-12-30   15.05714
## 3 2005-12-29    7.45000
tail(select(chicago,date,pm25tmean2),3)
##            date pm25tmean2
## 6938 1987-01-03         NA
## 6939 1987-01-02         NA
## 6940 1987-01-01         NA

Rename (variables / column)

head(chicago[,1:5],3)
##   city tmpd dptp       date pm25tmean2
## 1 chic   35 30.1 2005-12-31   15.00000
## 2 chic   36 31.0 2005-12-30   15.05714
## 3 chic   35 29.4 2005-12-29    7.45000
chicago <- rename(chicago, dewpoint = dptp,
                  pm25 = pm25tmean2)
head(chicago[,1:5],3)
##   city tmpd dewpoint       date     pm25
## 1 chic   35     30.1 2005-12-31 15.00000
## 2 chic   36     31.0 2005-12-30 15.05714
## 3 chic   35     29.4 2005-12-29  7.45000

Mutate

chicago <- mutate(chicago,
                  pm25detrend = pm25 - mean(pm25, na.rm = T))
head(select(chicago, pm25, pm25detrend))
##       pm25 pm25detrend
## 1 15.00000   -1.230958
## 2 15.05714   -1.173815
## 3  7.45000   -8.780958
## 4 17.75000    1.519042
## 5 23.56000    7.329042
## 6  8.40000   -7.830958
tail(select(chicago, pm25, pm25detrend))
##      pm25 pm25detrend
## 6935   NA          NA
## 6936   NA          NA
## 6937   NA          NA
## 6938   NA          NA
## 6939   NA          NA
## 6940   NA          NA
  • ifelse
chicago <- mutate(chicago, 
                  tempcatifelse = ifelse(tmpd <= 80, "cold", "hot"))
head(chicago,3)
##   city tmpd dewpoint       date     pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5 2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2 3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5 6.794837  19.97222
##   pm25detrend tempcatifelse
## 1   -1.230958          cold
## 2   -1.173815          cold
## 3   -8.780958          cold

Group_by (statistics summarize) summarize

  • One sample
head(chicago,3)
##   city tmpd dewpoint       date     pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5 2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2 3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5 6.794837  19.97222
##   pm25detrend tempcatifelse
## 1   -1.230958          cold
## 2   -1.173815          cold
## 3   -8.780958          cold
chicago <- mutate(chicago,
                  tempcat = factor(1 * (tmpd > 80),
                            labels = c("cold","hot")))
head(chicago,3)
##   city tmpd dewpoint       date     pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5 2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2 3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5 6.794837  19.97222
##   pm25detrend tempcatifelse tempcat
## 1   -1.230958          cold    cold
## 2   -1.173815          cold    cold
## 3   -8.780958          cold    cold
hotcold <- group_by(chicago, tempcat)
class(hotcold)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"
summarize(hotcold, pm25 = mean(pm25, na.rm = T),
          o3 = max(o3tmean2),
          no2 = median(no2tmean2))
## # A tibble: 3 × 4
##   tempcat     pm25        o3      no2
##    <fctr>    <dbl>     <dbl>    <dbl>
## 1    cold 15.97807 66.587500 24.54924
## 2     hot 26.48118 62.969656 24.93870
## 3      NA 47.73750  9.416667 37.44444

Aggregate functions

  • min(x) - minimum value of vector x.
  • max(x) - maximum value of vector x.
  • mean(x) - mean value of vector x.
  • median(x) - median value of vector x.
  • quantile(x, p) - pth quantile of vector x.
  • sd(x) - standard deviation of vector x.
  • var(x) - variance of vector x.
  • IQR(x) - Inter Quartile Range (IQR) of vector x.
  • diff(range(x)) - total range of vector x.

dplyr aggregate functions

  • first(x) - The first element of vector x.
  • last(x) - The last element of vector x.
  • nth(x, n) - The nth element of vector x.
  • n() - The number of rows in the data.frame or group of observations that summarise() describes.
  • n_distinct(x) - The number of unique values in vector x.
chicago <- readRDS("chicago.rds")
head(chicago)
##   city tmpd   dptp       date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 chic 33.0 29.875 1987-01-02         NA         NA 3.304348  23.19099
## 3 chic 33.0 27.375 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 chic 29.0 28.625 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 chic 32.0 28.875 1987-01-05         NA         NA 4.750000  30.33333
## 6 chic 40.0 35.125 1987-01-06         NA   48.00000 5.833333  25.77233
summarise(chicago, num = nth(tmpd, 2))
##   num
## 1  33
summarise(chicago, uniqueNumber = n_distinct(city))
##   uniqueNumber
## 1            1

%>% the pipe operator

chicago %>% mutate(month = as.POSIXlt(date)$mon + 1) %>% group_by(month) %>% summarize(pm25 = mean(pm25tmean2, na.rm = T),
          o3 = max(o3tmean2, na.rm = T),
          no2 = median(no2tmean2, na.rm = T))
## # A tibble: 12 × 4
##    month     pm25       o3      no2
##    <dbl>    <dbl>    <dbl>    <dbl>
## 1      1 17.76996 28.22222 25.35417
## 2      2 20.37513 37.37500 26.78034
## 3      3 17.40818 39.05000 26.76984
## 4      4 13.85879 47.94907 25.03125
## 5      5 14.07420 52.75000 24.22222
## 6      6 15.86461 66.58750 25.01140
## 7      7 16.57087 59.54167 22.38442
## 8      8 16.93380 53.96701 22.98333
## 9      9 15.91279 57.48864 24.47917
## 10    10 14.23557 47.09275 24.15217
## 11    11 15.15794 29.45833 23.56537
## 12    12 17.52221 27.70833 24.45773

dplyr and mySQL databases

# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr", 
                   host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com", 
                   port = 3306, 
                   user = "student",
                   password = "datacamp")

# Reference a table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")

data.table

Create and Subset (filter row i)

library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, mday, month, quarter, wday, week, yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, last
DT <- data.table(x = c("A", "B", "C"), y = 1:9, z = (2:10) *2)
DT[3:4]                                         # no need ","
##    x y  z
## 1: C 3  8
## 2: A 4 10
DT[.N]                                          # the last row
##    x y  z
## 1: C 9 20

Select / Summarize Columns and Add function (j)

# select and mutate in dplyr (a little diff)
DT[,.(x, y, diff = z-y, Ymean = mean(y))]
##    x y diff Ymean
## 1: A 1    3     5
## 2: B 2    4     5
## 3: C 3    5     5
## 4: A 4    6     5
## 5: B 5    7     5
## 6: C 6    8     5
## 7: A 7    9     5
## 8: B 8   10     5
## 9: C 9   11     5
# summarize in dplyr
DT[,.(Zmean = mean(z), Yvar = var(y))]
##    Zmean Yvar
## 1:    12  7.5
# Insert functions
DT[, plot(y,z)]

## NULL
DT[, {print(x)
      hist(z)
      NULL   }]
## [1] "A" "B" "C" "A" "B" "C" "A" "B" "C"

## NULL

Doing (j) by Group

DT[, .N, by = .(x)]                                 # the number of rows in this group
##    x N
## 1: A 3
## 2: B 3
## 3: C 3
DT[,.(Ymean = mean(y)), by = .(x)]
##    x Ymean
## 1: A     4
## 2: B     5
## 3: C     6
DT[, .(y, Zmean = mean(z)), by = .(Grp = y %% 2)]
##    Grp y Zmean
## 1:   1 1    12
## 2:   1 3    12
## 3:   1 5    12
## 4:   1 7    12
## 5:   1 9    12
## 6:   0 2    12
## 7:   0 4    12
## 8:   0 6    12
## 9:   0 8    12
DT[, .(Zmean = mean(z)), by = .(Grp = y %% 2)]
##    Grp Zmean
## 1:   1    12
## 2:   0    12
DT[, mean(z), by = y %% 2]
##    y V1
## 1: 1 12
## 2: 0 12
DT[2:7, mean(z), by = y %% 2]
##    y V1
## 1: 0 10
## 2: 1 12

Chaining

ans <- DT[2:9, mean(z), by = x]
ans[order(x)]
##    x V1
## 1: A 13
## 2: B 12
## 3: C 14
####################################### by Chaining
ans <- DT[2:9, mean(z), by = x][order(x)]
ans
##    x V1
## 1: A 13
## 2: B 12
## 3: C 14

Subset of Data (.SD) - apply same function to columns (summarize)

DT[, lapply(.SD, median), by = x]
##    x y  z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14
DT[, lapply(.SD, median), .SDcols = 2:3]
##    y  z
## 1: 5 12
DT[, lapply(.SD, median), .SDcols = 2:3, by = x]
##    x y  z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14
DT[, lapply(.SD, median), by = x, .SDcols = c("y","z")]
##    x y  z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14
DT[, .(lapply(.SD, median)), by = x, .SDcols = c("y","z")]
##    x V1
## 1: A  4
## 2: A 10
## 3: B  5
## 4: B 12
## 5: C  6
## 6: C 14
DT[, .(lapply(.SD, median), n = .N), by = .(x, y>5), .SDcols = c("y","z")]
##     x     y  V1 n
##  1: A FALSE 2.5 2
##  2: A FALSE   7 2
##  3: B FALSE 3.5 2
##  4: B FALSE   9 2
##  5: C FALSE   3 1
##  6: C FALSE   8 1
##  7: C  TRUE 7.5 2
##  8: C  TRUE  17 2
##  9: A  TRUE   7 1
## 10: A  TRUE  16 1
## 11: B  TRUE   8 1
## 12: B  TRUE  18 1

Using := in j

# Add and update columns (mutate in dplyr)
DT[, c("j","k") := .(rev(y),10:2)];DT
##    x y  z j  k
## 1: A 1  4 9 10
## 2: B 2  6 8  9
## 3: C 3  8 7  8
## 4: A 4 10 6  7
## 5: B 5 12 5  6
## 6: C 6 14 4  5
## 7: A 7 16 3  4
## 8: B 8 18 2  3
## 9: C 9 20 1  2
# Remove the columns
DT[, j := NULL];DT
##    x y  z  k
## 1: A 1  4 10
## 2: B 2  6  9
## 3: C 3  8  8
## 4: A 4 10  7
## 5: B 5 12  6
## 6: C 6 14  5
## 7: A 7 16  4
## 8: B 8 18  3
## 9: C 9 20  2
DT[, c("k") := NULL];DT
##    x y  z
## 1: A 1  4
## 2: B 2  6
## 3: C 3  8
## 4: A 4 10
## 5: B 5 12
## 6: C 6 14
## 7: A 7 16
## 8: B 8 18
## 9: C 9 20
# Other way to remove
DT[, c("j","k") := .(rev(y),10:2)]
Mycols <- c("j", "k")
DT[, (Mycols) := NULL];DT
##    x y  z
## 1: A 1  4
## 2: B 2  6
## 3: C 3  8
## 4: A 4 10
## 5: B 5 12
## 6: C 6 14
## 7: A 7 16
## 8: B 8 18
## 9: C 9 20
# Functional
DT[, `:=` (j = rev(y), k = 10:2)];DT
##    x y  z j  k
## 1: A 1  4 9 10
## 2: B 2  6 8  9
## 3: C 3  8 7  8
## 4: A 4 10 6  7
## 5: B 5 12 5  6
## 6: C 6 14 4  5
## 7: A 7 16 3  4
## 8: B 8 18 2  3
## 9: C 9 20 1  2
# by Group
DT[, c := sum(y), by = x];DT
##    x y  z j  k  c
## 1: A 1  4 9 10 12
## 2: B 2  6 8  9 15
## 3: C 3  8 7  8 18
## 4: A 4 10 6  7 12
## 5: B 5 12 5  6 15
## 6: C 6 14 4  5 18
## 7: A 7 16 3  4 12
## 8: B 8 18 2  3 15
## 9: C 9 20 1  2 18

Using set()

for (i in 1:9) DT[, set := i+1]
DT
###################### by set(DT, index, column, value)
for (i in 1:9) set(DT, i , 7L, i + 1)
DT
##    x y  z j  k  c set
## 1: A 1  4 9 10 12   2
## 2: B 2  6 8  9 15   3
## 3: C 3  8 7  8 18   4
## 4: A 4 10 6  7 12   5
## 5: B 5 12 5  6 15   6
## 6: C 6 14 4  5 18   7
## 7: A 7 16 3  4 12   8
## 8: B 8 18 2  3 15   9
## 9: C 9 20 1  2 18  10
# rename column
setnames(DT, "y", "yrename")
names(DT)
## [1] "x"       "yrename" "z"       "j"       "k"       "c"       "set"
# order columns
setcolorder(DT, c("set", "x", "yrename", "k", "j", "c", "z"))
names(DT)
## [1] "set"     "x"       "yrename" "k"       "j"       "c"       "z"

Indexing (filter)

DT[x == "A"]
##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16
DT[x %in% c("A", "B")]
##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   3 B       2  9 8 15  6
## 3:   5 A       4  7 6 12 10
## 4:   6 B       5  6 5 15 12
## 5:   8 A       7  4 3 12 16
## 6:   9 B       8  3 2 15 18
# Another way
(w <- DT[, x == "A"])
## [1]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
DT[w]
##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16

Keys

# create and use
setkey(DT,x)
DT
##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16
## 4:   3 B       2  9 8 15  6
## 5:   6 B       5  6 5 15 12
## 6:   9 B       8  3 2 15 18
## 7:   4 C       3  8 7 18  8
## 8:   7 C       6  5 4 18 14
## 9:  10 C       9  2 1 18 20
DT["A"]
##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16
# mult
DT["A", mult = "first"]
##    set x yrename  k j  c z
## 1:   2 A       1 10 9 12 4
DT["A", mult = "last"]
##    set x yrename k j  c  z
## 1:   8 A       7 4 3 12 16
# nomatch
DT[c("A", "D")]
##    set x yrename  k  j  c  z
## 1:   2 A       1 10  9 12  4
## 2:   5 A       4  7  6 12 10
## 3:   8 A       7  4  3 12 16
## 4:  NA D      NA NA NA NA NA
DT[c("A", "D"), nomatch = NA] # default
##    set x yrename  k  j  c  z
## 1:   2 A       1 10  9 12  4
## 2:   5 A       4  7  6 12 10
## 3:   8 A       7  4  3 12 16
## 4:  NA D      NA NA NA NA NA
DT[c("A", "D"), nomatch = 0]
##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16
# a two-column key
setkey(DT, x, z)
DT["A"]
##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16
DT[.("A", 10)]
##    set x yrename k j  c  z
## 1:   5 A       4 7 6 12 10

Rolling Joins

DT <- DT[, .(x,k,z)]
DT
##    x  k  z
## 1: A 10  4
## 2: A  7 10
## 3: A  4 16
## 4: B  9  6
## 5: B  6 12
## 6: B  3 18
## 7: C  8  8
## 8: C  5 14
## 9: C  2 20
setkey(DT, x, k)
DT[.("B")]
##    x k  z
## 1: B 3 18
## 2: B 6 12
## 3: B 9  6
DT[.("B", 5)]
##    x k  z
## 1: B 5 NA
# Ordered joins
DT[.("B", 5), roll = TRUE]        # forwards by the second key
##    x k  z
## 1: B 5 18
DT[.("B", 5), roll = "nearest"]   # nearest by the second key
##    x k  z
## 1: B 5 12
# Forwards and backwards
DT[.("B", 5), roll = +Inf]        # always forward
##    x k  z
## 1: B 5 18
DT[.("B", 5), roll = -Inf]        # always backword
##    x k  z
## 1: B 5 12
# Limited saleness
DT[.("B", 8), roll = 1]          # forward with limited steps
##    x k  z
## 1: B 8 NA
DT[.("B", 8), roll = 2]
##    x k  z
## 1: B 8 12
DT[.("B", 7), roll = -2]         # backward with limited steps
##    x k z
## 1: B 7 6
DT[.("B", 7), roll = -1]
##    x k  z
## 1: B 7 NA
# End control
DT[.("B", 11:12)]
##    x  k  z
## 1: B 11 NA
## 2: B 12 NA
DT[.("B", 11:12), roll = TRUE]
##    x  k z
## 1: B 11 6
## 2: B 12 6
DT[.("B", 11:12), roll = TRUE, rollends = FALSE]
##    x  k  z
## 1: B 11 NA
## 2: B 12 NA

Tricks

Missing values

df <- data.frame(A = c(1,NA,8,NA),
                 B = c(3,NA,88,23),
                 C = c(2,45,3,1))
is.na(df)
##          A     B     C
## [1,] FALSE FALSE FALSE
## [2,]  TRUE  TRUE FALSE
## [3,] FALSE FALSE FALSE
## [4,]  TRUE FALSE FALSE
# Check if there is a NA
any(is.na(df))
## [1] TRUE
# Count the # of NA
sum(is.na(df))
## [1] 3
# summary
summary(df)
##        A              B              C        
##  Min.   :1.00   Min.   : 3.0   Min.   : 1.00  
##  1st Qu.:2.75   1st Qu.:13.0   1st Qu.: 1.75  
##  Median :4.50   Median :23.0   Median : 2.50  
##  Mean   :4.50   Mean   :38.0   Mean   :12.75  
##  3rd Qu.:6.25   3rd Qu.:55.5   3rd Qu.:13.50  
##  Max.   :8.00   Max.   :88.0   Max.   :45.00  
##  NA's   :2      NA's   :1
# Find rows with no NA
complete.cases(df) 
## [1]  TRUE FALSE  TRUE FALSE
# Another way to remove rows with NAs
na.omit(df)
##   A  B C
## 1 1  3 2
## 3 8 88 3

Outliers and obvious errors

set.seed(10)
x <- c(rnorm(30, mean = 15, sd = 5), -5, 28, 35)
# View a boxplot
boxplot(x, horizontal = TRUE)

# Create another dataset
df2 <- data.frame(A = rnorm(100,50,10),
                  B = c(rnorm(99,50,10), 500),
                  C = c(rnorm(99,50,10), -1))
# View summary
summary(df2)
##        A               B                C        
##  Min.   :31.46   Min.   : 26.79   Min.   :-1.00  
##  1st Qu.:42.21   1st Qu.: 41.35   1st Qu.:45.29  
##  Median :50.20   Median : 50.67   Median :51.06  
##  Mean   :49.70   Mean   : 53.62   Mean   :50.88  
##  3rd Qu.:57.12   3rd Qu.: 56.57   3rd Qu.:58.13  
##  Max.   :72.21   Max.   :500.00   Max.   :76.44
# View histogram
hist(df2$B, breaks = 20)

# View boxplot
boxplot(df2)