Data Manipulation Tutorial

tidyr & lubridate & stringr & dplyr & data.table & some tricks

tidyr

#setwd("/home/creatrol/ws/R/Tutorials")
#library(devtools)
#devtools::install_github("hadley/tibble")
#devtools::install_github("hadley/dplyr")
#devtools::install_github("hadley/tidyr")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

Gather columns into key-value pairs

wide_df <- data.frame(col = c("X_a","y_b"),
                      A = c(1,2), b = c(2,5), C= c(3,6))
wide_df

##   col A b C
## 1 X_a 1 2 3
## 2 y_b 2 5 6

(long_df <- gather(wide_df, my_key, my_val, - col))

##   col my_key my_val
## 1 X_a      A      1
## 2 y_b      A      2
## 3 X_a      b      2
## 4 y_b      b      5
## 5 X_a      C      3
## 6 y_b      C      6

Spread key-value pairs into columns

spread(long_df, my_key, my_val)

##   col A b C
## 1 X_a 1 2 3
## 2 y_b 2 5 6

Separate one column intp multiple

4th arguement (… ,sep = “_“)

(separate_df <- separate(wide_df, col, c("upper", "lower")))

##   upper lower A b C
## 1     X     a 1 2 3
## 2     y     b 2 5 6

Unite multiple columns into one

4th arguement (… ,sep = “_“)

(unite_df <- unite(separate_df, col, upper, lower, sep = "-"))

##   col A b C
## 1 X-a 1 2 3
## 2 y-b 2 5 6

arrange

arrange(separate_df, lower)

##   upper lower A b C
## 1     X     a 1 2 3
## 2     y     b 2 5 6

arrange(separate_df, desc(lower))

##   upper lower A b C
## 1     y     b 2 5 6
## 2     X     a 1 2 3

lubridate

date & time

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

Basic functions

ymd("2016-08-31")

## [1] "2016-08-31"

ymd("2016 August 31")

## [1] "2016-08-31"

mdy("August 31, 2016")

## [1] "2016-08-31"

hms("13:30:09")

## [1] "13H 30M 9S"

ymd_hms("2016/08/31 13.30.09")

## [1] "2016-08-31 13:30:09 UTC"

stringr

library(stringr)

key functions

# Trim leading and trailing white space
str_trim("    this is a    test   ")

## [1] "this is a    test"

# Pad string with x
str_pad("22343", width = 7, side = "left", pad = "x")

## [1] "xx22343"

# Create character vector of names
friend <- c("Sarah", "Tom", "Tony")
# Search for string in vector
str_detect(friend, "Tony")

## [1] FALSE FALSE  TRUE

# Replace string in vector
str_replace(friend, "Tony", "David")

## [1] "Sarah" "Tom"   "David"

tolower("I am Not TALKING BOY!!")

## [1] "i am not talking boy!!"

toupper("i am a good one..")

## [1] "I AM A GOOD ONE.."

dplyr

#setwd("/home/creatrol/ws/R/Tutorials")
library(dplyr)
chicago <- readRDS("chicago.rds")
# To show all columns
options(dplyr.width = Inf)

tbl

Convert data.frame to table

chicagoTable <- tbl_df(chicago)
class(chicago)

## [1] "data.frame"

class(chicagoTable)

## [1] "tbl_df"     "tbl"        "data.frame"

names(chicago)

## [1] "city"       "tmpd"       "dptp"       "date"       "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2"   "no2tmean2"

names(chicagoTable)

## [1] "city"       "tmpd"       "dptp"       "date"       "pm25tmean2"
## [6] "pm10tmean2" "o3tmean2"   "no2tmean2"

cities <- chicagoTable$city
head(cities)

## [1] "chic" "chic" "chic" "chic" "chic" "chic"

summary(chicago)

##      city                tmpd             dptp             date           
##  Length:6940        Min.   :-16.00   Min.   :-25.62   Min.   :1987-01-01  
##  Class :character   1st Qu.: 35.00   1st Qu.: 27.00   1st Qu.:1991-10-01  
##  Mode  :character   Median : 51.00   Median : 39.88   Median :1996-07-01  
##                     Mean   : 50.31   Mean   : 40.34   Mean   :1996-07-01  
##                     3rd Qu.: 67.00   3rd Qu.: 55.75   3rd Qu.:2001-04-01  
##                     Max.   : 92.00   Max.   : 78.25   Max.   :2005-12-31  
##                     NA's   :1        NA's   :2                            
##    pm25tmean2      pm10tmean2        o3tmean2         no2tmean2     
##  Min.   : 1.70   Min.   :  2.00   Min.   : 0.1528   Min.   : 6.158  
##  1st Qu.: 9.70   1st Qu.: 21.50   1st Qu.:10.0729   1st Qu.:19.654  
##  Median :14.66   Median : 30.28   Median :18.5218   Median :24.556  
##  Mean   :16.23   Mean   : 33.90   Mean   :19.4355   Mean   :25.232  
##  3rd Qu.:20.60   3rd Qu.: 42.00   3rd Qu.:27.0010   3rd Qu.:30.139  
##  Max.   :61.50   Max.   :365.00   Max.   :66.5875   Max.   :62.480  
##  NA's   :4447    NA's   :242

summary(chicagoTable)

##      city                tmpd             dptp             date           
##  Length:6940        Min.   :-16.00   Min.   :-25.62   Min.   :1987-01-01  
##  Class :character   1st Qu.: 35.00   1st Qu.: 27.00   1st Qu.:1991-10-01  
##  Mode  :character   Median : 51.00   Median : 39.88   Median :1996-07-01  
##                     Mean   : 50.31   Mean   : 40.34   Mean   :1996-07-01  
##                     3rd Qu.: 67.00   3rd Qu.: 55.75   3rd Qu.:2001-04-01  
##                     Max.   : 92.00   Max.   : 78.25   Max.   :2005-12-31  
##                     NA's   :1        NA's   :2                            
##    pm25tmean2      pm10tmean2        o3tmean2         no2tmean2     
##  Min.   : 1.70   Min.   :  2.00   Min.   : 0.1528   Min.   : 6.158  
##  1st Qu.: 9.70   1st Qu.: 21.50   1st Qu.:10.0729   1st Qu.:19.654  
##  Median :14.66   Median : 30.28   Median :18.5218   Median :24.556  
##  Mean   :16.23   Mean   : 33.90   Mean   :19.4355   Mean   :25.232  
##  3rd Qu.:20.60   3rd Qu.: 42.00   3rd Qu.:27.0010   3rd Qu.:30.139  
##  Max.   :61.50   Max.   :365.00   Max.   :66.5875   Max.   :62.480  
##  NA's   :4447    NA's   :242

chicagoTable

## # A tibble: 6,940 × 8
##     city  tmpd   dptp       date pm25tmean2 pm10tmean2  o3tmean2 no2tmean2
## *  <chr> <dbl>  <dbl>     <date>      <dbl>      <dbl>     <dbl>     <dbl>
## 1   chic  31.5 31.500 1987-01-01         NA   34.00000  4.250000  19.98810
## 2   chic  33.0 29.875 1987-01-02         NA         NA  3.304348  23.19099
## 3   chic  33.0 27.375 1987-01-03         NA   34.16667  3.333333  23.81548
## 4   chic  29.0 28.625 1987-01-04         NA   47.00000  4.375000  30.43452
## 5   chic  32.0 28.875 1987-01-05         NA         NA  4.750000  30.33333
## 6   chic  40.0 35.125 1987-01-06         NA   48.00000  5.833333  25.77233
## 7   chic  34.5 26.750 1987-01-07         NA   41.00000  9.291667  20.58171
## 8   chic  29.0 22.000 1987-01-08         NA   36.00000 11.291667  17.03723
## 9   chic  26.5 29.000 1987-01-09         NA   33.28571  4.500000  23.38889
## 10  chic  32.5 27.750 1987-01-10         NA         NA  4.958333  19.54167
## # ... with 6,930 more rows

str(chicago)

## 'data.frame':    6940 obs. of  8 variables:
##  $ city      : chr  "chic" "chic" "chic" "chic" ...
##  $ tmpd      : num  31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
##  $ dptp      : num  31.5 29.9 27.4 28.6 28.9 ...
##  $ date      : Date, format: "1987-01-01" "1987-01-02" ...
##  $ pm25tmean2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pm10tmean2: num  34 NA 34.2 47 NA ...
##  $ o3tmean2  : num  4.25 3.3 3.33 4.38 4.75 ...
##  $ no2tmean2 : num  20 23.2 23.8 30.4 30.3 ...

glimpse(chicagoTable)

## Observations: 6,940
## Variables: 8
## $ city       <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd       <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp       <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date       <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2   <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2  <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...

unique(chicagoTable$city)

## [1] "chic"

chicagoTable[1,1]

## # A tibble: 1 × 1
##    city
##   <chr>
## 1  chic

change labels

changeName <- c("chic" = "chicago", "ny" = "newYork")
chicagoTable$cityName <- changeName[chicagoTable$city]
glimpse(chicagoTable)

## Observations: 6,940
## Variables: 9
## $ city       <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd       <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp       <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date       <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2   <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2  <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
## $ cityName   <chr> "chicago", "chicago", "chicago", "chicago", "chicag...

chicago$cityName <- changeName[chicago$city]
glimpse(chicago)

## Observations: 6,940
## Variables: 9
## $ city       <chr> "chic", "chic", "chic", "chic", "chic", "chic", "ch...
## $ tmpd       <dbl> 31.5, 33.0, 33.0, 29.0, 32.0, 40.0, 34.5, 29.0, 26....
## $ dptp       <dbl> 31.500, 29.875, 27.375, 28.625, 28.875, 35.125, 26....
## $ date       <date> 1987-01-01, 1987-01-02, 1987-01-03, 1987-01-04, 19...
## $ pm25tmean2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pm10tmean2 <dbl> 34.00000, NA, 34.16667, 47.00000, NA, 48.00000, 41....
## $ o3tmean2   <dbl> 4.250000, 3.304348, 3.333333, 4.375000, 4.750000, 5...
## $ no2tmean2  <dbl> 19.98810, 23.19099, 23.81548, 30.43452, 30.33333, 2...
## $ cityName   <chr> "chicago", "chicago", "chicago", "chicago", "chicag...

Select (column)

starts_with(“X”): every name that starts with “X”,
ends_with(“X”): every name that ends with “X”,
contains(“X”): every name that contains “X”,
matches(“X”): every name that matches “X”, where “X” can be a regular expression,
num_range(“x”, 1:5): the variables named x01, x02, x03, x04 and x05,
one_of(x): every name that appears in x, which should be a character vector.

chicago <- readRDS("chicago.rds")
dim(chicago)

## [1] 6940    8

head(select(chicago,1:5))

##   city tmpd   dptp       date pm25tmean2
## 1 chic 31.5 31.500 1987-01-01         NA
## 2 chic 33.0 29.875 1987-01-02         NA
## 3 chic 33.0 27.375 1987-01-03         NA
## 4 chic 29.0 28.625 1987-01-04         NA
## 5 chic 32.0 28.875 1987-01-05         NA
## 6 chic 40.0 35.125 1987-01-06         NA

names(chicago)[1:3]

## [1] "city" "tmpd" "dptp"

head(select(chicago,city:dptp))

##   city tmpd   dptp
## 1 chic 31.5 31.500
## 2 chic 33.0 29.875
## 3 chic 33.0 27.375
## 4 chic 29.0 28.625
## 5 chic 32.0 28.875
## 6 chic 40.0 35.125

head(select(chicagoTable,city:dptp))

## # A tibble: 6 × 3
##    city  tmpd   dptp
##   <chr> <dbl>  <dbl>
## 1  chic  31.5 31.500
## 2  chic  33.0 29.875
## 3  chic  33.0 27.375
## 4  chic  29.0 28.625
## 5  chic  32.0 28.875
## 6  chic  40.0 35.125

head(select(chicago,1:3, pm25tmean2),10)

##    city tmpd   dptp pm25tmean2
## 1  chic 31.5 31.500         NA
## 2  chic 33.0 29.875         NA
## 3  chic 33.0 27.375         NA
## 4  chic 29.0 28.625         NA
## 5  chic 32.0 28.875         NA
## 6  chic 40.0 35.125         NA
## 7  chic 34.5 26.750         NA
## 8  chic 29.0 22.000         NA
## 9  chic 26.5 29.000         NA
## 10 chic 32.5 27.750         NA

head(select(chicago, -(city:dptp)))

##         date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 1987-01-02         NA         NA 3.304348  23.19099
## 3 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 1987-01-05         NA         NA 4.750000  30.33333
## 6 1987-01-06         NA   48.00000 5.833333  25.77233

#################################### Equivalent base R
i <- match("city", names(chicago))
j <- match("dptp", names(chicago))
head(chicago[,-(i:j)])

##         date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 1987-01-02         NA         NA 3.304348  23.19099
## 3 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 1987-01-05         NA         NA 4.750000  30.33333
## 6 1987-01-06         NA   48.00000 5.833333  25.77233

####################################
head(select(chicago, starts_with("d")))

##     dptp       date
## 1 31.500 1987-01-01
## 2 29.875 1987-01-02
## 3 27.375 1987-01-03
## 4 28.625 1987-01-04
## 5 28.875 1987-01-05
## 6 35.125 1987-01-06

head(select(chicago, ends_with("2")))

##   pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1         NA   34.00000 4.250000  19.98810
## 2         NA         NA 3.304348  23.19099
## 3         NA   34.16667 3.333333  23.81548
## 4         NA   47.00000 4.375000  30.43452
## 5         NA         NA 4.750000  30.33333
## 6         NA   48.00000 5.833333  25.77233

head(select(chicago, matches("mean")))

##   pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1         NA   34.00000 4.250000  19.98810
## 2         NA         NA 3.304348  23.19099
## 3         NA   34.16667 3.333333  23.81548
## 4         NA   47.00000 4.375000  30.43452
## 5         NA         NA 4.750000  30.33333
## 6         NA   48.00000 5.833333  25.77233

head(select(chicago, contains("mean")))

##   pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1         NA   34.00000 4.250000  19.98810
## 2         NA         NA 3.304348  23.19099
## 3         NA   34.16667 3.333333  23.81548
## 4         NA   47.00000 4.375000  30.43452
## 5         NA         NA 4.750000  30.33333
## 6         NA   48.00000 5.833333  25.77233

Filter (row)

chi.f <- filter(chicago, pm25tmean2 > 30)
head(select(chi.f, 1:3, pm25tmean2), 10)

##    city tmpd dptp pm25tmean2
## 1  chic   23 21.9      38.10
## 2  chic   28 25.8      33.95
## 3  chic   55 51.3      39.40
## 4  chic   59 53.7      35.40
## 5  chic   57 52.0      33.30
## 6  chic   57 56.0      32.10
## 7  chic   75 65.8      56.50
## 8  chic   61 59.0      33.80
## 9  chic   73 60.3      30.30
## 10 chic   78 67.1      41.40

chi.f <- filter(chicagoTable, pm25tmean2 > 30)
head(select(chi.f, 1:3, pm25tmean2), 10)

## # A tibble: 10 × 4
##     city  tmpd  dptp pm25tmean2
##    <chr> <dbl> <dbl>      <dbl>
## 1   chic    23  21.9      38.10
## 2   chic    28  25.8      33.95
## 3   chic    55  51.3      39.40
## 4   chic    59  53.7      35.40
## 5   chic    57  52.0      33.30
## 6   chic    57  56.0      32.10
## 7   chic    75  65.8      56.50
## 8   chic    61  59.0      33.80
## 9   chic    73  60.3      30.30
## 10  chic    78  67.1      41.40

chi.f <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
head(select(chi.f, 1:3, pm25tmean2, tmpd), 10)

##    city tmpd dptp pm25tmean2
## 1  chic   81 71.2    39.6000
## 2  chic   81 70.4    31.5000
## 3  chic   82 72.2    32.3000
## 4  chic   84 72.9    43.7000
## 5  chic   85 72.6    38.8375
## 6  chic   84 72.6    38.2000
## 7  chic   82 67.4    33.0000
## 8  chic   82 63.5    42.5000
## 9  chic   81 70.4    33.1000
## 10 chic   82 66.2    38.8500

Arrange (reorder row)

Ascending

chicago <- arrange(chicago,date)
head(select(chicago,date,pm25tmean2),3)

##         date pm25tmean2
## 1 1987-01-01         NA
## 2 1987-01-02         NA
## 3 1987-01-03         NA

tail(select(chicago,date,pm25tmean2),3)

##            date pm25tmean2
## 6938 2005-12-29    7.45000
## 6939 2005-12-30   15.05714
## 6940 2005-12-31   15.00000

chicagoTable <- arrange(chicagoTable,date)
head(select(chicagoTable,date,pm25tmean2),3)

## # A tibble: 3 × 2
##         date pm25tmean2
##       <date>      <dbl>
## 1 1987-01-01         NA
## 2 1987-01-02         NA
## 3 1987-01-03         NA

head(select(chicagoTable,date,pm25tmean2),3)

## # A tibble: 3 × 2
##         date pm25tmean2
##       <date>      <dbl>
## 1 1987-01-01         NA
## 2 1987-01-02         NA
## 3 1987-01-03         NA

Descending

chicago <- arrange(chicago, desc(date))
head(select(chicago,date,pm25tmean2),3)

##         date pm25tmean2
## 1 2005-12-31   15.00000
## 2 2005-12-30   15.05714
## 3 2005-12-29    7.45000

tail(select(chicago,date,pm25tmean2),3)

##            date pm25tmean2
## 6938 1987-01-03         NA
## 6939 1987-01-02         NA
## 6940 1987-01-01         NA

Rename (variables / column)

head(chicago[,1:5],3)

##   city tmpd dptp       date pm25tmean2
## 1 chic   35 30.1 2005-12-31   15.00000
## 2 chic   36 31.0 2005-12-30   15.05714
## 3 chic   35 29.4 2005-12-29    7.45000

chicago <- rename(chicago, dewpoint = dptp,
                  pm25 = pm25tmean2)
head(chicago[,1:5],3)

##   city tmpd dewpoint       date     pm25
## 1 chic   35     30.1 2005-12-31 15.00000
## 2 chic   36     31.0 2005-12-30 15.05714
## 3 chic   35     29.4 2005-12-29  7.45000

Mutate

chicago <- mutate(chicago,
                  pm25detrend = pm25 - mean(pm25, na.rm = T))
head(select(chicago, pm25, pm25detrend))

##       pm25 pm25detrend
## 1 15.00000   -1.230958
## 2 15.05714   -1.173815
## 3  7.45000   -8.780958
## 4 17.75000    1.519042
## 5 23.56000    7.329042
## 6  8.40000   -7.830958

tail(select(chicago, pm25, pm25detrend))

##      pm25 pm25detrend
## 6935   NA          NA
## 6936   NA          NA
## 6937   NA          NA
## 6938   NA          NA
## 6939   NA          NA
## 6940   NA          NA

ifelse

chicago <- mutate(chicago, 
                  tempcatifelse = ifelse(tmpd <= 80, "cold", "hot"))
head(chicago,3)

##   city tmpd dewpoint       date     pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5 2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2 3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5 6.794837  19.97222
##   pm25detrend tempcatifelse
## 1   -1.230958          cold
## 2   -1.173815          cold
## 3   -8.780958          cold

Group_by (statistics summarize) summarize

One sample

head(chicago,3)

##   city tmpd dewpoint       date     pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5 2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2 3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5 6.794837  19.97222
##   pm25detrend tempcatifelse
## 1   -1.230958          cold
## 2   -1.173815          cold
## 3   -8.780958          cold

chicago <- mutate(chicago,
                  tempcat = factor(1 * (tmpd > 80),
                            labels = c("cold","hot")))
head(chicago,3)

##   city tmpd dewpoint       date     pm25 pm10tmean2 o3tmean2 no2tmean2
## 1 chic   35     30.1 2005-12-31 15.00000       23.5 2.531250  13.25000
## 2 chic   36     31.0 2005-12-30 15.05714       19.2 3.034420  22.80556
## 3 chic   35     29.4 2005-12-29  7.45000       23.5 6.794837  19.97222
##   pm25detrend tempcatifelse tempcat
## 1   -1.230958          cold    cold
## 2   -1.173815          cold    cold
## 3   -8.780958          cold    cold

hotcold <- group_by(chicago, tempcat)
class(hotcold)

## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

summarize(hotcold, pm25 = mean(pm25, na.rm = T),
          o3 = max(o3tmean2),
          no2 = median(no2tmean2))

## # A tibble: 3 × 4
##   tempcat     pm25        o3      no2
##    <fctr>    <dbl>     <dbl>    <dbl>
## 1    cold 15.97807 66.587500 24.54924
## 2     hot 26.48118 62.969656 24.93870
## 3      NA 47.73750  9.416667 37.44444

Aggregate functions

min(x) - minimum value of vector x.
max(x) - maximum value of vector x.
mean(x) - mean value of vector x.
median(x) - median value of vector x.
quantile(x, p) - pth quantile of vector x.
sd(x) - standard deviation of vector x.
var(x) - variance of vector x.
IQR(x) - Inter Quartile Range (IQR) of vector x.
diff(range(x)) - total range of vector x.

dplyr aggregate functions

first(x) - The first element of vector x.
last(x) - The last element of vector x.
nth(x, n) - The nth element of vector x.
n() - The number of rows in the data.frame or group of observations that summarise() describes.
n_distinct(x) - The number of unique values in vector x.

chicago <- readRDS("chicago.rds")
head(chicago)

##   city tmpd   dptp       date pm25tmean2 pm10tmean2 o3tmean2 no2tmean2
## 1 chic 31.5 31.500 1987-01-01         NA   34.00000 4.250000  19.98810
## 2 chic 33.0 29.875 1987-01-02         NA         NA 3.304348  23.19099
## 3 chic 33.0 27.375 1987-01-03         NA   34.16667 3.333333  23.81548
## 4 chic 29.0 28.625 1987-01-04         NA   47.00000 4.375000  30.43452
## 5 chic 32.0 28.875 1987-01-05         NA         NA 4.750000  30.33333
## 6 chic 40.0 35.125 1987-01-06         NA   48.00000 5.833333  25.77233

summarise(chicago, num = nth(tmpd, 2))

##   num
## 1  33

summarise(chicago, uniqueNumber = n_distinct(city))

##   uniqueNumber
## 1            1

%>% the pipe operator

chicago %>% mutate(month = as.POSIXlt(date)$mon + 1) %>% group_by(month) %>% summarize(pm25 = mean(pm25tmean2, na.rm = T),
          o3 = max(o3tmean2, na.rm = T),
          no2 = median(no2tmean2, na.rm = T))

## # A tibble: 12 × 4
##    month     pm25       o3      no2
##    <dbl>    <dbl>    <dbl>    <dbl>
## 1      1 17.76996 28.22222 25.35417
## 2      2 20.37513 37.37500 26.78034
## 3      3 17.40818 39.05000 26.76984
## 4      4 13.85879 47.94907 25.03125
## 5      5 14.07420 52.75000 24.22222
## 6      6 15.86461 66.58750 25.01140
## 7      7 16.57087 59.54167 22.38442
## 8      8 16.93380 53.96701 22.98333
## 9      9 15.91279 57.48864 24.47917
## 10    10 14.23557 47.09275 24.15217
## 11    11 15.15794 29.45833 23.56537
## 12    12 17.52221 27.70833 24.45773

dplyr and mySQL databases

# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr", 
                   host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com", 
                   port = 3306, 
                   user = "student",
                   password = "datacamp")

# Reference a table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")

data.table

the structure of data.table is DT[i,j,by]

Create and Subset (filter row i)

library(data.table)

## -------------------------------------------------------------------------

## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!

## -------------------------------------------------------------------------

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:lubridate':
## 
##     hour, mday, month, quarter, wday, week, yday, year

## The following objects are masked from 'package:dplyr':
## 
##     between, last

DT <- data.table(x = c("A", "B", "C"), y = 1:9, z = (2:10) *2)
DT[3:4]                                         # no need ","

##    x y  z
## 1: C 3  8
## 2: A 4 10

DT[.N]                                          # the last row

##    x y  z
## 1: C 9 20

Select / Summarize Columns and Add function (j)

# select and mutate in dplyr (a little diff)
DT[,.(x, y, diff = z-y, Ymean = mean(y))]

##    x y diff Ymean
## 1: A 1    3     5
## 2: B 2    4     5
## 3: C 3    5     5
## 4: A 4    6     5
## 5: B 5    7     5
## 6: C 6    8     5
## 7: A 7    9     5
## 8: B 8   10     5
## 9: C 9   11     5

# summarize in dplyr
DT[,.(Zmean = mean(z), Yvar = var(y))]

##    Zmean Yvar
## 1:    12  7.5

# Insert functions
DT[, plot(y,z)]

## NULL

DT[, {print(x)
      hist(z)
      NULL   }]

## [1] "A" "B" "C" "A" "B" "C" "A" "B" "C"

## NULL

Doing (j) by Group

DT[, .N, by = .(x)]                                 # the number of rows in this group

##    x N
## 1: A 3
## 2: B 3
## 3: C 3

DT[,.(Ymean = mean(y)), by = .(x)]

##    x Ymean
## 1: A     4
## 2: B     5
## 3: C     6

DT[, .(y, Zmean = mean(z)), by = .(Grp = y %% 2)]

##    Grp y Zmean
## 1:   1 1    12
## 2:   1 3    12
## 3:   1 5    12
## 4:   1 7    12
## 5:   1 9    12
## 6:   0 2    12
## 7:   0 4    12
## 8:   0 6    12
## 9:   0 8    12

DT[, .(Zmean = mean(z)), by = .(Grp = y %% 2)]

##    Grp Zmean
## 1:   1    12
## 2:   0    12

DT[, mean(z), by = y %% 2]

##    y V1
## 1: 1 12
## 2: 0 12

DT[2:7, mean(z), by = y %% 2]

##    y V1
## 1: 0 10
## 2: 1 12

Chaining

ans <- DT[2:9, mean(z), by = x]
ans[order(x)]

##    x V1
## 1: A 13
## 2: B 12
## 3: C 14

####################################### by Chaining
ans <- DT[2:9, mean(z), by = x][order(x)]
ans

##    x V1
## 1: A 13
## 2: B 12
## 3: C 14

Subset of Data (.SD) - apply same function to columns (summarize)

DT[, lapply(.SD, median), by = x]

##    x y  z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14

DT[, lapply(.SD, median), .SDcols = 2:3]

##    y  z
## 1: 5 12

DT[, lapply(.SD, median), .SDcols = 2:3, by = x]

##    x y  z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14

DT[, lapply(.SD, median), by = x, .SDcols = c("y","z")]

##    x y  z
## 1: A 4 10
## 2: B 5 12
## 3: C 6 14

DT[, .(lapply(.SD, median)), by = x, .SDcols = c("y","z")]

##    x V1
## 1: A  4
## 2: A 10
## 3: B  5
## 4: B 12
## 5: C  6
## 6: C 14

DT[, .(lapply(.SD, median), n = .N), by = .(x, y>5), .SDcols = c("y","z")]

##     x     y  V1 n
##  1: A FALSE 2.5 2
##  2: A FALSE   7 2
##  3: B FALSE 3.5 2
##  4: B FALSE   9 2
##  5: C FALSE   3 1
##  6: C FALSE   8 1
##  7: C  TRUE 7.5 2
##  8: C  TRUE  17 2
##  9: A  TRUE   7 1
## 10: A  TRUE  16 1
## 11: B  TRUE   8 1
## 12: B  TRUE  18 1

Using := in j

# Add and update columns (mutate in dplyr)
DT[, c("j","k") := .(rev(y),10:2)];DT

##    x y  z j  k
## 1: A 1  4 9 10
## 2: B 2  6 8  9
## 3: C 3  8 7  8
## 4: A 4 10 6  7
## 5: B 5 12 5  6
## 6: C 6 14 4  5
## 7: A 7 16 3  4
## 8: B 8 18 2  3
## 9: C 9 20 1  2

# Remove the columns
DT[, j := NULL];DT

##    x y  z  k
## 1: A 1  4 10
## 2: B 2  6  9
## 3: C 3  8  8
## 4: A 4 10  7
## 5: B 5 12  6
## 6: C 6 14  5
## 7: A 7 16  4
## 8: B 8 18  3
## 9: C 9 20  2

DT[, c("k") := NULL];DT

##    x y  z
## 1: A 1  4
## 2: B 2  6
## 3: C 3  8
## 4: A 4 10
## 5: B 5 12
## 6: C 6 14
## 7: A 7 16
## 8: B 8 18
## 9: C 9 20

# Other way to remove
DT[, c("j","k") := .(rev(y),10:2)]
Mycols <- c("j", "k")
DT[, (Mycols) := NULL];DT

##    x y  z
## 1: A 1  4
## 2: B 2  6
## 3: C 3  8
## 4: A 4 10
## 5: B 5 12
## 6: C 6 14
## 7: A 7 16
## 8: B 8 18
## 9: C 9 20

# Functional
DT[, `:=` (j = rev(y), k = 10:2)];DT

##    x y  z j  k
## 1: A 1  4 9 10
## 2: B 2  6 8  9
## 3: C 3  8 7  8
## 4: A 4 10 6  7
## 5: B 5 12 5  6
## 6: C 6 14 4  5
## 7: A 7 16 3  4
## 8: B 8 18 2  3
## 9: C 9 20 1  2

# by Group
DT[, c := sum(y), by = x];DT

##    x y  z j  k  c
## 1: A 1  4 9 10 12
## 2: B 2  6 8  9 15
## 3: C 3  8 7  8 18
## 4: A 4 10 6  7 12
## 5: B 5 12 5  6 15
## 6: C 6 14 4  5 18
## 7: A 7 16 3  4 12
## 8: B 8 18 2  3 15
## 9: C 9 20 1  2 18

Using set()

for (i in 1:9) DT[, set := i+1]
DT
###################### by set(DT, index, column, value)
for (i in 1:9) set(DT, i , 7L, i + 1)
DT

##    x y  z j  k  c set
## 1: A 1  4 9 10 12   2
## 2: B 2  6 8  9 15   3
## 3: C 3  8 7  8 18   4
## 4: A 4 10 6  7 12   5
## 5: B 5 12 5  6 15   6
## 6: C 6 14 4  5 18   7
## 7: A 7 16 3  4 12   8
## 8: B 8 18 2  3 15   9
## 9: C 9 20 1  2 18  10

# rename column
setnames(DT, "y", "yrename")
names(DT)

## [1] "x"       "yrename" "z"       "j"       "k"       "c"       "set"

# order columns
setcolorder(DT, c("set", "x", "yrename", "k", "j", "c", "z"))
names(DT)

## [1] "set"     "x"       "yrename" "k"       "j"       "c"       "z"

Indexing (filter)

DT[x == "A"]

##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16

DT[x %in% c("A", "B")]

##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   3 B       2  9 8 15  6
## 3:   5 A       4  7 6 12 10
## 4:   6 B       5  6 5 15 12
## 5:   8 A       7  4 3 12 16
## 6:   9 B       8  3 2 15 18

# Another way
(w <- DT[, x == "A"])

## [1]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE

DT[w]

##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16

Keys

# create and use
setkey(DT,x)
DT

##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16
## 4:   3 B       2  9 8 15  6
## 5:   6 B       5  6 5 15 12
## 6:   9 B       8  3 2 15 18
## 7:   4 C       3  8 7 18  8
## 8:   7 C       6  5 4 18 14
## 9:  10 C       9  2 1 18 20

DT["A"]

##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16

# mult
DT["A", mult = "first"]

##    set x yrename  k j  c z
## 1:   2 A       1 10 9 12 4

DT["A", mult = "last"]

##    set x yrename k j  c  z
## 1:   8 A       7 4 3 12 16

# nomatch
DT[c("A", "D")]

##    set x yrename  k  j  c  z
## 1:   2 A       1 10  9 12  4
## 2:   5 A       4  7  6 12 10
## 3:   8 A       7  4  3 12 16
## 4:  NA D      NA NA NA NA NA

DT[c("A", "D"), nomatch = NA] # default

##    set x yrename  k  j  c  z
## 1:   2 A       1 10  9 12  4
## 2:   5 A       4  7  6 12 10
## 3:   8 A       7  4  3 12 16
## 4:  NA D      NA NA NA NA NA

DT[c("A", "D"), nomatch = 0]

##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16

# a two-column key
setkey(DT, x, z)
DT["A"]

##    set x yrename  k j  c  z
## 1:   2 A       1 10 9 12  4
## 2:   5 A       4  7 6 12 10
## 3:   8 A       7  4 3 12 16

DT[.("A", 10)]

##    set x yrename k j  c  z
## 1:   5 A       4 7 6 12 10

Rolling Joins

DT <- DT[, .(x,k,z)]
DT

##    x  k  z
## 1: A 10  4
## 2: A  7 10
## 3: A  4 16
## 4: B  9  6
## 5: B  6 12
## 6: B  3 18
## 7: C  8  8
## 8: C  5 14
## 9: C  2 20

setkey(DT, x, k)
DT[.("B")]

##    x k  z
## 1: B 3 18
## 2: B 6 12
## 3: B 9  6

DT[.("B", 5)]

##    x k  z
## 1: B 5 NA

# Ordered joins
DT[.("B", 5), roll = TRUE]        # forwards by the second key

##    x k  z
## 1: B 5 18

DT[.("B", 5), roll = "nearest"]   # nearest by the second key

##    x k  z
## 1: B 5 12

# Forwards and backwards
DT[.("B", 5), roll = +Inf]        # always forward

##    x k  z
## 1: B 5 18

DT[.("B", 5), roll = -Inf]        # always backword

##    x k  z
## 1: B 5 12

# Limited saleness
DT[.("B", 8), roll = 1]          # forward with limited steps

##    x k  z
## 1: B 8 NA

DT[.("B", 8), roll = 2]

##    x k  z
## 1: B 8 12

DT[.("B", 7), roll = -2]         # backward with limited steps

##    x k z
## 1: B 7 6

DT[.("B", 7), roll = -1]

##    x k  z
## 1: B 7 NA

# End control
DT[.("B", 11:12)]

##    x  k  z
## 1: B 11 NA
## 2: B 12 NA

DT[.("B", 11:12), roll = TRUE]

##    x  k z
## 1: B 11 6
## 2: B 12 6

DT[.("B", 11:12), roll = TRUE, rollends = FALSE]

##    x  k  z
## 1: B 11 NA
## 2: B 12 NA

Tricks

missing and outliers

Missing values

df <- data.frame(A = c(1,NA,8,NA),
                 B = c(3,NA,88,23),
                 C = c(2,45,3,1))
is.na(df)

##          A     B     C
## [1,] FALSE FALSE FALSE
## [2,]  TRUE  TRUE FALSE
## [3,] FALSE FALSE FALSE
## [4,]  TRUE FALSE FALSE

# Check if there is a NA
any(is.na(df))

## [1] TRUE

# Count the # of NA
sum(is.na(df))

## [1] 3

# summary
summary(df)

##        A              B              C        
##  Min.   :1.00   Min.   : 3.0   Min.   : 1.00  
##  1st Qu.:2.75   1st Qu.:13.0   1st Qu.: 1.75  
##  Median :4.50   Median :23.0   Median : 2.50  
##  Mean   :4.50   Mean   :38.0   Mean   :12.75  
##  3rd Qu.:6.25   3rd Qu.:55.5   3rd Qu.:13.50  
##  Max.   :8.00   Max.   :88.0   Max.   :45.00  
##  NA's   :2      NA's   :1

# Find rows with no NA
complete.cases(df)

## [1]  TRUE FALSE  TRUE FALSE

# Another way to remove rows with NAs
na.omit(df)

##   A  B C
## 1 1  3 2
## 3 8 88 3

Outliers and obvious errors

set.seed(10)
x <- c(rnorm(30, mean = 15, sd = 5), -5, 28, 35)
# View a boxplot
boxplot(x, horizontal = TRUE)

# Create another dataset
df2 <- data.frame(A = rnorm(100,50,10),
                  B = c(rnorm(99,50,10), 500),
                  C = c(rnorm(99,50,10), -1))
# View summary
summary(df2)

##        A               B                C        
##  Min.   :31.46   Min.   : 26.79   Min.   :-1.00  
##  1st Qu.:42.21   1st Qu.: 41.35   1st Qu.:45.29  
##  Median :50.20   Median : 50.67   Median :51.06  
##  Mean   :49.70   Mean   : 53.62   Mean   :50.88  
##  3rd Qu.:57.12   3rd Qu.: 56.57   3rd Qu.:58.13  
##  Max.   :72.21   Max.   :500.00   Max.   :76.44

# View histogram
hist(df2$B, breaks = 20)

# View boxplot
boxplot(df2)

Data Manipulation Tutorial

Puxin

28 Aug 2016

tidyr

Gather columns into key-value pairs

Spread key-value pairs into columns

Separate one column intp multiple

Unite multiple columns into one

arrange

lubridate

Basic functions

stringr

key functions

dplyr

tbl

Select (column)

Filter (row)

Arrange (reorder row)

Rename (variables / column)

Mutate

Group_by (statistics summarize) summarize

Aggregate functions

dplyr aggregate functions

%>% the pipe operator

dplyr and mySQL databases

data.table

Create and Subset (filter row i)

Select / Summarize Columns and Add function (j)

Doing (j) by Group

Chaining

Subset of Data (.SD) - apply same function to columns (summarize)

Using := in j

Using set()

Indexing (filter)

Keys

Rolling Joins

Tricks

Missing values

Outliers and obvious errors