Importing Data
TidyR
StringR
Lubridate
Data Structures
dplyr
Joins
Visualization (ggplot2)
Control Statements & Iteration
Function
Apply Family
Miscellaneous

Libraries

library(DT)
library(tidyverse)
library(tidyr)
library(stringr)
library(lubridate)
library(nycflights13)
library(GGally)

Importing Data

Reading CSV

baseR

# Base R reader: needs no extra package and returns a plain data.frame.
df <- read.csv("data/flights.csv")

readr

# readr reader: same file as above, but the result is a tibble.
library(readr)
df <- read_csv("data/flights.csv")

data.table

# data.table reader: fread() returns a data.table (which is also a data.frame).
# NOTE(review): data.table is attached after tidyverse/lubridate here, so its
# same-named exports (e.g. wday(), month(), between()) mask theirs from now on.
library(data.table)
df <- fread("data/flights.csv")

Reading Excel

library(readxl)
# List the worksheet names first so one can be passed to read_excel() below.
excel_sheets("data/aircraft.xlsx")
## [1] "Bombers"              "Fighters"             "Trainers"            
## [4] "UAV_Drones"           "Tankers_Transporters"
# Read one worksheet, selected by name; the result is a tibble.
df <- read_excel("data/aircraft.xlsx", sheet = "Bombers")

TidyR

# Load the wide example table (one row per country, one column per year) and
# show it as an interactive table with DT::datatable().
# header = TRUE forces the first row to be used as column names
# (presumably needed because the year headers look numeric - TODO confirm).
cases <- fread("data/cases.csv", header = TRUE)
datatable(cases)

gather

library(tidyr)
# gather() reshapes wide data into long key/value form.
# NOTE: gather() is superseded since tidyr 1.0.0; pivot_longer() is the
# recommended replacement for new code.
# Year - name of the new key column (receives the old column names)
# population - name of the new value column (receives the cell values)
# 2:4 - columns to collapse, given here by position
df <-
cases %>% 
  gather(Year, population, 2:4)

datatable(df)
# Alternate Code: four equivalent ways to pick the same year columns
# (name range, individual names, positions, or everything except country)
df <- cases %>% gather(Year, population, '2011':'2013')
df <- cases %>% gather(Year, population, '2011', '2012', '2013')
df <- cases %>% gather(Year, population, 2:4)
df <- cases %>% gather(Year, population, -country)

spread

# spread() is the inverse of gather(): long key/value data back to wide form.
# NOTE: spread() is superseded since tidyr 1.0.0; pivot_wider() is the
# recommended replacement for new code.
# Year - column whose values become the new column names
# population - column whose values fill the new columns
df %>% 
  spread(Year, population)
##   country  2011  2012  2013
## 1      DE  5800  6000  6200
## 2      FR  7000  6900  7000
## 3      US 15000 14000 13000

separate

# Small example frame with a single column of "left.right" strings.
df <- data.frame(x = c("a.b", "a.d", "b.c"))
datatable(df)
# x - column to split into multiple columns
# c("A", "B") - names of new columns
# No sep is given, so separate() uses its default, which splits on any run of
# non-alphanumeric characters (here the ".").
df <-
  df %>% 
    separate(x, c("A", "B"))

datatable(df)

unite

# x - name of the new merged column
# A, B - columns to merge (they are dropped from the result by default)
# sep = "." - separator placed between the merged values
df %>% 
  unite(x,A,B,sep=".")
##     x
## 1 a.b
## 2 a.d
## 3 b.c

example

# Worked example: tidy a messy table by chaining the verbs shown above.
df <- read_rds("data/bomber_mess.rds") 
datatable(df)
# 1. unite(): join prefix and number into one MD column, separated by "-"
# 2. separate(): split Metric at "_" into FY and Output
# 3. spread(): give each distinct Output value its own column, filled from Value
df <- 
  df %>% 
    unite(MD, prefix, number, sep="-") %>% 
    separate(Metric, c("FY","Output"), sep="_") %>% 
    spread(Output, Value)

datatable(df)

StringR

str_sub

# x - string
# 1 - start position (1-based, inclusive)
# 3 - end position (inclusive)
x <- "akshay"
str_sub(x,1,3)
## [1] "aks"

str_pad

x <- "abc"
str_pad(x, 5,"left") # pads to width 5 on the left, which is also the default side
## [1] "  abc"
str_pad(x, 5,"right") # pads to width 5 on the right
## [1] "abc  "

str_trim

x <- c("  a   ", "b   ",  "   c")
# side = "left": strip leading whitespace only
str_trim(x, "left")
## [1] "a   " "b   " "c"
# default side ("both"): strip leading and trailing whitespace
str_trim(x)
## [1] "a" "b" "c"

str_to_upper, str_to_title, str_to_lower

x <- "I like horses."
# Upper-case every letter
str_to_upper(x)
## [1] "I LIKE HORSES."
# Capitalise the first letter of each word
str_to_title(x)
## [1] "I Like Horses."
# Lower-case every letter
str_to_lower(x)
## [1] "i like horses."

str_detect

# fruits - character vector to search
# "apple" - pattern to match (treated as a regular expression by default)
# The result has one TRUE/FALSE per element of fruits.
fruits <- c(
  "apple", 
  "orange",
  "banana",
  "grapes",
  "grapesgrapes"
)
str_detect(fruits, "apple")
## [1]  TRUE FALSE FALSE FALSE FALSE

str_replace

# fruits - character vector to modify
# "grapes" - pattern to replace
# "guava" - replacement text
# str_replace() changes only the first match within each string
str_replace(fruits, "grapes", "guava")
## [1] "apple"       "orange"      "banana"      "guava"       "guavagrapes"
# str_replace_all() changes every match within each string
str_replace_all(fruits, "grapes", "guava")
## [1] "apple"      "orange"     "banana"     "guava"      "guavaguava"

Lubridate

ymd, mdy, dmy

library(lubridate)
# Each parser is named after the order of the components in its input
# (y = year, m = month, d = day); all three calls yield the same date.
ymd("20110604")
## [1] "2011-06-04"
mdy("06-04-2011")
## [1] "2011-06-04"
dmy("04/06/2011")
## [1] "2011-06-04"

ymd_hms

# ymd_hms() parses a full date-time; no tz is given, so the result is in UTC.
arrive <- ymd_hms("2011-06-04 12:00:00")
arrive
## [1] "2011-06-04 12:00:00 UTC"
leave <- ymd_hms("2011-08-10 14:00:00")
leave
## [1] "2011-08-10 14:00:00 UTC"

second, minute, hour, day, wday, yday, week, month, year

# Day of the week as a number; with lubridate's default week start 1 = Sunday,
# so the 7 printed below means Saturday.
# The call is namespace-qualified because data.table, which this document
# attaches after lubridate, also exports wday() and would otherwise mask it
# (re-running library(lubridate) does not change the search order).
lubridate::wday(arrive)
## [1] 7

Data Structures

vectors

# Built-in character vector holding the 50 US state names.
state.name
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"
# Index with a range of positions: elements 1 through 5.
state.name[1:5]
## [1] "Alabama"    "Alaska"     "Arizona"    "Arkansas"   "California"
# Index with arbitrary positions; results come back in the order requested.
state.name[c(23, 6, 15, 34, 35)]
## [1] "Minnesota"    "Colorado"     "Iowa"         "North Dakota"
## [5] "Ohio"

matrix

# Built-in numeric matrix with named rows (age groups) and named columns.
VADeaths
##       Rural Male Rural Female Urban Male Urban Female
## 50-54       11.7          8.7       15.4          8.4
## 55-59       18.1         11.7       24.3         13.6
## 60-64       26.9         20.3       37.0         19.3
## 65-69       41.0         30.9       54.6         35.1
## 70-74       66.0         54.3       71.1         50.0
# Rows 1-3; the empty column index means "all columns".
VADeaths[1:3,]
##       Rural Male Rural Female Urban Male Urban Female
## 50-54       11.7          8.7       15.4          8.4
## 55-59       18.1         11.7       24.3         13.6
## 60-64       26.9         20.3       37.0         19.3
# All rows (empty row index), columns 2 and 4.
VADeaths[, c(2, 4)]
##       Rural Female Urban Female
## 50-54          8.7          8.4
## 55-59         11.7         13.6
## 60-64         20.3         19.3
## 65-69         30.9         35.1
## 70-74         54.3         50.0
# Both indices at once: rows 1-3 of columns 2 and 4.
VADeaths[1:3, c(2, 4)]
##       Rural Female Urban Female
## 50-54          8.7          8.4
## 55-59         11.7         13.6
## 60-64         20.3         19.3

dataframe

# Built-in data frame; the row names hold the car models.
mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# First 10 rows, all columns.
mtcars[1:10,]
##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230          22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280          19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
# All rows; columns are selected by name rather than by position.
mtcars[, c("mpg", "cyl", "vs")]
##                      mpg cyl vs
## Mazda RX4           21.0   6  0
## Mazda RX4 Wag       21.0   6  0
## Datsun 710          22.8   4  1
## Hornet 4 Drive      21.4   6  1
## Hornet Sportabout   18.7   8  0
## Valiant             18.1   6  1
## Duster 360          14.3   8  0
## Merc 240D           24.4   4  1
## Merc 230            22.8   4  1
## Merc 280            19.2   6  1
## Merc 280C           17.8   6  1
## Merc 450SE          16.4   8  0
## Merc 450SL          17.3   8  0
## Merc 450SLC         15.2   8  0
## Cadillac Fleetwood  10.4   8  0
## Lincoln Continental 10.4   8  0
## Chrysler Imperial   14.7   8  0
## Fiat 128            32.4   4  1
## Honda Civic         30.4   4  1
## Toyota Corolla      33.9   4  1
## Toyota Corona       21.5   4  1
## Dodge Challenger    15.5   8  0
## AMC Javelin         15.2   8  0
## Camaro Z28          13.3   8  0
## Pontiac Firebird    19.2   8  0
## Fiat X1-9           27.3   4  1
## Porsche 914-2       26.0   4  0
## Lotus Europa        30.4   4  1
## Ford Pantera L      15.8   8  0
## Ferrari Dino        19.7   6  0
## Maserati Bora       15.0   8  0
## Volvo 142E          21.4   4  1
# First 10 rows of the three named columns.
mtcars[1:10, c("mpg", "cyl", "vs")]
##                    mpg cyl vs
## Mazda RX4         21.0   6  0
## Mazda RX4 Wag     21.0   6  0
## Datsun 710        22.8   4  1
## Hornet 4 Drive    21.4   6  1
## Hornet Sportabout 18.7   8  0
## Valiant           18.1   6  1
## Duster 360        14.3   8  0
## Merc 240D         24.4   4  1
## Merc 230          22.8   4  1
## Merc 280          19.2   6  1

list

# A named list can hold elements of different types: here an integer vector
# and a 3 x 3 integer matrix (filled column by column).
l1 <- list(
  item1 = 1:3,
  item2 = matrix(1:9, nrow = 3)
)

l1
## $item1
## [1] 1 2 3
## 
## $item2
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
l1[2] # single bracket keeps the container: a one-element list holding the matrix
## $item2
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
l1[[2]] # double bracket extracts the element itself: the matrix
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
l1$item2 # $name extracts the element by name: the same matrix
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
l1$item2[3, 1] # extract the matrix, then take row 3, column 1
## [1] 3

dplyr

filter: pick observations (rows) that satisfy one or more conditions

library(dplyr)
library(nycflights13)
# Several conditions are combined with AND: 1 January flights with dep_delay > 0.
filter(flights, month == 1, day == 1, dep_delay > 0)
## # A tibble: 352 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      601            600         1      844
##  5  2013     1     1      608            600         8      807
##  6  2013     1     1      611            600        11      945
##  7  2013     1     1      613            610         3      925
##  8  2013     1     1      623            610        13      920
##  9  2013     1     1      632            608        24      740
## 10  2013     1     1      644            636         8      931
## # ... with 342 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Equality test: December flights only.
filter(flights, month == 12)
## # A tibble: 28,135 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12     1       13           2359        14      446
##  2  2013    12     1       17           2359        18      443
##  3  2013    12     1      453            500        -7      636
##  4  2013    12     1      520            515         5      749
##  5  2013    12     1      536            540        -4      845
##  6  2013    12     1      540            550       -10     1005
##  7  2013    12     1      541            545        -4      734
##  8  2013    12     1      546            545         1      826
##  9  2013    12     1      549            600       -11      648
## 10  2013    12     1      550            600       -10      825
## # ... with 28,125 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Inequality test: every month except December.
filter(flights, month != 12)
## # A tibble: 308,641 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 308,631 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Set membership: month is either 11 or 12.
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     1        5           2359         6      352
##  2  2013    11     1       35           2250       105      123
##  3  2013    11     1      455            500        -5      641
##  4  2013    11     1      539            545        -6      856
##  5  2013    11     1      542            545        -3      831
##  6  2013    11     1      549            600       -11      912
##  7  2013    11     1      550            600       -10      705
##  8  2013    11     1      554            600        -6      659
##  9  2013    11     1      554            600        -6      826
## 10  2013    11     1      554            600        -6      749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# filter() keeps only rows where the condition is TRUE, so rows whose
# arr_delay is NA are dropped along with those above 120.
filter(flights, arr_delay <= 120)
## # A tibble: 317,312 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 317,302 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Negated condition: arr_delay > 120. Rows with NA arr_delay are dropped here
# too, so this result and the arr_delay <= 120 result together do not cover
# every row of flights.
filter(flights, !(arr_delay <= 120))
## # A tibble: 10,034 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      811            630       101     1047
##  2  2013     1     1      848           1835       853     1001
##  3  2013     1     1      957            733       144     1056
##  4  2013     1     1     1114            900       134     1447
##  5  2013     1     1     1505           1310       115     1638
##  6  2013     1     1     1525           1340       105     1831
##  7  2013     1     1     1549           1445        64     1912
##  8  2013     1     1     1558           1359       119     1718
##  9  2013     1     1     1732           1630        62     2028
## 10  2013     1     1     1803           1620       103     2008
## # ... with 10,024 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Missing values must be tested with is.na(); a comparison such as
# tailnum == NA evaluates to NA and would never select a row.
filter(flights, is.na(tailnum))
## # A tibble: 2,512 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     2       NA           1545        NA       NA
##  2  2013     1     2       NA           1601        NA       NA
##  3  2013     1     3       NA            857        NA       NA
##  4  2013     1     3       NA            645        NA       NA
##  5  2013     1     4       NA            845        NA       NA
##  6  2013     1     4       NA           1830        NA       NA
##  7  2013     1     5       NA            840        NA       NA
##  8  2013     1     7       NA            820        NA       NA
##  9  2013     1     8       NA           1645        NA       NA
## 10  2013     1     9       NA            755        NA       NA
## # ... with 2,502 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Using a comma is the same as using &

# Conditions given as separate arguments are ANDed together.
filter(flights, month == 12, day == 25)
## # A tibble: 719 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12    25      456            500        -4      649
##  2  2013    12    25      524            515         9      805
##  3  2013    12    25      542            540         2      832
##  4  2013    12    25      546            550        -4     1022
##  5  2013    12    25      556            600        -4      730
##  6  2013    12    25      557            600        -3      743
##  7  2013    12    25      557            600        -3      818
##  8  2013    12    25      559            600        -1      855
##  9  2013    12    25      559            600        -1      849
## 10  2013    12    25      600            600         0      850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# The same filter written as a single condition with an explicit &.
filter(flights, month == 12 & day == 25)
## # A tibble: 719 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12    25      456            500        -4      649
##  2  2013    12    25      524            515         9      805
##  3  2013    12    25      542            540         2      832
##  4  2013    12    25      546            550        -4     1022
##  5  2013    12    25      556            600        -4      730
##  6  2013    12    25      557            600        -3      743
##  7  2013    12    25      557            600        -3      818
##  8  2013    12    25      559            600        -1      855
##  9  2013    12    25      559            600        -1      849
## 10  2013    12    25      600            600         0      850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Use %in% as a shortcut for several == comparisons joined by |

# Long form: one == comparison per allowed value, joined with | (OR).
filter(flights, month == 11 | month == 12)
## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     1        5           2359         6      352
##  2  2013    11     1       35           2250       105      123
##  3  2013    11     1      455            500        -5      641
##  4  2013    11     1      539            545        -6      856
##  5  2013    11     1      542            545        -3      831
##  6  2013    11     1      549            600       -11      912
##  7  2013    11     1      550            600       -10      705
##  8  2013    11     1      554            600        -6      659
##  9  2013    11     1      554            600        -6      826
## 10  2013    11     1      554            600        -6      749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Short form: %in% tests membership in the vector of allowed values.
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     1        5           2359         6      352
##  2  2013    11     1       35           2250       105      123
##  3  2013    11     1      455            500        -5      641
##  4  2013    11     1      539            545        -6      856
##  5  2013    11     1      542            545        -3      831
##  6  2013    11     1      549            600       -11      912
##  7  2013    11     1      550            600       -10      705
##  8  2013    11     1      554            600        -6      659
##  9  2013    11     1      554            600        -6      826
## 10  2013    11     1      554            600        -6      749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

arrange: reorder data

# Sort ascending by dep_delay; arr_delay breaks ties. Rows with missing values
# in the sort columns are placed at the end.
arrange(flights, dep_delay, arr_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12     7     2040           2123       -43       40
##  2  2013     2     3     2022           2055       -33     2240
##  3  2013    11    10     1408           1440       -32     1549
##  4  2013     1    11     1900           1930       -30     2233
##  5  2013     1    29     1703           1730       -27     1947
##  6  2013     8     9      729            755       -26     1002
##  7  2013     3    30     2030           2055       -25     2213
##  8  2013    10    23     1907           1932       -25     2143
##  9  2013     5     5      934            958       -24     1225
## 10  2013     9    18     1631           1655       -24     1812
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# desc() reverses the order for that column: largest dep_delay first.
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     6    15     1432           1935      1137     1607
##  3  2013     1    10     1121           1635      1126     1239
##  4  2013     9    20     1139           1845      1014     1457
##  5  2013     7    22      845           1600      1005     1044
##  6  2013     4    10     1100           1900       960     1342
##  7  2013     3    17     2321            810       911      135
##  8  2013     6    27      959           1900       899     1236
##  9  2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

select: pick variables

# Select columns by listing their names.
select(flights, year, month, day)
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
# Select a contiguous run of columns, from year through day.
select(flights, year:day)
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
# A leading minus drops the columns instead: everything except year through day.
select(flights, -(year:day))
## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1      517            515         2      830            819        11
##  2      533            529         4      850            830        20
##  3      542            540         2      923            850        33
##  4      544            545        -1     1004           1022       -18
##  5      554            600        -6      812            837       -25
##  6      554            558        -4      740            728        12
##  7      555            600        -5      913            854        19
##  8      557            600        -3      709            723       -14
##  9      557            600        -3      838            846        -8
## 10      558            600        -2      753            745         8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Selection helper: every column whose name ends in "time".
select(flights, ends_with("time"))
## # A tibble: 336,776 x 5
##    dep_time sched_dep_time arr_time sched_arr_time air_time
##       <int>          <int>    <int>          <int>    <dbl>
##  1      517            515      830            819      227
##  2      533            529      850            830      227
##  3      542            540      923            850      160
##  4      544            545     1004           1022      183
##  5      554            600      812            837      116
##  6      554            558      740            728      150
##  7      555            600      913            854      158
##  8      557            600      709            723       53
##  9      557            600      838            846      140
## 10      558            600      753            745      138
## # ... with 336,766 more rows
# Names and helpers can be combined; columns appear in the order they are matched.
select(flights, c(carrier, ends_with("time"), contains("delay")))
## # A tibble: 336,776 x 8
##    carrier dep_time sched_dep_time arr_time sched_arr_time air_time
##    <chr>      <int>          <int>    <int>          <int>    <dbl>
##  1 UA           517            515      830            819      227
##  2 UA           533            529      850            830      227
##  3 AA           542            540      923            850      160
##  4 B6           544            545     1004           1022      183
##  5 DL           554            600      812            837      116
##  6 UA           554            558      740            728      150
##  7 B6           555            600      913            854      158
##  8 EV           557            600      709            723       53
##  9 B6           557            600      838            846      140
## 10 AA           558            600      753            745      138
## # ... with 336,766 more rows, and 2 more variables: dep_delay <dbl>,
## #   arr_delay <dbl>
# Move two columns to the front; everything() then keeps all remaining columns.
select(flights, time_hour, air_time, everything())
## # A tibble: 336,776 x 19
##    time_hour           air_time  year month   day dep_time sched_dep_time
##    <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
##  1 2013-01-01 05:00:00      227  2013     1     1      517            515
##  2 2013-01-01 05:00:00      227  2013     1     1      533            529
##  3 2013-01-01 05:00:00      160  2013     1     1      542            540
##  4 2013-01-01 05:00:00      183  2013     1     1      544            545
##  5 2013-01-01 06:00:00      116  2013     1     1      554            600
##  6 2013-01-01 05:00:00      150  2013     1     1      554            558
##  7 2013-01-01 06:00:00      158  2013     1     1      555            600
##  8 2013-01-01 06:00:00       53  2013     1     1      557            600
##  9 2013-01-01 06:00:00      140  2013     1     1      557            600
## 10 2013-01-01 06:00:00      138  2013     1     1      558            600
## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## #   hour <dbl>, minute <dbl>
# rename() uses new_name = old_name and keeps every other column unchanged.
rename(flights, departure_delay = dep_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time departure_delay arr_time
##    <int> <int> <int>    <int>          <int>           <dbl>    <int>
##  1  2013     1     1      517            515               2      830
##  2  2013     1     1      533            529               4      850
##  3  2013     1     1      542            540               2      923
##  4  2013     1     1      544            545              -1     1004
##  5  2013     1     1      554            600              -6      812
##  6  2013     1     1      554            558              -4      740
##  7  2013     1     1      555            600              -5      913
##  8  2013     1     1      557            600              -3      709
##  9  2013     1     1      557            600              -3      838
## 10  2013     1     1      558            600              -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

mutate: create new variables

flights_sml <- select(flights,
                      year:day,
                      ends_with("delay"),
                      distance,
                      air_time
)
flights_sml
## # A tibble: 336,776 x 7
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1         2        11     1400      227
##  2  2013     1     1         4        20     1416      227
##  3  2013     1     1         2        33     1089      160
##  4  2013     1     1        -1       -18     1576      183
##  5  2013     1     1        -6       -25      762      116
##  6  2013     1     1        -4        12      719      150
##  7  2013     1     1        -5        19     1065      158
##  8  2013     1     1        -3       -14      229       53
##  9  2013     1     1        -3        -8      944      140
## 10  2013     1     1        -2         8      733      138
## # ... with 336,766 more rows
mutate(flights_sml,
       gain = arr_delay - dep_delay,
       hours = air_time / 60,
       gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 10
##     year month   day dep_delay arr_delay distance air_time  gain hours
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227     9 3.78 
##  2  2013     1     1         4        20     1416      227    16 3.78 
##  3  2013     1     1         2        33     1089      160    31 2.67 
##  4  2013     1     1        -1       -18     1576      183   -17 3.05 
##  5  2013     1     1        -6       -25      762      116   -19 1.93 
##  6  2013     1     1        -4        12      719      150    16 2.5  
##  7  2013     1     1        -5        19     1065      158    24 2.63 
##  8  2013     1     1        -3       -14      229       53   -11 0.883
##  9  2013     1     1        -3        -8      944      140    -5 2.33 
## 10  2013     1     1        -2         8      733      138    10 2.3  
## # ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
transmute(flights,
          gain = arr_delay - dep_delay,
          hours = air_time / 60,
          gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 3
##     gain hours gain_per_hour
##    <dbl> <dbl>         <dbl>
##  1     9 3.78           2.38
##  2    16 3.78           4.23
##  3    31 2.67          11.6 
##  4   -17 3.05          -5.57
##  5   -19 1.93          -9.83
##  6    16 2.5            6.4 
##  7    24 2.63           9.11
##  8   -11 0.883        -12.5 
##  9    -5 2.33          -2.14
## 10    10 2.3            4.35
## # ... with 336,766 more rows
transmute(flights,
          normalized_delay = dep_delay / (mean(dep_delay, na.rm = TRUE)))
## # A tibble: 336,776 x 1
##    normalized_delay
##               <dbl>
##  1           0.158 
##  2           0.316 
##  3           0.158 
##  4          -0.0791
##  5          -0.475 
##  6          -0.316 
##  7          -0.396 
##  8          -0.237 
##  9          -0.237 
## 10          -0.158 
## # ... with 336,766 more rows
transmute(flights,
          log_air_time = log2(air_time),
          exp_delay = exp(dep_delay))
## # A tibble: 336,776 x 2
##    log_air_time exp_delay
##           <dbl>     <dbl>
##  1         7.83   7.39   
##  2         7.83  54.6    
##  3         7.32   7.39   
##  4         7.52   0.368  
##  5         6.86   0.00248
##  6         7.23   0.0183 
##  7         7.30   0.00674
##  8         5.73   0.0498 
##  9         7.13   0.0498 
## 10         7.11   0.135  
## # ... with 336,766 more rows
transmute(flights,
          arr_delay = arr_delay,
          bucket = ntile(arr_delay, 10))
## # A tibble: 336,776 x 2
##    arr_delay bucket
##        <dbl>  <int>
##  1        11      8
##  2        20      8
##  3        33      9
##  4       -18      3
##  5       -25      2
##  6        12      8
##  7        19      8
##  8       -14      3
##  9        -8      5
## 10         8      7
## # ... with 336,766 more rows

summarize: summarize data by functions of choice

summarize(flights,
          dep_delay_mean = mean(dep_delay, na.rm = TRUE),
          dep_delay_sd = sd(dep_delay, na.rm = TRUE),
          n = n())
## # A tibble: 1 x 3
##   dep_delay_mean dep_delay_sd      n
##            <dbl>        <dbl>  <int>
## 1           12.6         40.2 336776

group_by: group data by categorical levels

by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows
# Which carrier had the largest mean departure delay? Smallest?
by_carrier <- group_by(flights, carrier)
summarise(by_carrier, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 16 x 2
##    carrier delay
##    <chr>   <dbl>
##  1 9E      16.7 
##  2 AA       8.59
##  3 AS       5.80
##  4 B6      13.0 
##  5 DL       9.26
##  6 EV      20.0 
##  7 F9      20.2 
##  8 FL      18.7 
##  9 HA       4.90
## 10 MQ      10.6 
## 11 OO      12.6 
## 12 UA      12.1 
## 13 US       3.78
## 14 VX      12.9 
## 15 WN      17.7 
## 16 YV      19.0
# Which carrier had the largest difference between their max and min departure delay?
summarise(by_carrier,
          max = max(dep_delay, na.rm = TRUE),
          min = min(dep_delay, na.rm = TRUE),
          delta = max - min)
## # A tibble: 16 x 4
##    carrier   max   min delta
##    <chr>   <dbl> <dbl> <dbl>
##  1 9E        747   -24   771
##  2 AA       1014   -24  1038
##  3 AS        225   -21   246
##  4 B6        502   -43   545
##  5 DL        960   -33   993
##  6 EV        548   -32   580
##  7 F9        853   -27   880
##  8 FL        602   -22   624
##  9 HA       1301   -16  1317
## 10 MQ       1137   -26  1163
## 11 OO        154   -14   168
## 12 UA        483   -20   503
## 13 US        500   -19   519
## 14 VX        653   -20   673
## 15 WN        471   -13   484
## 16 YV        387   -16   403
# Which month has the largest spread (standard deviation) of arrival delays?
by_month <- group_by(flights, month)
summarise(by_month, delay = sd(arr_delay, na.rm = TRUE))
## # A tibble: 12 x 2
##    month delay
##    <int> <dbl>
##  1     1  40.4
##  2     2  39.5
##  3     3  44.1
##  4     4  47.5
##  5     5  44.2
##  6     6  56.1
##  7     7  57.1
##  8     8  42.6
##  9     9  39.7
## 10    10  32.6
## 11    11  31.4
## 12    12  46.1

%>% (Pipe) Operator

# Mean arrival delay and number of flights per aircraft (tail number),
# listed from the highest mean delay downwards. Flights without a tail
# number are dropped first.
flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  ) %>%
  arrange(desc(delay))
## # A tibble: 4,043 x 3
##    tailnum delay     n
##    <chr>   <dbl> <int>
##  1 N844MH   320      1
##  2 N911DA   294      1
##  3 N922EV   276      1
##  4 N587NW   264      1
##  5 N851NW   219      1
##  6 N928DN   201      1
##  7 N7715E   188      1
##  8 N654UA   185      1
##  9 N665MQ   175.     6
## 10 N427SW   157      1
## # ... with 4,033 more rows
# For every carrier, rank its months by worst (maximum) arrival delay,
# then average those ranks per month across carriers.
flights %>%
  group_by(carrier, month) %>%
  summarise(max_delay = max(arr_delay, na.rm = TRUE)) %>%
  # summarise() peels off the last grouping level (month), so the result is
  # still grouped by carrier and rank() runs within each carrier;
  # rank 1 = the month with that carrier's largest maximum delay.
  mutate(rank_delay = rank(desc(max_delay))) %>%
  group_by(month) %>%
  summarise(avg_rank = mean(rank_delay)) %>%
  arrange(desc(avg_rank))
## # A tibble: 12 x 2
##    month avg_rank
##    <int>    <dbl>
##  1    10     9.2 
##  2     2     7.8 
##  3    11     7.56
##  4     8     6.72
##  5     9     6.59
##  6    12     6.47
##  7     3     6.33
##  8     5     6.27
##  9     1     6.06
## 10     4     5.8 
## 11     6     4.44
## 12     7     3.67

Joins

  1. Mutating Joins - Adding Variables
x <- tribble(
  ~key, ~val_x,
  1, "x1",
  2, "x2",
  3, "x3"
)

x
## # A tibble: 3 x 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2   
## 3     3 x3
y <- tribble(
  ~key, ~val_y,
  1, "y1",
  2, "y2",
  4, "y3"
)

y
## # A tibble: 3 x 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2   
## 3     4 y3

Inner Join

x %>% 
  inner_join(y, by="key")
## # A tibble: 2 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2

Left Join

x %>% 
  left_join(y, by="key")
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>

Right Join

x %>% 
  right_join(y, by="key")
## # A tibble: 3 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     4 <NA>  y3

Full Join

x %>% 
  full_join(y, by="key")
## # A tibble: 4 x 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## 4     4 <NA>  y3

What if key names do not match?

x <- tribble(
    ~key1, ~val_x,
    1, "x1",
    2, "x2",
    3, "x3"
  )
y <- tribble(
  ~key2, ~val_y,
  1, "y1",
  2, "y2",
  4, "y3"
)

x %>% 
  inner_join(y, by = c("key1" = "key2"))
## # A tibble: 2 x 3
##    key1 val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2
  2. Filtering Joins - Affects observations rather than adding variables

semi join: keeps all observations in x that have a match in y

x %>% 
  semi_join(y, by = c("key1" = "key2"))
## # A tibble: 2 x 2
##    key1 val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2

anti join: drops all observations in x that have a match in y

x %>% 
  anti_join(y, by = c("key1" = "key2"))
## # A tibble: 1 x 2
##    key1 val_x
##   <dbl> <chr>
## 1     3 x3

Visualization (ggplot2)

Drawing the Canvas

library(ggplot2)
ggplot(data = mpg)

ggplot(data = mpg, aes(x = displ, y = hwy))

Geoms:

#aes - aesthetics
ggplot(data = mpg, aes(x = hwy)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = mpg, aes(x = hwy)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = mpg, aes(x = hwy)) +
  geom_density()

ggplot(data = mpg, aes(x = class)) +
  geom_bar()

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point()

ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_violin()

Non-Mapping Aesthetics - color, size, shape, opacity

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = .5)

color inside geom_point - colors all the points blue

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue")

color inside aes() - maps the class variable to color and adds a legend

ggplot(data = mpg, aes(x = displ, y = hwy, color = class)) +
  geom_point()

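A common error - color = "blue" inside aes(): the string is treated as a data value, so the points are not drawn blue and a legend labelled "blue" appears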

ggplot(data = mpg, aes(x = displ, y = hwy, color = "blue")) +
  geom_point()

Facets

facet_wrap: primarily used to create small multiples based on a single variable

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~ class, nrow = 2)

facet_grid: primarily used to create a small multiples grid based on two variables

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

Overplotting - Multiple Geoms

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()

aes inside geom_point - ‘color = class’ applicable to only the geom_point

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()

aes inside ggplot - ‘color = class’ applicable to both geoms

ggplot(data = mpg, aes(x = displ, y = hwy, color = class)) +
  geom_point() +
  geom_smooth()

ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class == "2seater")) +
  geom_smooth(data = filter(mpg, class == "2seater"), se = FALSE) +
  geom_smooth(data = filter(mpg, class != "2seater"), se = FALSE)

Positioning - Bar chart

ggplot(data = mpg, aes(class, color = factor(year))) +
  geom_bar()

ggplot(data = mpg, aes(class, fill = factor(year))) +
  geom_bar()

ggplot(data = mpg, aes(class, fill = factor(year))) +
  geom_bar(position = "fill")

ggplot(data = mpg, aes(class, fill = factor(year))) +
  geom_bar(position = "dodge")

Coordinate System

Flipping the Coordinate System

# top
ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# bottom
ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  coord_flip()

Zooming in or out

ggplot(data = mpg, aes(x = displ, y = cty)) +
  geom_jitter() +
  coord_cartesian(xlim = c(4, 7), ylim = c(10, 20))

Formatting Axes and labels

# Scatter plot of median sale price against total sales volume for the
# txhousing data, with a log-scaled x axis and human-readable axis labels.
ggplot(data = txhousing, aes(x = volume, y = median)) +
  geom_point(alpha = .25) +
  scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
  scale_x_log10(name = "Total Sales Volume", labels = scales::comma) +
  ggtitle(
    "Texas Housing Sales",
    # txhousing spans 2000-2015, not 2000-2010
    subtitle = "Sales data from 2000-2015 provided by the TAMU real estate center"
  )

Pairwise Plots

# library(GGally)
ggpairs(mtcars)

Control Statements & Iteration

if-else

# Classify a single value against the tolerance band [0, 10).
x <- 7
if (x >= 10) {
  print("x exceeds acceptable tolerance levels")
} else if (x >= 0 && x < 10) {
  # `&&` is the scalar, short-circuiting operator intended for `if`
  # conditions; `&` is the elementwise operator for vectors.
  print("x is within acceptable tolerance levels")
} else {
  print("x is negative")
}
## [1] "x is within acceptable tolerance levels"

for loop

# Print one message per year by iterating over the values themselves.
years <- 2010:2017
for (yr in years) {
  output <- paste("The year is", yr)
  print(output)
}
## [1] "The year is 2010"
## [1] "The year is 2011"
## [1] "The year is 2012"
## [1] "The year is 2013"
## [1] "The year is 2014"
## [1] "The year is 2015"
## [1] "The year is 2016"
## [1] "The year is 2017"
# Store the messages instead of printing them: allocate the full-length
# character vector up front, then fill it position by position.
result <- character(length(years))
for (i in seq_along(years)) {
  result[i] <- paste("The year is", years[i])
}

result
## [1] "The year is 2010" "The year is 2011" "The year is 2012"
## [4] "The year is 2013" "The year is 2014" "The year is 2015"
## [7] "The year is 2016" "The year is 2017"
# Classify every element of x against the tolerance band [0, 10) and
# collect the labels in a pre-allocated character vector.
x <- c(-1, 7, 8, 11)
tolerance <- vector(mode = "character",
                    length = length(x))
for (i in seq_along(x)) {
  if (x[i] >= 10) {
    value <- "x exceeds acceptable tolerance levels"
  } else if (x[i] >= 0 && x[i] < 10) {
    # x[i] is a single value, so use the scalar `&&` rather than the
    # elementwise `&` inside the `if` condition.
    value <- "x is within acceptable tolerance levels"
  } else {
    value <- "x is negative"
  }
  tolerance[i] <- value
}

tolerance
## [1] "x is negative"                          
## [2] "x is within acceptable tolerance levels"
## [3] "x is within acceptable tolerance levels"
## [4] "x exceeds acceptable tolerance levels"

break

# `break` leaves the loop entirely: nothing from 3 onwards is printed.
x <- seq_len(5)
for (val in x) {
  if (val == 3) {
    break
  }
  print(val)
}
## [1] 1
## [1] 2

next

# `next` skips only the current iteration: 3 is not printed, but the loop
# carries on with 4 and 5.
x <- seq_len(5)
for (val in x) {
  if (val == 3) {
    next
  }
  print(val)
}
## [1] 1
## [1] 2
## [1] 4
## [1] 5

Function

# Present value of a future amount.
#   FV: future value
#   r:  interest rate per period
#   n:  number of periods
# Returns FV discounted by (1 + r)^n, rounded to 2 decimal places.
pv <- function(FV, r, n) {
  discount_factor <- (1 + r)^n
  round(FV / discount_factor, 2)
}

pv(FV = 1000, r = .08, n = 5)
## [1] 680.58
# Present value of a future amount, with input validation.
#   FV: future value; must be an atomic, numeric vector
#   r:  interest rate per period; must be numeric
#   n:  number of periods; must be numeric (default 5)
# Returns FV discounted by (1 + r)^n, rounded to 2 decimal places.
# Stops with an error if FV is not atomic or if any input is not numeric.
pv <- function(FV, r, n = 5) {
  if (!is.atomic(FV)) {
    stop('FV must be an atomic vector')
  }
  # is.numeric() returns a single TRUE/FALSE, so combine the checks with the
  # scalar, short-circuiting `||` rather than the elementwise `|`.
  if (!is.numeric(FV) || !is.numeric(r) || !is.numeric(n)) {
    # class() can return several names (e.g. c("POSIXct", "POSIXt")); collapse
    # them so they are not glued together without a separator in the message.
    stop('This function only works for numeric inputs!\n',
         'You have provided objects of the following classes:\n',
         'FV: ', paste(class(FV), collapse = "/"), '\n',
         'r: ', paste(class(r), collapse = "/"), '\n',
         'n: ', paste(class(n), collapse = "/"))
  }
  present_value <- FV / (1 + r)^n
  round(present_value, 2)
}

# pv(FV = "1000", .08, n = 5)
# Error in pv(FV = "1000", 0.08, n = 5) : 
#This function only works for numeric inputs!
# You have provided objects of the following classes:
# FV: character
# r: numeric
# n: numeric

Apply Family

apply - can be used to apply a function to a matrix.

data <- matrix(c(1:10, 21:30), nrow = 5, ncol = 4)
data
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   21   26
## [2,]    2    7   22   27
## [3,]    3    8   23   28
## [4,]    4    9   24   29
## [5,]    5   10   25   30
# applies function to row (1) or column (2) of the matrix
apply(data, 1, mean)
## [1] 13.5 14.5 15.5 16.5 17.5

custom function in apply

apply(mtcars[,1:2], 2, function(x)(x-mean(x))/sd(x))
##                             mpg        cyl
## Mazda RX4            0.15088482 -0.1049878
## Mazda RX4 Wag        0.15088482 -0.1049878
## Datsun 710           0.44954345 -1.2248578
## Hornet 4 Drive       0.21725341 -0.1049878
## Hornet Sportabout   -0.23073453  1.0148821
## Valiant             -0.33028740 -0.1049878
## Duster 360          -0.96078893  1.0148821
## Merc 240D            0.71501778 -1.2248578
## Merc 230             0.44954345 -1.2248578
## Merc 280            -0.14777380 -0.1049878
## Merc 280C           -0.38006384 -0.1049878
## Merc 450SE          -0.61235388  1.0148821
## Merc 450SL          -0.46302456  1.0148821
## Merc 450SLC         -0.81145962  1.0148821
## Cadillac Fleetwood  -1.60788262  1.0148821
## Lincoln Continental -1.60788262  1.0148821
## Chrysler Imperial   -0.89442035  1.0148821
## Fiat 128             2.04238943 -1.2248578
## Honda Civic          1.71054652 -1.2248578
## Toyota Corolla       2.29127162 -1.2248578
## Toyota Corona        0.23384555 -1.2248578
## Dodge Challenger    -0.76168319  1.0148821
## AMC Javelin         -0.81145962  1.0148821
## Camaro Z28          -1.12671039  1.0148821
## Pontiac Firebird    -0.14777380  1.0148821
## Fiat X1-9            1.19619000 -1.2248578
## Porsche 914-2        0.98049211 -1.2248578
## Lotus Europa         1.71054652 -1.2248578
## Ford Pantera L      -0.71190675  1.0148821
## Ferrari Dino        -0.06481307 -0.1049878
## Maserati Bora       -0.84464392  1.0148821
## Volvo 142E           0.21725341 -1.2248578

lapply - similar to apply, but it takes a list as an input, and returns a list as the output.

data <- list(x = 1:5, y = 6:10, z = 11:15)
data
## $x
## [1] 1 2 3 4 5
## 
## $y
## [1]  6  7  8  9 10
## 
## $z
## [1] 11 12 13 14 15
# applies a function to each element in the list
lapply(data, FUN = median)
## $x
## [1] 3
## 
## $y
## [1] 8
## 
## $z
## [1] 13

sapply - same as lapply, but simplifies the result to a vector (or matrix) when possible instead of returning a list.

sapply(data, FUN = median)
##  x  y  z 
##  3  8 13

tapply - splits the array based on specified data, usually factor levels and then applies the function to it.

# group by cyl, then find mean wt in each group
tapply(mtcars$wt, mtcars$cyl, mean)
##        4        6        8 
## 2.285727 3.117143 3.999214

Miscellaneous

bind_rows()

one <- mtcars[1:4, 1:3]
two <- mtcars[11:14, 1:2]

print(one)
##                 mpg cyl disp
## Mazda RX4      21.0   6  160
## Mazda RX4 Wag  21.0   6  160
## Datsun 710     22.8   4  108
## Hornet 4 Drive 21.4   6  258
print(two)
##              mpg cyl
## Merc 280C   17.8   6
## Merc 450SE  16.4   8
## Merc 450SL  17.3   8
## Merc 450SLC 15.2   8
# bind_rows() works even when the data frames have different columns; missing values are filled with NA
bind_rows(one, two)
##    mpg cyl disp
## 1 21.0   6  160
## 2 21.0   6  160
## 3 22.8   4  108
## 4 21.4   6  258
## 5 17.8   6   NA
## 6 16.4   8   NA
## 7 17.3   8   NA
## 8 15.2   8   NA

bind_cols()

one <- mtcars[1:5, 1:2]
two <- mtcars[1:5, 3:4]

print(one)
##                    mpg cyl
## Mazda RX4         21.0   6
## Mazda RX4 Wag     21.0   6
## Datsun 710        22.8   4
## Hornet 4 Drive    21.4   6
## Hornet Sportabout 18.7   8
print(two)
##                   disp  hp
## Mazda RX4          160 110
## Mazda RX4 Wag      160 110
## Datsun 710         108  93
## Hornet 4 Drive     258 110
## Hornet Sportabout  360 175
# You can bind dataframes by columns
bind_cols(one, two)
##    mpg cyl disp  hp
## 1 21.0   6  160 110
## 2 21.0   6  160 110
## 3 22.8   4  108  93
## 4 21.4   6  258 110
## 5 18.7   8  360 175

remove duplicates

one <- mtcars[1:4, 1:7]
df <- bind_rows(one,one)

df[!duplicated(df), ]
##    mpg cyl disp  hp drat    wt  qsec
## 1 21.0   6  160 110 3.90 2.620 16.46
## 2 21.0   6  160 110 3.90 2.875 17.02
## 3 22.8   4  108  93 3.85 2.320 18.61
## 4 21.4   6  258 110 3.08 3.215 19.44

“.” operator usage

mtcars %>% 
  mutate(cars = rownames(.))
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb                cars
## 1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4           Mazda RX4
## 2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4       Mazda RX4 Wag
## 3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          Datsun 710
## 4  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1      Hornet 4 Drive
## 5  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2   Hornet Sportabout
## 6  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1             Valiant
## 7  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4          Duster 360
## 8  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2           Merc 240D
## 9  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2            Merc 230
## 10 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4            Merc 280
## 11 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4           Merc 280C
## 12 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3          Merc 450SE
## 13 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3          Merc 450SL
## 14 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3         Merc 450SLC
## 15 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4  Cadillac Fleetwood
## 16 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4 Lincoln Continental
## 17 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4   Chrysler Imperial
## 18 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1            Fiat 128
## 19 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2         Honda Civic
## 20 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1      Toyota Corolla
## 21 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1       Toyota Corona
## 22 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2    Dodge Challenger
## 23 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2         AMC Javelin
## 24 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4          Camaro Z28
## 25 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2    Pontiac Firebird
## 26 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1           Fiat X1-9
## 27 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2       Porsche 914-2
## 28 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2        Lotus Europa
## 29 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4      Ford Pantera L
## 30 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6        Ferrari Dino
## 31 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8       Maserati Bora
## 32 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2          Volvo 142E

accessing columns having special characters using backticks (``)

model <- lm(mpg~disp,mtcars)
coeff <- as.data.frame(summary(model)$coefficients)

coeff
##                Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept) 29.59985476 1.229719515 24.070411 3.576586e-21
## disp        -0.04121512 0.004711833 -8.747152 9.380327e-10
coeff$`Pr(>|t|)`
## [1] 3.576586e-21 9.380327e-10

prevent a single-column selection from a data frame from being simplified to a vector (drop = FALSE)

df1 <- mtcars[,1]
print(class(df1))
## [1] "numeric"
print(df1)
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
## [15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
## [29] 15.8 19.7 15.0 21.4
df2 <- mtcars[,1,drop=FALSE]
print(class(df2))
## [1] "data.frame"
print(df2)
##                      mpg
## Mazda RX4           21.0
## Mazda RX4 Wag       21.0
## Datsun 710          22.8
## Hornet 4 Drive      21.4
## Hornet Sportabout   18.7
## Valiant             18.1
## Duster 360          14.3
## Merc 240D           24.4
## Merc 230            22.8
## Merc 280            19.2
## Merc 280C           17.8
## Merc 450SE          16.4
## Merc 450SL          17.3
## Merc 450SLC         15.2
## Cadillac Fleetwood  10.4
## Lincoln Continental 10.4
## Chrysler Imperial   14.7
## Fiat 128            32.4
## Honda Civic         30.4
## Toyota Corolla      33.9
## Toyota Corona       21.5
## Dodge Challenger    15.5
## AMC Javelin         15.2
## Camaro Z28          13.3
## Pontiac Firebird    19.2
## Fiat X1-9           27.3
## Porsche 914-2       26.0
## Lotus Europa        30.4
## Ford Pantera L      15.8
## Ferrari Dino        19.7
## Maserati Bora       15.0
## Volvo 142E          21.4