1. Conditionals and Control Flow

# Daily profile views over one week, one vector per platform.
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# Stack the two weeks row-wise: row 1 is linkedin, row 2 is facebook.
# deparse.level = 0 keeps the matrix free of row names.
views <- rbind(linkedin, facebook, deparse.level = 0)

# Relational operators are vectorised, so they apply element-wise to a matrix.
views > 15
##      [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]
## [1,] TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [2,] TRUE FALSE FALSE  TRUE FALSE FALSE FALSE

# Day-by-day comparison of the two platforms.
linkedin > facebook
## [1] FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE

# `|` and `&` combine logical vectors element-wise.
linkedin > 15 | linkedin <= 5
## [1]  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE
linkedin > 10 & facebook > 10
## [1]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE

# Selecting one row drops the matrix down to a plain vector.
views[2, ]
## [1] 17  7  5 16  8 13 14
views[2, ] > 15
## [1]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE

# TRUE counts as 1, so sum() counts the days above 15 views.
sum(views[2, ] > 15)
## [1] 2
f <- views[2, ] > 15

3. Functions

# Open the documentation for mean(); both forms do the same thing.
help(mean)
?mean
# Show the formal arguments of mean().
args(mean)
## function (x, ...) 
## NULL

# Weekly views with one missing observation.
linkedin <- c(16, 9, 13, 5, NA, 17, 14)
# A single NA makes the mean NA unless it is removed explicitly.
mean(linkedin)
## [1] NA
mean(linkedin, na.rm = TRUE)
## [1] 12.33333
# trim = 0.2 discards 20% of the sorted non-missing values at each end
# (here one value per side) before averaging.
mean(linkedin, na.rm = TRUE, trim = 0.2)
## [1] 13

(cf) mean() arg : trim Trim을 사용하면 양 극단에서 일정부분을 빼고 계산. 0.3일 경우 양 끝에서 전체 데이터의 0.3 만큼을 제외한다. 이 경우에 전체데이터가 10개이므로 양 끝에서 3개씩을 뺀다. 즉 정렬된 벡터 (−21, −5, 2, 3, 4.2, 7, 8, 12, 18, 54) 에서 (−21,−5,2) (12,18,54) 을 뺀 평균을 계산

Function paste()

speed <- 31
# paste() converts its arguments to character and joins them with a space.
print(paste("Your speed is", speed))
## [1] "Your speed is 31"

4. The Apply Family

# "NAME:YEAR" strings for four mathematicians.
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")

# Break every string at the literal colon; the result is a list with one
# character vector (name, year) per pioneer.
split_math <- strsplit(pioneers, ":", fixed = TRUE)

# Lower-case every element of every vector in the list.
split_low <- lapply(split_math, FUN = tolower)
split_low
## [[1]]
## [1] "gauss" "1777" 
## 
## [[2]]
## [1] "bayes" "1702" 
## 
## [[3]]
## [1] "pascal" "1623"  
## 
## [[4]]
## [1] "pearson" "1857"
# Return the first element of `x`, keeping the container type
# (single-bracket subsetting).
select_first <- function(x) x[1L]
# Return the second element of `x`, keeping the container type
# (single-bracket subsetting).
select_second <- function(x) x[2L]
# Apply the helpers to every (name, year) vector; lapply() always returns a list.
# NOTE(review): the variable `names` shares its name with base::names().
names <- lapply(split_low, select_first)
years <- lapply(split_low, select_second)
names
## [[1]]
## [1] "gauss"
## 
## [[2]]
## [1] "bayes"
## 
## [[3]]
## [1] "pascal"
## 
## [[4]]
## [1] "pearson"
years
## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"

Anonymous functions

# Same extraction as with named helpers, but the function is defined inline.
names <- lapply(split_low, function(x) x[1])
years <- lapply(split_low, function(x) x[2])

Use lapply with additional arguments

# Return the element(s) of `x` at position(s) `index`, using single-bracket
# subsetting so the container type of `x` is preserved.
select_el <- function(x, index) x[index]
# Arguments given after the function are forwarded to every select_el() call.
names <- lapply(split_low, select_el, index = 1)
years <- lapply(split_low, select_el, index = 2)

sapply()

cities <- c("New York", "Paris", "London")
# lapply() returns a list, so unlist() is needed to get a plain vector.
unlist(lapply(cities, nchar))
## [1] 8 5 6
# sapply() simplifies to a vector and, for character input, names the result.
sapply(cities, nchar)
## New York    Paris   London 
##        8        5        6
# USE.NAMES = FALSE drops those names.
sapply(cities, nchar, USE.NAMES = FALSE)
## [1] 8 5 6
# Three series of five temperature readings each.
temp <- list(c(3,7,9,6,-1), c(6,9,12,13,5), c(4,8,3,-1,-3))
lapply(temp, min)
## [[1]]
## [1] -1
## 
## [[2]]
## [1] 5
## 
## [[3]]
## [1] -3
sapply(temp, max)
## [1]  9 13  8
# Midpoint between the smallest and the largest value of `x`.
extremes_avg <- function(x) {
  bounds <- range(x)
  (bounds[1] + bounds[2]) / 2
}
# One midpoint per temperature series, simplified to a numeric vector.
sapply(temp, extremes_avg)
## [1] 4.0 9.0 2.5

sapply() with function returning vector

# Smallest and largest value of `x` as a vector named "min" and "max".
extremes <- function(x) {
  bounds <- range(x)
  names(bounds) <- c("min", "max")
  bounds
}
# Every call returns a named vector of length 2, so sapply() binds the
# results into a matrix with one column per list element.
sapply(temp, extremes)
##     [,1] [,2] [,3]
## min   -1    5   -3
## max    9   13    8
# lapply() keeps the same results as a list of named vectors.
lapply(temp, extremes)
## [[1]]
## min max 
##  -1   9 
## 
## [[2]]
## min max 
##   5  13 
## 
## [[3]]
## min max 
##  -3   8

sapply() when the output cannot be simplified

# Keep only the strictly negative elements of `x`.
below_zero <- function(x) {
  is_negative <- x < 0
  x[is_negative]
}
# below_zero() returns vectors of different lengths (1, 0 and 2 here), so
# sapply() cannot simplify and hands back the same list that lapply() produces.
freezing_s <- sapply(temp, below_zero)
freezing_l <- lapply(temp, below_zero)
identical(freezing_s, freezing_l)
## [1] TRUE
freezing_s
## [[1]]
## [1] -1
## 
## [[2]]
## numeric(0)
## 
## [[3]]
## [1] -1 -3
freezing_l
## [[1]]
## [1] -1
## 
## [[2]]
## numeric(0)
## 
## [[3]]
## [1] -1 -3

Because the length of the output of below_zero() differs between input vectors, sapply() cannot convert the output of lapply() into a neatly formatted vector or matrix. Instead, sapply() returns exactly the same list as lapply(), as shown by the TRUE output of identical().

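sapply() behaves the same way when the applied function returns NULL for every element: nothing can be simplified, so it returns a list of NULLs, just as lapply() does. The example below shows the opposite case, where every call returns a named vector of the same length and sapply() simplifies the result to a matrix.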

# Summarise two random samples of ten uniform draws each. Every call returns
# three named values, so the result is a 3 x 2 matrix. The numbers below are
# from one particular run and change every time.
sapply(list(runif (10), runif (10)), 
       function(x) c(min = min(x), mean = mean(x), max = max(x)))
##           [,1]       [,2]
## min  0.3291673 0.06498558
## mean 0.6606805 0.50216997
## max  0.8661332 0.77227106

vapply()

# Minimum, mean and maximum of `x` as a vector named "min", "mean", "max".
basics <- function(x) {
  summary_values <- c(min(x), mean(x), max(x))
  names(summary_values) <- c("min", "mean", "max")
  summary_values
}
# vapply() requires a template for each result: here a numeric vector of
# length 3, matching what basics() returns.
vapply(temp, basics, numeric(3))
##      [,1] [,2] [,3]
## min  -1.0    5 -3.0
## mean  4.8    9  2.2
## max   9.0   13  8.0
# Minimum, mean, median and maximum of `x` as a vector named
# "min", "mean", "median", "max".
basics <- function(x) {
  summary_values <- c(min(x), mean(x), median(x), max(x))
  names(summary_values) <- c("min", "mean", "median", "max")
  summary_values
}
# basics() now returns four values, so the template must be numeric(4).
vapply(temp, basics, numeric(4))
##        [,1] [,2] [,3]
## min    -1.0    5 -3.0
## mean    4.8    9  2.2
## median  6.0    9  3.0
## max     9.0   13  8.0

# Return the smallest and largest character of `name` (spaces removed) as a
# character vector named "first" and "last". The ordering is whatever
# min()/max() use for character data.
first_and_last <- function(name) {
  chars <- strsplit(gsub(" ", "", name, fixed = TRUE), split = "")[[1]]
  c(first = min(chars), last = max(chars))
}
# Each call returns two named strings, so the result is a 2 x 3 character matrix.
# NOTE(review): "e" sorting before "Y" suggests these results were produced
# under a locale-aware collation; a C locale would order capitals first.
sapply(cities, first_and_last)
##       New York Paris London
## first "e"      "a"   "d"   
## last  "Y"      "s"   "o"
# vapply() gives the same matrix but checks every result against character(2).
vapply(cities, first_and_last, character(2))
##       New York Paris London
## first "e"      "a"   "d"   
## last  "Y"      "s"   "o"

5. Utilities

Mathematical utilities

errors <- c(1.9, -2.6, 4.0, -9.5, -3.4, 7.3)
# Round each error, take absolute values, then add them up.
sum(abs(round(errors)))
## [1] 29
vec1 <- c(1.5, 2.5, 8.4, 3.7, 6.3)
# rev() reverses the order of the elements.
vec2 <- rev(vec1)
# Mean of both vectors concatenated.
mean(c(vec1, vec2))
## [1] 4.48

Data Utilities

# Weekly views stored as lists (one element per day).
linkedin <- list(16, 9, 13, 5, 2, 17, 14)
facebook <- list(17, 7, 5, 16, 8, 13, 14)

# Flatten the lists into numeric vectors. as.vector() would hand a list back
# unchanged, so unlist() is the conversion that actually yields a vector.
li_vec <- unlist(linkedin)
fb_vec <- unlist(facebook)

social_vec <- append(li_vec, fb_vec) # Append fb_vec to li_vec: social_vec
sort(social_vec, decreasing = TRUE) # Sort social_vec
##  [1] 17 17 16 16 14 14 13 13  9  8  7  5  5  2

# seq() builds 1, 3, 5, 7; rep() repeats that whole sequence seven times.
rep(seq(1, 7, by = 2), times = 7)
##  [1] 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7

Regular Expressions

regex : Regular Expressions

animals <- c("cat", "moose", "impala", "ant", "kiwi")

# grepl() returns one logical per element; grep() returns the matching indices.
grepl(pattern = "a", x = animals) # [1]  TRUE FALSE  TRUE  TRUE FALSE
grep(pattern = "a", x = animals) # [1] 1 3 4

# "^" anchors the match to the start of the string.
grepl(pattern = "^a", x = animals) # [1] FALSE FALSE FALSE  TRUE FALSE
grep(pattern = "^a", x = animals) # [1] 4

# "$" anchors the match to the end; which() turns the logicals into indices.
grepl(pattern = "a$", x = animals) # [1] FALSE FALSE  TRUE FALSE FALSE
which(grepl(pattern = "a", x = animals)) # [1] 1 3 4

# sub() replaces only the first match in each string, gsub() replaces all of them.
sub(pattern = "a", replacement = "o", x = animals) # [1] "cot"    "moose"  "impola" "ont"    "kiwi"
gsub(pattern = "a", replacement = "o", x = animals) # [1] "cot"    "moose"  "impolo" "ont"    "kiwi"
# "|" means "or": the first "a" or "i" in each string is replaced.
sub(pattern = "a|i", replacement = "_", x = animals) # [1] "c_t"    "moose"  "_mpala" "_nt"    "k_wi"
  • .*: A usual suspect! It can be read as “any character that is matched zero or more times”.
  • \s: Match a whitespace character (a space, tab or newline). The “s” is normally a literal character; escaping it (\) makes it a metacharacter.
  • [0-9]+: Match the numbers 0 to 9, at least once (+).
  • ([0-9]+): The parentheses are used to make parts of the matching string available to define the replacement. The \1 in the replacement argument of sub() gets set to the string that is captured by the regular expression [0-9]+.
emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org", "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")

# Indices of addresses that contain "@" and end in ".edu". The "$" anchor
# matches the one used in sub() below; without it, an address such as
# "a@b.edu.org" would also be counted as a hit.
hits <- grep("@.*\\.edu$", x = emails)
emails[hits]
## [1] "john.doe@ivyleague.edu"   "quant@bigdatacollege.edu"

# Replace everything from "@" to the trailing ".edu" with the new domain;
# addresses that do not match are returned unchanged.
sub(pattern = "@.*\\.edu$", replacement = "@datacamp.edu", x = emails)
## [1] "john.doe@datacamp.edu"    "education@world.gov"     
## [3] "dalai.lama@peace.org"     "invalid.edu"             
## [5] "quant@datacamp.edu"       "cookie.monster@sesame.tv"
awards <- c("Won 1 Oscar.",
  "Won 1 Oscar. Another 9 wins & 24 nominations.",
  "1 win and 2 nominations.",
  "2 wins & 3 nominations.",
  "Nominated for 2 Golden Globes. 1 more win & 2 nominations.",
  "4 wins & 1 nomination.")

# Replace each whole string by the number captured just before "nomination";
# strings without a match (the first one) are returned unchanged.
sub(".*\\s([0-9]+)\\snomination.*$", "\\1", awards)
## [1] "Won 1 Oscar." "24"           "2"            "3"           
## [5] "2"            "1"

The ([0-9]+) captures the entire number that comes before the word “nomination” in the string, and the whole match is replaced by this number because the \1 in the replacement refers to the content captured inside the parentheses.

Times and Date

# Current date and date-time. The printed values below were recorded when
# these notes were run on 2017-05-30 and will differ on any other run.
my_date <- Sys.Date()
class(my_date)
## [1] "Date"
my_time <- Sys.time()
class(my_time)
## [1] "POSIXct" "POSIXt"
# ISO-formatted strings parse without a format; other layouts need one,
# e.g. as.Date("1971-14-05", format = "%Y-%d-%m").
my_date2 <- as.Date("1971-05-14")
my_date + 1 # days incremented by 1
## [1] "2017-05-31"
my_date2 <- as.Date("1998-08-29")
my_date2 - my_date
## Time difference of -6849 days
my_time + 1 # seconds incremented by 1
## [1] "2017-05-30 17:17:30 EDT"
# The time zone has to be supplied through `tz`. A zone name appended to the
# string is silently ignored and the time is read in the local zone instead.
my_time2 <- as.POSIXct("1974-07-14 21:11:55", tz = "CET")
my_time2 - my_time
## Time difference of -15661.05 days
# A Date is stored as the number of days since 1970-01-01.
unclass(my_date)
## [1] 17316

Create and format dates

  • %Y: 4-digit year (1982)
  • %y: 2-digit year (82)
  • %m: 2-digit month (01)
  • %d: 2-digit day of the month (13)
  • %A: weekday (Wednesday)
  • %a: abbreviated weekday (Wed)
  • %B: month (January)
  • %b: abbreviated month (Jan)
str1 <- "May 23, '96"
str2 <- "2012-03-15"
str3 <- "30/January/2006"

# The format string describes how each input is laid out, using the
# conversion symbols listed above.
# NOTE(review): month and weekday names are matched/printed according to the
# session locale; the outputs below assume an English locale.
date1 <- as.Date(str1, format = "%b %d, '%y")
date2 <- as.Date(str2, format = "%Y-%m-%d")
date3 <- as.Date(str3, format = "%d/%B/%Y")

# format() turns a Date back into a string using the same symbols.
format(date1, "%A")
## [1] "Thursday"
format(date2, "%d")
## [1] "15"
format(date3, "%b %Y")
## [1] "Jan 2006"

Create and format times

  • %H: hours as a decimal number (00-23)
  • %I: hours as a decimal number (01-12)
  • %M: minutes as a decimal number
  • %S: seconds as a decimal number
  • %T: shorthand notation for the typical format %H:%M:%S
  • %p: AM/PM indicator
  1. Using format(), create a string from time1 containing only the minutes.
  2. From time2, extract the hours and minutes as “hours:minutes AM/PM”. Refer to the assignment text above to find the correct conversion symbols
str1 <- "May 23, '96 hours:23 minutes:01 seconds:45"
str2 <- "2012-3-12 14:23:08"

# Convert the strings to POSIXct objects: time1, time2
# The argument must be spelled `format`; a misspelled name is swallowed by
# `...` and the format is never applied.
time1 <- as.POSIXct(str1, format = "%B %d, '%y hours:%H minutes:%M seconds:%S")
time2 <- as.POSIXct(str2, format = "%Y-%m-%d %H:%M:%S")

# Convert times to formatted strings
format(time1, "%M") # minutes only
## [1] "01"
format(time2, "%I:%M %p") # 12-hour clock with AM/PM indicator
## [1] "02:23 PM"

Calculations with Dates

# Dates on which pizza was eaten, parsed straight into Date objects.
pizza <- as.Date(
  c("2017-05-12", "2017-05-14", "2017-05-19", "2017-05-25", "2017-05-30")
)

# Gaps between consecutive dates, in days.
day_diff <- diff(pizza)
day_diff
## Time differences in days
## [1] 2 5 6 5

# Average gap between two pizza days.
mean(day_diff)
## Time difference of 4.5 days