What's Covered

Conditionals and Control Flow
- Relational Operators. == < >
- Logical Operators. and or not
- Conditional Statements. if else
Loops
- While loops
- For loops
- Breaks
Functions
- Documentation
- Required vs optional inputs
- Scoping
- Writing functions
- Loading packages
Apply statements
- lapply, sapply, vapply
Utilities
- Useful base functions
- Regular Expressions and grep/sub
- Times and Dates

Conditionals and Control Flow

Relational Operators

These are used to compare objects
They are common in any language and are a basis of programming

Equality

# Comparison of logicals
TRUE == FALSE

## [1] FALSE

# Comparison of numerics
-6 * 14 != 17 - 101

## [1] FALSE

# Comparison of character strings
"useR" == "user"

## [1] FALSE

# Compare a logical with a numeric
TRUE == 1

## [1] TRUE

Greater and less than

# Comparison of numerics
-6 * 5 + 2 >= -10 + 1

## [1] FALSE

# Comparison of character strings
"raining" <= "raining dogs"

## [1] TRUE

# Comparison of logicals
TRUE > FALSE

## [1] TRUE

Compare vectors

# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# Popular days
linkedin > 15

## [1]  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE

# Quiet days
linkedin <= 5

## [1] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE

# LinkedIn more popular than Facebook
linkedin > facebook

## [1] FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE

Compare matrices

linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

views <- matrix(c(linkedin, facebook), nrow = 2, byrow = TRUE)

# When does views equal 13?
views == 13

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]
## [1,] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE

# When is views less than or equal to 14?
views <= 14

##       [,1] [,2] [,3]  [,4] [,5]  [,6] [,7]
## [1,] FALSE TRUE TRUE  TRUE TRUE FALSE TRUE
## [2,] FALSE TRUE TRUE FALSE TRUE  TRUE TRUE

# How often does facebook equal or exceed linkedin times two?
sum(facebook >= linkedin * 2)

## [1] 2

Logical Operators

These are also super common and a basis of programming
They let us combine multiple relational statements

& and |

linkedin <- c(16, 9, 13, 5, 2, 17, 14)

last <- tail(linkedin, 1)

# Is last under 5 or above 10?
last < 5 | last > 10

## [1] TRUE

# Is last between 15 (exclusive) and 20 (inclusive)?
last > 15 & last <= 20

## [1] FALSE

# Is last between 0 and 5 or between 10 and 15?
(last > 0 & last < 5) | (last > 10 & last < 15)

## [1] TRUE

& and | (2)

linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# linkedin exceeds 10 but facebook below 10
linkedin > 10 & facebook < 10

## [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE

# When were one or both visited at least 12 times?
# ("at least" includes 12 itself, hence >= rather than >)
linkedin >= 12 | facebook >= 12

## [1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE

# When is views between 11 (exclusive) and 14 (inclusive)?
views > 11 & views <= 14

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6] [,7]
## [1,] FALSE FALSE  TRUE FALSE FALSE FALSE TRUE
## [2,] FALSE FALSE FALSE FALSE FALSE  TRUE TRUE

Reverse the result: !

!TRUE

## [1] FALSE

!(5 > 3)

## [1] FALSE

!!FALSE

## [1] FALSE

x <- 5
y <- 7
!(!(x < 4) & !!!(y > 12))

## [1] FALSE

Blend it all together

second <- c(3, 23, 18, 18, 25, 20, 17, 27, 6, 35, 17, 6, 1, 12, 15, 17, 12, 8)

# Build a logical vector, TRUE if value in second is extreme: extremes
extremes <- second > 25 | second < 5

# Count the number of TRUEs in extremes
sum(extremes)

## [1] 4

Conditional Statements

Used to execute statements based on result of relational statements
All of this is a basis of programming and carries over to other languages

The if statement

# Variables related to your last day of recordings
medium <- "LinkedIn"
num_views <- 14

# Examine the if statement for medium
if (medium == "LinkedIn") {
  print("Showing LinkedIn information")
}

## [1] "Showing LinkedIn information"

# Write the if statement for num_views
if (num_views > 15) {
  print("You're popular!")
}

And an else

# Variables related to your last day of recordings
medium <- "LinkedIn"
num_views <- 14

# Control structure for medium
if (medium == "LinkedIn") {
  print("Showing LinkedIn information")
} else {
  print("Unknown medium")
}

## [1] "Showing LinkedIn information"

# Control structure for num_views
if (num_views > 15) {
  print("You're popular!")
} else {
  print("Try to be more visible!")
}

## [1] "Try to be more visible!"

Customize further: else if

# Variables related to your last day of recordings
medium <- "LinkedIn"
num_views <- 14

# Control structure for medium
if (medium == "LinkedIn") {
  print("Showing LinkedIn information")
} else if (medium == "Facebook") {
  # Add code to print correct string when condition is TRUE
  print("Showing Facebook information")
} else {
  print("Unknown medium")
}

## [1] "Showing LinkedIn information"

# Control structure for num_views
# (&& is the scalar, short-circuiting AND meant for if conditions;
#  & is the element-wise version meant for whole vectors)
if (num_views > 15) {
  print("You're popular!")
} else if (num_views <= 15 && num_views > 10) {
  # Add code to print correct string when condition is TRUE
  print("Your number of views is average")
} else {
  print("Try to be more visible!")
}

## [1] "Your number of views is average"

Else if 2.0

numbers <- c(6, 100, 4, 2500)

for (number in numbers) {
  
  if (number < 10) {
    if (number < 5) {
      result <- "extra small"
    } else {
      result <- "small"
    }
  } else if (number < 100) {
    result <- "medium"
  } else {
    result <- "large"
  }
  
  print(result)
}

## [1] "small"
## [1] "large"
## [1] "extra small"
## [1] "large"

Take control!

# Variables related to your last day of recordings
li <- 15
fb <- 9

# Code the control-flow construct
# (&& is used because each side is a single TRUE/FALSE value)
if (li >= 15 && fb >= 15) {
  sms <- (li + fb) * 2
} else if (li < 10 && fb < 10) {
  sms <- (li + fb) / 2
} else {
  sms <- li + fb
}

# Print the resulting sms to the console
print(sms)

## [1] 24

Loops

While loop

it runs as long as the while condition is true

Write a while loop

# Initialize the speed variable
speed <- 64

# Code the while loop
while (speed > 30 ) {
  print('Slow down!')
  speed <- speed - 7
}

## [1] "Slow down!"
## [1] "Slow down!"
## [1] "Slow down!"
## [1] "Slow down!"
## [1] "Slow down!"

print(speed)

## [1] 29

Throw in more conditionals

# Initialize the speed variable
speed <- 64

# Extend/adapt the while loop
while (speed > 30) {
  print(paste("Your speed is",speed))
  if (speed > 48) {
    print("Slow down big time!")
    speed <- speed - 11
  } else {
    print("Slow down!")
    speed <- speed - 6
  }
}

## [1] "Your speed is 64"
## [1] "Slow down big time!"
## [1] "Your speed is 53"
## [1] "Slow down big time!"
## [1] "Your speed is 42"
## [1] "Slow down!"
## [1] "Your speed is 36"
## [1] "Slow down!"

Stop the while loop: break

# Initialize the speed variable
speed <- 88

while (speed > 30) {
  print(paste("Your speed is",speed))
  
  # Break the while loop when speed exceeds 80
  if (speed > 80) {
    break
  }
  
  if (speed > 48) {
    print("Slow down big time!")
    speed <- speed - 11
  } else {
    print("Slow down!")
    speed <- speed - 6
  }
}

## [1] "Your speed is 88"

Build a while loop from scratch

# Initialize i as 1 
i <- 1

# Code the while loop
while (i <= 10) {
  triple <- i * 3
  print(triple)
  
  if (triple %% 8 == 0) {
    break
  }
  i <- i + 1
}

## [1] 3
## [1] 6
## [1] 9
## [1] 12
## [1] 15
## [1] 18
## [1] 21
## [1] 24

For loop

Run once for each variable in the list
- This list can just be a sequence like 1:5 or variables in a list like a bunch of names
- This is so fundamental to programming in any language
However I actually never use loops in R
- I always modify data arrays with array functions. They are WAY faster
- The apply functions covered later are one way to do it.
- dplyr is the best way. All the looping happens in C code so it's super fast.
- If you are looping on an array in R, you should probably rethink your approach.

Loop over a vector

# The linkedin vector has already been defined for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)

# Loop version 1: iterate over the values themselves
for (views in linkedin) {
  print(views)
}

## [1] 16
## [1] 9
## [1] 13
## [1] 5
## [1] 2
## [1] 17
## [1] 14

# Loop version 2: iterate over the positions
# seq_along() is used instead of 1:length() because 1:0 counts
# backwards (1, 0) when the vector is empty
for (i in seq_along(linkedin)) {
  print(linkedin[i])
}

## [1] 16
## [1] 9
## [1] 13
## [1] 5
## [1] 2
## [1] 17
## [1] 14

Loop over a list

# The nyc list is already specified
nyc <- list(pop = 8405837, 
            boroughs = c("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island"), 
            capital = FALSE)

# Loop version 1: iterate over the list elements themselves
for (value in nyc) {
  print(value)
}

## [1] 8405837
## [1] "Manhattan"     "Bronx"         "Brooklyn"      "Queens"       
## [5] "Staten Island"
## [1] FALSE

# Loop version 2: iterate over the positions
# [[i]] extracts the element itself; seq_along() is safe for an empty list
for (i in seq_along(nyc)) {
  print(nyc[[i]])
}

## [1] 8405837
## [1] "Manhattan"     "Bronx"         "Brooklyn"      "Queens"       
## [5] "Staten Island"
## [1] FALSE

Loop over a matrix

# The tic-tac-toe matrix has already been defined for you
# (matrix() fills column by column, so the first three values form column 1)
ttt <- matrix(c("O", NA, "X", NA, "O", NA, "X", "O", "X"), nrow = 3, ncol = 3)

# define the double for loop: outer loop walks the rows, inner the columns
# seq_len() is used instead of 1:n so a zero-row/column matrix loops 0 times
for (i in seq_len(nrow(ttt))) {
  for (j in seq_len(ncol(ttt))) {
    print(paste(
      "On row", i, "and column", j, "the board contains", ttt[i, j]
    ))
  }
}

## [1] "On row 1 and column 1 the board contains O"
## [1] "On row 1 and column 2 the board contains NA"
## [1] "On row 1 and column 3 the board contains X"
## [1] "On row 2 and column 1 the board contains NA"
## [1] "On row 2 and column 2 the board contains O"
## [1] "On row 2 and column 3 the board contains O"
## [1] "On row 3 and column 1 the board contains X"
## [1] "On row 3 and column 2 the board contains NA"
## [1] "On row 3 and column 3 the board contains X"

Mix it up with control flow

# The linkedin vector has already been defined for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)

# Code the for loop with conditionals
# For every day: print a verdict first, then the view count itself
for (i in seq_along(linkedin)) {
  if (linkedin[i] > 10) {
    print("You're popular!")
  } else {
    print("Be more visible!")
  }
  print(linkedin[i])
}

## [1] "You're popular!"
## [1] 16
## [1] "Be more visible!"
## [1] 9
## [1] "You're popular!"
## [1] 13
## [1] "Be more visible!"
## [1] 5
## [1] "Be more visible!"
## [1] 2
## [1] "You're popular!"
## [1] 17
## [1] "You're popular!"
## [1] 14

Next, you break it

# The linkedin vector has already been defined for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)

# Extend the for loop
for (li in linkedin) {
  if (li > 10) {
    print("You're popular!")
  } else {
    print("Be more visible!")
  }
  
  # Add code to conditionally break iteration
  if (li > 16) {
    print("This is ridiculous, I'm outta here!")
    break
  }
  
  # Add code to conditionally skip iteration
  if (li < 5) {
    print("This is too embarrassing!")
    next
  }
  
  print(li)
}

## [1] "You're popular!"
## [1] 16
## [1] "Be more visible!"
## [1] 9
## [1] "You're popular!"
## [1] 13
## [1] "Be more visible!"
## [1] 5
## [1] "Be more visible!"
## [1] "This is too embarrassing!"
## [1] "You're popular!"
## [1] "This is ridiculous, I'm outta here!"

Build a loop from scratch

# Pre-defined variables
rquote <- "R's internals are irrefutably intriguing"
chars <- strsplit(rquote, split = "")[[1]]
chars

##  [1] "R" "'" "s" " " "i" "n" "t" "e" "r" "n" "a" "l" "s" " " "a" "r" "e" " " "i"
## [20] "r" "r" "e" "f" "u" "t" "a" "b" "l" "y" " " "i" "n" "t" "r" "i" "g" "u" "i"
## [39] "n" "g"

# Your solution here
# Count the r's (either case) that appear before the first "u".
# The loop variable is called `char` rather than `c` so it cannot be
# confused with the base function c().
rcount <- 0
for (char in chars) {
  if (char == "u") {
    break
  }
  if (char %in% c("r", "R")) {
    rcount <- rcount + 1
  }
}

# Print the resulting rcount variable to the console
print(rcount)

## [1] 5

Functions

Introduction to Functions

You have already been using these a bunch.

Function documentation

# Consult the documentation on the mean() function
# In Rstudio this will open the help window
?mean

# Inspect the arguments of the mean() function
args(mean)

## function (x, ...) 
## NULL

Use a function

# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# Calculate average number of views
avg_li <- mean(linkedin)
avg_fb <- mean(facebook)

# Inspect avg_li and avg_fb
avg_li

## [1] 10.85714

avg_fb

## [1] 11.42857

# Calculate the mean of linkedin minus facebook
mean(linkedin - facebook)

## [1] -0.5714286

Use a function (2)

# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# Calculate the mean of the sum
avg_sum <- mean(linkedin + facebook)

# Calculate the trimmed mean of the sum
avg_sum_trimmed <- mean(linkedin + facebook, trim = 0.2)

# Inspect both new variables
avg_sum

## [1] 22.28571

avg_sum_trimmed

## [1] 22.6

Use a function (3)

# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, NA, 17, 14)
facebook <- c(17, NA, 5, 16, 8, 13, 14)

# Basic average: a single NA makes the whole result NA
mean(linkedin)

## [1] NA

mean(facebook)

## [1] NA

# Advanced average: drop the NAs before averaging
mean(linkedin, na.rm = TRUE)

## [1] 12.33333

mean(facebook, na.rm = TRUE)

## [1] 12.16667

Functions inside functions

# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, NA, 17, 14)
facebook <- c(17, NA, 5, 16, 8, 13, 14)

# Calculate the mean absolute deviation
# (days where either vector is NA give an NA difference, which na.rm drops)
mean(abs(linkedin - facebook), na.rm = TRUE)

## [1] 4.8

Required, or optional?

There are required and optional arguments to a function
Some (or actually most) of the optional arguments have a default value
- this will be set automatically
- for example, read.table has header = FALSE by default

The beginning of the read.table function definition looks like this:

read.table(file, header = FALSE, sep = "", quote = "\"",...

In the read.table() function:

file is required
header, sep, and quote are optional arguments
- header is defaulted to FALSE
- sep is defaulted to an empty string ""
- quote is defaulted to "
You do not have to write all the argument names
- read.table("myfile.txt", TRUE, "-") will work.
But the order matters if you use that shortcut
- read.table("myfile.txt", "-", TRUE) will throw an error.
You can use any order if you specify all the names and values
- read.table("myfile.txt", sep = "-", header = TRUE) will work.

Writing Functions

Write your own functions

# pow_two(): return the square of x
pow_two <- function(x) {
  x^2
}

# Use the function 
pow_two(12)

## [1] 144

# sum_abs(): add the absolute values of x and y
sum_abs <- function(x, y) {
  abs(x) + abs(y)
}

# Use the function
sum_abs(-2, 3)

## [1] 5

Write your own functions (2)

# hello(): takes no arguments, prints a greeting and returns TRUE
hello <- function() {
  print("Hi there!")
  TRUE
}

# Call the function hello()
hello()

## [1] "Hi there!"

## [1] TRUE

# my_filter(): pass x through when it is zero or positive, otherwise NULL
my_filter <- function(x) {
  if (x >= 0) x else NULL
}

# Call the function my_filter() twice
my_filter(5)

## [1] 5

my_filter(-5)

## NULL

Write your own functions (3)

# Extend the pow_two() function
# Squares x. When print_info is TRUE (the default) it also prints a sentence
# describing the calculation; the square is returned either way.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
pow_two <- function(x, print_info = TRUE) {

  y <- x^2

  if (print_info) {
    print(paste(x, "to the power two equals", y))
  }

  y
}

pow_two(5)

## [1] "5 to the power two equals 25"

## [1] 25

pow_two(5, print_info = FALSE)

## [1] 25

Function scoping

Variables defined inside a function are not available outside of that function
- Nor are the variable names given to the input
- Calling x or y outside of this function would fail

pow_two <- function(x) {
  # x (the argument) and y (the local result) exist only inside this call
  y <- x^2
  y
}
pow_two(4)

R passes arguments by value

In other words, a function won’t change the original variable passed in

# triple() reassigns x, but only its own local copy changes
triple <- function(x) {
  x <- 3 * x
  x
}

a <- 5
triple(a)

## [1] 15

# a itself still holds its original value
a

## [1] 5

R you functional?

# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# interpret(): print a verdict on a view count.
# Returns the count itself when it is above 15, otherwise 0.
interpret <- function(x) {
  if (x <= 15) {
    print("Try to be more visible!")
    return(0)
  }

  print("You're popular!")
  x
}

# Call the interpret function twice
interpret(linkedin[1])

## [1] "You're popular!"

## [1] 16

interpret(facebook[2])

## [1] "Try to be more visible!"

## [1] 0

R you functional? (2)

# The linkedin and facebook vectors have already been created for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# interpret() is the per-value helper that interpret_all() calls:
# it prints a verdict and returns num_views when above 15, otherwise 0
interpret <- function(num_views) {
  popular <- num_views > 15
  print(if (popular) "You're popular!" else "Try to be more visible!")
  if (popular) num_views else 0
}

# Define the interpret_all() function
# Calls interpret() on every element of x (which prints one message per
# element) and adds up the values it returns.
#   x        - vector of view counts
#   show_sum - when TRUE (the default) return the total, otherwise NULL
interpret_all <- function(x, show_sum = TRUE) {

  sum_pop_views <- 0

  for (views in x) {
    sum_pop_views <- sum_pop_views + interpret(views)
  }

  if (show_sum) {
    sum_pop_views
  } else {
    NULL
  }

}

# Call the interpret_all() function on both linkedin and facebook
interpret_all(facebook)

## [1] "You're popular!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "You're popular!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"

## [1] 33

interpret_all(linkedin)

## [1] "You're popular!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "Try to be more visible!"
## [1] "You're popular!"
## [1] "Try to be more visible!"

## [1] 33

R Packages

Many great functions are available through packages
The base package is installed with R and loaded when you start R
The others you need to load yourself when you want them
I think the reason R is so useful today is because it has amazing packages
- Especially the ones made by Hadley Wickham or the folks at RStudio

Load an R Package

# The mtcars vectors have already been prepared for you
wt <- mtcars$wt
hp <- mtcars$hp

# Request the currently attached packages
search()

##  [1] ".GlobalEnv"        "package:codetools" "package:shiny"    
##  [4] "package:stats"     "package:graphics"  "package:grDevices"
##  [7] "package:utils"     "package:datasets"  "package:methods"  
## [10] "Autoloads"         "package:base"

# Load the ggplot2 package
library(ggplot2)

# Use the qplot() function. 
# It will fail if you try before loading ggplot2 library
qplot(wt,hp)

# Check out the currently attached packages again
search()

##  [1] ".GlobalEnv"        "package:ggplot2"   "package:codetools"
##  [4] "package:shiny"     "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"

Different ways to load a package

It's not picky. These all work.

## I already have ggplot2 installed
## install.packages("ggplot2")

# library() accepts the package name with or without quotes
library(ggplot2)
library("ggplot2")
# require() also attaches the package, but when the package is missing it
# returns FALSE instead of stopping with an error, so library() is the
# safer choice for loading dependencies in a script
require(ggplot2)

The apply family

lapply

applies a function to each element of a list/vector
less code than writing out a whole loop
also faster
always returns a list
- wrap in unlist() if you want a vector

Use lapply with a built-in R function

# The vector pioneers has already been created for you
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")

# Split names from birth year: split_math
split_math <- strsplit(pioneers, ":")
split_math

## [[1]]
## [1] "GAUSS" "1777" 
## 
## [[2]]
## [1] "BAYES" "1702" 
## 
## [[3]]
## [1] "PASCAL" "1623"  
## 
## [[4]]
## [1] "PEARSON" "1857"

# Convert to lowercase strings: split_low
split_low <- lapply(split_math, tolower)
split_low

## [[1]]
## [1] "gauss" "1777" 
## 
## [[2]]
## [1] "bayes" "1702" 
## 
## [[3]]
## [1] "pascal" "1623"  
## 
## [[4]]
## [1] "pearson" "1857"

# Take a look at the structure of split_low
str(split_low)

## List of 4
##  $ : chr [1:2] "gauss" "1777"
##  $ : chr [1:2] "bayes" "1702"
##  $ : chr [1:2] "pascal" "1623"
##  $ : chr [1:2] "pearson" "1857"

# For comparison, if you wrote out the whole loop
# The result list is created at its final length up front rather than
# grown one element at a time, and seq_along() keeps the loop safe
# if split_math were ever empty
split_low2 <- vector("list", length(split_math))

for (i in seq_along(split_math)) {
  split_low2[[i]] <- tolower(split_math[[i]])
}

split_low2

## [[1]]
## [1] "gauss" "1777" 
## 
## [[2]]
## [1] "bayes" "1702" 
## 
## [[3]]
## [1] "pascal" "1623"  
## 
## [[4]]
## [1] "pearson" "1857"

Use lapply with your own function

# Code from previous exercise:
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
split <- strsplit(pioneers, split = ":")
split_low <- lapply(split, tolower)

# select_first(): keep only the first element of x
select_first <- function(x) {
  x[1]
}

# Apply select_first() over split_low: names
names <- lapply(split_low, select_first)
names

## [[1]]
## [1] "gauss"
## 
## [[2]]
## [1] "bayes"
## 
## [[3]]
## [1] "pascal"
## 
## [[4]]
## [1] "pearson"

# select_second(): keep only the second element of x
select_second <- function(x) {
  x[2]
}

# Apply select_second() over split_low: years
years <- lapply(split_low, select_second)
years

## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"

lapply and anonymous functions

# Definition of split_low
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
split <- strsplit(pioneers, split = ":")
split_low <- lapply(split, tolower)

names <- lapply(split_low, function(x) {x[1]})
years <- lapply(split_low, function(x) {x[2]})

names

## [[1]]
## [1] "gauss"
## 
## [[2]]
## [1] "bayes"
## 
## [[3]]
## [1] "pascal"
## 
## [[4]]
## [1] "pearson"

years

## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"

Use lapply with additional arguments

# Definition of split_low
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
split <- strsplit(pioneers, split = ":")
split_low <- lapply(split, tolower)

# select_el() replaces both select_*() helpers: it returns the element(s)
# of x at position pos
select_el <- function(x, pos) x[pos]

# Call the select_el() function twice on split_low: names and years
names <- lapply(split_low, select_el, 1)
years <- lapply(split_low, select_el, 2)

names

## [[1]]
## [1] "gauss"
## 
## [[2]]
## [1] "bayes"
## 
## [[3]]
## [1] "pascal"
## 
## [[4]]
## [1] "pearson"

years

## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"

Apply functions that return NULL

lapply(list(1, "a", TRUE), str)

##  num 1
##  chr "a"
##  logi TRUE

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL

str(list(1,"a",TRUE))

## List of 3
##  $ : num 1
##  $ : chr "a"
##  $ : logi TRUE

str(TRUE)

##  logi TRUE

sapply

Stands for Simple apply
Like lapply, but returns a vector if it can
It will name the vector by default
If there are multiple outputs it will return a matrix
If it can’t return a vector or matrix it will return a list
Be careful with this if you expect a certain data type returned in a program

How to use sapply

# temp variable
temp <- list(
  c(3, 7,  9,  6, -1),
  c(6,  9, 12, 13,  5),
  c(4,  8,  3, -1, -3),
  c(1,  4,  7,  2, -2),
  c(5, 7, 9, 4, 2),
  c(-3,  5,  8,  9,  4),
  c(3, 6, 9, 4, 1)
  )

# Use lapply() to find each day's minimum temperature
lapply(temp, min)

## [[1]]
## [1] -1
## 
## [[2]]
## [1] 5
## 
## [[3]]
## [1] -3
## 
## [[4]]
## [1] -2
## 
## [[5]]
## [1] 2
## 
## [[6]]
## [1] -3
## 
## [[7]]
## [1] 1

# Use sapply() to find each day's minimum temperature
sapply(temp, min)

## [1] -1  5 -3 -2  2 -3  1

# Use lapply() to find each day's maximum temperature
lapply(temp, max)

## [[1]]
## [1] 9
## 
## [[2]]
## [1] 13
## 
## [[3]]
## [1] 8
## 
## [[4]]
## [1] 7
## 
## [[5]]
## [1] 9
## 
## [[6]]
## [1] 9
## 
## [[7]]
## [1] 9

# Use sapply() to find each day's maximum temperature
sapply(temp, max)

## [1]  9 13  8  7  9  9  9

sapply with your own function

# temp is already defined in the workspace

# extremes_avg(): the midpoint between the smallest and largest value of x
extremes_avg <- function(x) {
  lo_hi <- c(min(x), max(x))
  mean(lo_hi)
}

# Apply extremes_avg() over temp using sapply()
sapply(temp, extremes_avg)

## [1] 4.0 9.0 2.5 2.5 5.5 3.0 5.0

# Apply extremes_avg() over temp using lapply()
lapply(temp, extremes_avg)

## [[1]]
## [1] 4
## 
## [[2]]
## [1] 9
## 
## [[3]]
## [1] 2.5
## 
## [[4]]
## [1] 2.5
## 
## [[5]]
## [1] 5.5
## 
## [[6]]
## [1] 3
## 
## [[7]]
## [1] 5

sapply with function returning vector

# temp is already available in the workspace

# extremes(): return the smallest and largest value of x, in that order
extremes <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  c(lowest, highest)
}

# Apply extremes() over temp with sapply()
sapply(temp, extremes)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]   -1    5   -3   -2    2   -3    1
## [2,]    9   13    8    7    9    9    9

# Apply extremes() over temp with lapply()
lapply(temp, extremes)

## [[1]]
## [1] -1  9
## 
## [[2]]
## [1]  5 13
## 
## [[3]]
## [1] -3  8
## 
## [[4]]
## [1] -2  7
## 
## [[5]]
## [1] 2 9
## 
## [[6]]
## [1] -3  9
## 
## [[7]]
## [1] 1 9

sapply can’t simplify, now what?

# temp is already prepared for you in the workspace

# below_zero(): keep only the strictly negative values of x
below_zero <- function(x) {
  is_negative <- x < 0
  x[is_negative]
}

below_zero(c(1,2,3,-1,-1,-3))

## [1] -1 -1 -3

# Apply below_zero over temp using sapply(): freezing_s
freezing_s <- sapply(temp, below_zero)
freezing_s

## [[1]]
## [1] -1
## 
## [[2]]
## numeric(0)
## 
## [[3]]
## [1] -1 -3
## 
## [[4]]
## [1] -2
## 
## [[5]]
## numeric(0)
## 
## [[6]]
## [1] -3
## 
## [[7]]
## numeric(0)

# Apply below_zero over temp using lapply(): freezing_l
freezing_l <- lapply(temp, below_zero)
freezing_l

## [[1]]
## [1] -1
## 
## [[2]]
## numeric(0)
## 
## [[3]]
## [1] -1 -3
## 
## [[4]]
## [1] -2
## 
## [[5]]
## numeric(0)
## 
## [[6]]
## [1] -3
## 
## [[7]]
## numeric(0)

# Compare freezing_s to freezing_l using identical()
identical(freezing_s, freezing_l)

## [1] TRUE

sapply with functions that return NULL

# temp is already available in the workspace

# print_info(): write the average of x to the console with cat().
# cat() returns NULL, so that is also what this function returns.
print_info <- function(x) {
  avg_temp <- mean(x)
  cat("The average temperature is", avg_temp, "\n")
}


# Apply print_info() over temp using lapply()
lapply(temp, print_info)

## The average temperature is 4.8 
## The average temperature is 9 
## The average temperature is 2.2 
## The average temperature is 2.4 
## The average temperature is 5.4 
## The average temperature is 4.6 
## The average temperature is 4.6

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL

# Apply print_info() over temp using sapply()
sapply(temp, print_info)

## The average temperature is 4.8 
## The average temperature is 9 
## The average temperature is 2.2 
## The average temperature is 2.4 
## The average temperature is 5.4 
## The average temperature is 4.6 
## The average temperature is 4.6

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL

Reverse engineering sapply

this uses an anonymous function
the result will have 3 rows and 2 columns

sapply(list(runif (10), runif (10)), 
       function(x) c(min = min(x), mean = mean(x), max = max(x)))

##           [,1]       [,2]
## min  0.4230623 0.01313717
## mean 0.7350797 0.46165963
## max  0.9431425 0.74547888

vapply

this is safer than sapply
- because sapply can return a vector or a list (if result lengths differ)
you can tell it what the return data type should be

Use vapply

# temp is already available in the workspace

# basics(): return the minimum, mean and maximum of x as an unnamed
# vector of length 3
basics <- function(x) {
  c(min(x), mean(x), max(x))
}

# Apply basics() over temp using vapply()
vapply(temp, basics, numeric(3))

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] -1.0    5 -3.0 -2.0  2.0 -3.0  1.0
## [2,]  4.8    9  2.2  2.4  5.4  4.6  4.6
## [3,]  9.0   13  8.0  7.0  9.0  9.0  9.0

Use vapply (2)

# temp is already available in the workspace

# basics(): return the min, mean, median and max of x as a named vector
basics <- function(x) {
  summary_values <- c(
    min = min(x),
    mean = mean(x),
    median = median(x),
    max = max(x)
  )
  summary_values
}

# This time there are 4 values
vapply(temp, basics, numeric(4))

##        [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## min    -1.0    5 -3.0 -2.0  2.0 -3.0  1.0
## mean    4.8    9  2.2  2.4  5.4  4.6  4.6
## median  6.0    9  3.0  2.0  5.0  5.0  4.0
## max     9.0   13  8.0  7.0  9.0  9.0  9.0

From sapply to vapply

# temp is already defined in the workspace
temp

## [[1]]
## [1]  3  7  9  6 -1
## 
## [[2]]
## [1]  6  9 12 13  5
## 
## [[3]]
## [1]  4  8  3 -1 -3
## 
## [[4]]
## [1]  1  4  7  2 -2
## 
## [[5]]
## [1] 5 7 9 4 2
## 
## [[6]]
## [1] -3  5  8  9  4
## 
## [[7]]
## [1] 3 6 9 4 1

# sapply() expression
sapply(temp, max)

## [1]  9 13  8  7  9  9  9

# Convert to vapply() expression
vapply(temp, max, numeric(1))

## [1]  9 13  8  7  9  9  9

# sapply() expression
sapply(temp, function(x, y) { mean(x) > y }, y = 5)

## [1] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE

# Convert to vapply() expression
vapply(temp, function(x, y) { mean(x) > y }, y = 5, logical(1))

## [1] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE

# get_info(): describe x by comparing its mean with the threshold y
get_info <- function(x, y) {
  if (mean(x) > y) "Not too cold!" else "Pretty cold!"
}

# sapply() expression
sapply(temp, get_info, y = 5)

## [1] "Pretty cold!"  "Not too cold!" "Pretty cold!"  "Pretty cold!" 
## [5] "Not too cold!" "Pretty cold!"  "Pretty cold!"

# Convert to vapply() expression
vapply(temp, get_info, y = 5, character(1))

## [1] "Pretty cold!"  "Not too cold!" "Pretty cold!"  "Pretty cold!" 
## [5] "Not too cold!" "Pretty cold!"  "Pretty cold!"

Utilities

Useful Functions

The class video goes through some useful functions. I’ll list them here. There are many more than this.

abs()
- calc the absolute value of a vector
sum()
- calc the sum of a vector
mean()
- calc the mean value of a vector
seq()
- create a sequence
rep()
- repeat an array
sort()
- sort a vector
str()
- see the structure of your datatype
is.*(), as.*()
- check your data type or change it
append(), rev()
- append and reverse vectors

Mathematical utilities

# The errors vector has already been defined for you
errors <- c(1.9,-2.6,4.0,-9.5,-3.4,7.3)

# Sum of absolute rounded values of errors
sum(round(abs(errors)))

## [1] 29

Find the error

# Don't edit these two lines
vec1 <- c(1.5,2.5,8.4,3.7,6.3)
vec2 <- rev(vec1)

# Fix the error
mean(c(abs(vec1), abs(vec2)))

## [1] 4.48

Data Utilities

# linkedin and facebook are stored as lists this time
linkedin <- list(16, 9, 13, 5, 2, 17, 14)
facebook <- list(17, 7, 5, 16, 8, 13, 14)

# Convert linkedin and facebook to a vector: li_vec and fb_vec
li_vec <- as.numeric(linkedin)
fb_vec <- as.numeric(facebook)


# Append fb_vec to li_vec: social_vec
social_vec <- append(li_vec, fb_vec)
social_vec

##  [1] 16  9 13  5  2 17 14 17  7  5 16  8 13 14

# Sort social_vec from largest to smallest
sort(social_vec, decreasing = TRUE)

##  [1] 17 17 16 16 14 14 13 13  9  8  7  5  5  2

Find the error (2)

# Fix me
round(sum(unlist(list(1.1,3,5))))

## [1] 9

# Fix me
rep(seq(1, 7, by = 2), times = 7)

##  [1] 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7 1 3 5 7

Beat Gauss using R

# First sequence, seq1: start at 1 and step by 3 without going past 500
seq1 <- seq(from = 1, to = 500, by = 3)
seq1

##   [1]   1   4   7  10  13  16  19  22  25  28  31  34  37  40  43  46  49  52
##  [19]  55  58  61  64  67  70  73  76  79  82  85  88  91  94  97 100 103 106
##  [37] 109 112 115 118 121 124 127 130 133 136 139 142 145 148 151 154 157 160
##  [55] 163 166 169 172 175 178 181 184 187 190 193 196 199 202 205 208 211 214
##  [73] 217 220 223 226 229 232 235 238 241 244 247 250 253 256 259 262 265 268
##  [91] 271 274 277 280 283 286 289 292 295 298 301 304 307 310 313 316 319 322
## [109] 325 328 331 334 337 340 343 346 349 352 355 358 361 364 367 370 373 376
## [127] 379 382 385 388 391 394 397 400 403 406 409 412 415 418 421 424 427 430
## [145] 433 436 439 442 445 448 451 454 457 460 463 466 469 472 475 478 481 484
## [163] 487 490 493 496 499

# Second sequence, seq2: count down from 1200 in steps of 7, stopping before
# the value would drop below 900
seq2 <- seq(from = 1200, to = 900, by = -7)
seq2

##  [1] 1200 1193 1186 1179 1172 1165 1158 1151 1144 1137 1130 1123 1116 1109 1102
## [16] 1095 1088 1081 1074 1067 1060 1053 1046 1039 1032 1025 1018 1011 1004  997
## [31]  990  983  976  969  962  955  948  941  934  927  920  913  906

# Grand total over both sequences
sum(seq1, seq2)

## [1] 87029

Regular Expressions

Regular expressions are sequences of characters and meta-characters that describe a pattern to match in text
They are used for
- pattern existence
- pattern replacement
- pattern extraction

grepl & grep

# Sample addresses for the exercise; note that not all of them are valid
# or end in .edu
emails <- c(
  "john.doe@ivyleague.edu",
  "education@world.gov",
  "dalai.lama@peace.org",
  "invalid.edu",
  "quant@bigdatacollege.edu",
  "cookie.monster@sesame.tv"
)

# grepl() gives one TRUE/FALSE per element: does it contain "edu" anywhere?
grepl(pattern = "edu", x = emails)

## [1]  TRUE  TRUE FALSE  TRUE  TRUE FALSE

# grep() gives the positions of the matching elements instead; keep them in hits
hits <- grep(pattern = "edu", x = emails)
hits

## [1] 1 2 4 5

# Index emails by those positions to keep only the matches
emails[hits]

## [1] "john.doe@ivyleague.edu"   "education@world.gov"     
## [3] "invalid.edu"              "quant@bigdatacollege.edu"

grepl & grep (2)

# Same sample addresses as before
emails <- c(
  "john.doe@ivyleague.edu",
  "education@world.gov",
  "dalai.lama@peace.org",
  "invalid.edu",
  "quant@bigdatacollege.edu",
  "cookie.monster@sesame.tv"
)

# Stricter pattern: an "@", then anything, then a literal ".edu" at the very
# end of the string. grep() reports the positions that match.
grep(pattern = "@.*\\.edu$", x = emails)

## [1] 1 5

# The same pattern with grepl() yields a logical mask, stored in hits
hits <- grepl(pattern = "@.*\\.edu$", x = emails)
hits

## [1]  TRUE FALSE FALSE FALSE  TRUE FALSE

# Keep only the elements where the mask is TRUE
emails[hits]

## [1] "john.doe@ivyleague.edu"   "quant@bigdatacollege.edu"

sub and gsub

# Same sample addresses as before
emails <- c(
  "john.doe@ivyleague.edu",
  "education@world.gov",
  "dalai.lama@peace.org",
  "invalid.edu",
  "quant@bigdatacollege.edu",
  "cookie.monster@sesame.tv"
)

# Attempt 1: swap the matched domain for datacamp.edu.
# The match includes the "@", so the replacement drops it.
sub(pattern = "@.*\\.edu$", replacement = "datacamp.edu", x = emails)

## [1] "john.doedatacamp.edu"     "education@world.gov"     
## [3] "dalai.lama@peace.org"     "invalid.edu"             
## [5] "quantdatacamp.edu"        "cookie.monster@sesame.tv"

# Attempt 2: put the "@" back by including it in the replacement.
# That's better.
sub(pattern = "@.*\\.edu$", replacement = "@datacamp.edu", x = emails)

## [1] "john.doe@datacamp.edu"    "education@world.gov"     
## [3] "dalai.lama@peace.org"     "invalid.edu"             
## [5] "quant@datacamp.edu"       "cookie.monster@sesame.tv"

sub and gsub (2)

# Free-text award summaries; most of them mention a nomination count
awards <- c(
  "Won 1 Oscar.",
  "Won 1 Oscar. Another 9 wins & 24 nominations.",
  "1 win and 2 nominations.",
  "2 wins & 3 nominations.",
  "Nominated for 2 Golden Globes. 1 more win & 2 nominations.",
  "4 wins & 1 nomination."
)

# Replace each whole string with the number captured just before "nomination".
# Elements where the pattern does not match come back unchanged.
sub(pattern = ".*\\s([0-9]+)\\snomination.*$", replacement = "\\1", x = awards)

## [1] "Won 1 Oscar." "24"           "2"            "3"            "2"           
## [6] "1"

Times and Dates

Right here, right now

# Get the current date: today
today <- Sys.Date()

# See what today looks like under the hood:
# unclass() strips the Date class and shows the plain number stored inside
# (R represents a Date as the number of days since 1970-01-01)
unclass(today)

## [1] 18313

# Get the current time: now
now <- Sys.time()

# See what now looks like under the hood:
# unclass() strips the POSIXct class and shows the plain number stored inside
# (R represents a POSIXct time as the number of seconds since 1970-01-01 UTC)
unclass(now)

## [1] 1582280279

Create and format times

This will be helpful…
- look up the format codes (e.g. %Y, %b, %d) in the strptime documentation.
- e.g. ?strptime
- There are a lot of them

# Three date strings, each written in a different layout
str1 <- "May 23, '96"
str2 <- "2012-3-15"
str3 <- "30/January/2006"

# Parse each string into a Date, describing its layout with format codes:
# date1, date2, date3
date1 <- as.Date(str1, format = "%b %d, '%y")
date2 <- as.Date(str2, format = "%Y-%m-%d")
date3 <- as.Date(str3, format = "%d/%B/%Y")

date1

## [1] "1996-05-23"

date2

## [1] "2012-03-15"

date3

## [1] "2006-01-30"

# Going the other way: turn Date objects back into formatted strings

# Full weekday name
format(date1, format = "%A")

## [1] "Thursday"

# Day of the month as a two-digit number
format(date2, format = "%d")

## [1] "15"

# Abbreviated month name followed by the four-digit year
format(date3, format = "%b %Y")

## [1] "Jan 2006"

# Two date-time strings; str1 and str2 are reused for the new values
str1 <- "May 23, '96 hours:23 minutes:01 seconds:45"
str2 <- "2012-3-12 14:23:08"

# Parse the strings into POSIXct objects: time1, time2.
# Literal text in the format (such as "hours:") must match the input exactly.
time1 <- as.POSIXct(
  str1,
  format = "%B %d, '%y hours:%H minutes:%M seconds:%S"
)
time2 <- as.POSIXct(str2, format = "%Y-%m-%d %T")

time1

## [1] "1996-05-23 23:01:45 EDT"

time2

## [1] "2012-03-12 14:23:08 EDT"

# Going the other way: turn POSIXct objects back into formatted strings

# Just the minute
format(time1, format = "%M")

## [1] "01"

# Hour and minute on a 12-hour clock, followed by AM/PM
format(time2, format = "%I:%M %p")

## [1] "02:23 PM"

Calculations with Dates

# The five pizza days, defined here so the example runs on its own
day1 <- as.Date("2017-03-12")
day2 <- as.Date("2017-03-14")
day3 <- as.Date("2017-03-19")
day4 <- as.Date("2017-03-25")
day5 <- as.Date("2017-03-30")

# Subtracting two Dates gives the time span between the first and last pizza day
day5 - day1

## Time difference of 18 days

# Collect the individual days into one Date vector: pizza
pizza <- c(day1, day2, day3, day4, day5)
pizza

## [1] "2017-03-12" "2017-03-14" "2017-03-19" "2017-03-25" "2017-03-30"

# Gaps between each pizza day and the next one: day_diff
day_diff <- diff(pizza, lag = 1)
day_diff

## Time differences in days
## [1] 2 5 6 5

# Typical wait between two consecutive pizza days
mean(day_diff)

## Time difference of 4.5 days

Calculations with Times

# login and logout times, parsed explicitly as UTC.
# A trailing " UTC" inside the string is not read by as.POSIXct(); the zone
# has to be passed through the tz argument, otherwise the machine's local
# time zone is assumed.
login <- as.POSIXct(
  c(
    "2017-03-16 10:18:04",
    "2017-03-21 09:14:18",
    "2017-03-21 12:21:51",
    "2017-03-21 12:37:24",
    "2017-03-23 21:37:55"
  ),
  tz = "UTC"
)

logout <- as.POSIXct(
  c(
    "2017-03-16 10:56:29",
    "2017-03-21 09:14:52",
    "2017-03-21 12:35:48",
    "2017-03-21 13:17:22",
    "2017-03-23 22:08:47"
  ),
  tz = "UTC"
)

# Calculate the difference between login and logout: time_online
# (element-wise subtraction of POSIXct vectors gives a difftime vector)
time_online <- logout - login

# Inspect the variable time_online
time_online

## Time differences in secs
## [1] 2305   34  837 2398 1852

# Calculate the total time online
sum(time_online)

## Time difference of 7426 secs

# Calculate the average time online
mean(time_online)

## Time difference of 1485.2 secs

Time is of the essence

# Season start dates as strings in day-month-year layout, named by season
astro <- c(
  spring = "20-Mar-2015",
  summer = "25-Jun-2015",
  fall = "23-Sep-2015",
  winter = "22-Dec-2015"
)
astro

##        spring        summer          fall        winter 
## "20-Mar-2015" "25-Jun-2015" "23-Sep-2015" "22-Dec-2015"

# A second set of season start dates, written "Month day, two-digit year"
meteo <- c(
  spring = "March 1, 15",
  summer = "June 1, 15",
  fall = "September 1, 15",
  winter = "December 1, 15"
)
meteo

##            spring            summer              fall            winter 
##     "March 1, 15"      "June 1, 15" "September 1, 15"  "December 1, 15"

# Parse astro into Date objects (astro_dates) and compare the structure of
# the strings with the structure of the parsed dates
astro_dates <- as.Date(astro, format = "%d-%b-%Y")
str(astro)

##  Named chr [1:4] "20-Mar-2015" "25-Jun-2015" "23-Sep-2015" "22-Dec-2015"
##  - attr(*, "names")= chr [1:4] "spring" "summer" "fall" "winter"

str(astro_dates)

##  Date[1:4], format: "2015-03-20" "2015-06-25" "2015-09-23" "2015-12-22"

# Same for meteo: parse into Date objects (meteo_dates) and compare structures
meteo_dates <- as.Date(meteo, format = "%B %d, %y")
str(meteo)

##  Named chr [1:4] "March 1, 15" "June 1, 15" "September 1, 15" ...
##  - attr(*, "names")= chr [1:4] "spring" "summer" "fall" "winter"

str(meteo_dates)

##  Date[1:4], format: "2015-03-01" "2015-06-01" "2015-09-01" "2015-12-01"

# Largest gap, in either direction, between the two sets of season dates
max(abs(astro_dates - meteo_dates))

## Time difference of 24 days

The End

Woof, that's a lot!
- And that's just a sample of some key programming topics in R
There's a lot more to learn in each of those topics
- Especially the functions and packages
- There are a lot of key packages to work with dataframes or times
- There are packages for everything really
This doc is useful to me as a reference
- Now I just need to see it again a few times to really get it
- And I can look back here if I ever need a little refresher in the future

Intermediate R

Amar Kapote

2017-03-29