This document was updated on 2017-07-04.


library(dplyr)

Introduction

What's Covered

  • Conditionals and Control Flow
    • Relational Operators. == < >
    • Logical Operators. && || !
    • Conditional Statements. if else
  • While and For loops
  • Functions
  • Apply statements
    • lapply, sapply, vapply
  • Utilities
    • Regular Expressions and grep/sub
    • Times and Dates

   


Conditionals and Control Flow

## my grade in the class (hypothetically)
me 
## [1] 89
## the other students grades
glimpse(other_199)
##  num [1:188] 66 81 77 90 95 54 78 90 71 96 ...
## grades for students over the past 4 years
glimpse(previous_4)
##  int [1:200, 1:4] 82 66 66 86 76 74 56 80 71 69 ...
head(previous_4)
##      [,1] [,2] [,3] [,4]
## [1,]   82   89   53   70
## [2,]   66   89   66   76
## [3,]   66   65   85   73
## [4,]   86   79   65   64
## [5,]   76   83   75   56
## [6,]   74   73   69   71

Grades analysis in R

  • First, we just join all the data to get a matrix of 5 years of data
  • Normally the class size would differ from year to year, but we ignore that here
# Merge me and other_199: my_class
my_class <- c(me, other_199)
head(my_class)
## [1] 89 66 81 77 90 95
# cbind() my_class and previous_4: last_5
last_5 <- cbind(my_class, previous_4)
head(last_5)
##      my_class            
## [1,]       89 82 89 53 70
## [2,]       66 66 89 66 76
## [3,]       81 66 65 85 73
## [4,]       77 86 79 65 64
## [5,]       90 76 83 75 56
## [6,]       95 74 73 69 71
# Label the five columns of last_5 as year_1 ... year_5
nms <- paste("year", 1:5, sep = "_")
colnames(last_5) <- nms
head(last_5)
##      year_1 year_2 year_3 year_4 year_5
## [1,]     89     82     89     53     70
## [2,]     66     66     89     66     76
## [3,]     81     66     65     85     73
## [4,]     77     86     79     65     64
## [5,]     90     76     83     75     56
## [6,]     95     74     73     69     71

Explore your data

  • There is an entire class on this later so this is just the start,
  • These 3 functions are pretty useful
# Build histogram of my_class
hist(my_class)

# Generate summary of last_5
summary(last_5)
##      year_1           year_2           year_3           year_4           year_5      
##  Min.   : 50.00   Min.   : 50.00   Min.   : 50.00   Min.   : 50.00   Min.   : 50.00  
##  1st Qu.: 68.00   1st Qu.: 63.00   1st Qu.: 70.00   1st Qu.: 66.00   1st Qu.: 70.00  
##  Median : 75.50   Median : 71.50   Median : 77.00   Median : 73.50   Median : 78.00  
##  Mean   : 75.78   Mean   : 72.28   Mean   : 76.24   Mean   : 74.50   Mean   : 77.69  
##  3rd Qu.: 83.25   3rd Qu.: 80.00   3rd Qu.: 84.00   3rd Qu.: 82.25   3rd Qu.: 88.00  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00
# Build boxplot of last_5
boxplot(last_5)

Basic queries

  • This uses relational operators to answer very simple questions
  • I am wrapping some of these in head, just to shorten the results
# Is your grade equal to 72?
me == 72
## [1] FALSE
# Which grades in your class are higher than 75?
head(my_class > 75, 10)
##  [1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
# Which grades in the last 5 years are below or equal to 64?
head(last_5 <= 64, 10)
##       year_1 year_2 year_3 year_4 year_5
##  [1,]  FALSE  FALSE  FALSE   TRUE  FALSE
##  [2,]  FALSE  FALSE  FALSE  FALSE  FALSE
##  [3,]  FALSE  FALSE  FALSE  FALSE  FALSE
##  [4,]  FALSE  FALSE  FALSE  FALSE   TRUE
##  [5,]  FALSE  FALSE  FALSE  FALSE   TRUE
##  [6,]  FALSE  FALSE  FALSE  FALSE  FALSE
##  [7,]   TRUE   TRUE  FALSE  FALSE  FALSE
##  [8,]  FALSE  FALSE  FALSE  FALSE  FALSE
##  [9,]  FALSE  FALSE   TRUE  FALSE  FALSE
## [10,]  FALSE  FALSE  FALSE  FALSE  FALSE

Build aggregates

  • Because the boolean arrays are only so useful. Normally a count or proportion is more helpful.
# How many grades in your class are higher than 75?
sum(my_class > 75)
## [1] 92
# How many students in your class scored strictly higher than you?
sum(my_class > me)
## [1] 26
# What's the proportion of grades below or equal to 64 in the last 5 years?
mean(last_5 <= 64)
## [1] 0.2

Logical operator

# Is your grade greater than 87 and smaller than or equal to 89?
me > 87 & me <= 89
## [1] TRUE
# Which grades in your class are below 60 or above 90?
head(my_class < 60 | my_class > 90, 20)
##  [1] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [17] FALSE  TRUE FALSE  TRUE

Build aggregates(2)

# What's the proportion of grades in your class that is average?
mean(my_class >= 70 & my_class <= 85)
## [1] 0.5026455
# How many students in the last 5 years had a grade of 80 or 90?
sum(last_5 == 80 | last_5 == 90)
## [1] 86

if, else

# Choose the message from my grade, then print it once
print(if (me > 80) "Good student!" else "Better luck next year!")
## [1] "Good student!"

if, else: DIY

# Count the students in my class who scored at least 80
n_smart <- sum(80 <= my_class)

# More than 50 such students makes it a smart class
print(if (n_smart > 50) "smart class" else "rather average")
## [1] "smart class"

else if

# Share of the class that scored strictly below me
prop_less <- mean(me > my_class)

# Report the best matching band; the checks run from strictest to loosest
print(
  if (prop_less > 0.90) {
    "you're among the best 10 percent"
  } else if (prop_less > 0.80) {
    "you're among the best 20 percent"
  } else {
    "need more analysis"
  }
)
## [1] "you're among the best 20 percent"

Embed if-else clauses

# Embedded control structure: the outer test classifies the year by its
# average, the inner test compares that average with my own grade.
# local() keeps the two helper values out of the workspace.
local({
  class_avg <- mean(my_class)
  avg_beats_me <- class_avg > me

  if (class_avg < 75) {
    if (avg_beats_me) {
      print("average year, but still smarter than me")
    } else {
      print("average year, but I'm not that bad")
    }
  } else if (avg_beats_me) {
    print("smart year, even smarter than me")
  } else {
    print("smart year, but I am smarter")
  }
})
## [1] "smart year, but I am smarter"

Operations and controls expertise

# Grades of 85 and above
top_grades <- my_class[85 <= my_class]

# Grades below 65
worst_grades <- my_class[65 > my_class]

# Announce it when the top grades outnumber the worst ones
if (length(worst_grades) < length(top_grades)) print("top grades prevail")
## [1] "top grades prevail"

   


Loops

length(logs)
## [1] 96
logs[1:3]
## [[1]]
## [[1]]$success
## [1] TRUE
## 
## [[1]]$details
## [[1]]$details$message
## [1] "check"
## 
## 
## [[1]]$timestamp
## [1] "2015-09-14 19:01:07 EDT"
## 
## 
## [[2]]
## [[2]]$success
## [1] TRUE
## 
## [[2]]$details
## [[2]]$details$message
## [1] "all good"
## 
## 
## [[2]]$timestamp
## [1] "2015-09-14 20:00:13 EDT"
## 
## 
## [[3]]
## [[3]]$success
## [1] TRUE
## 
## [[3]]$details
## [[3]]$details$message
## [1] "check"
## 
## 
## [[3]]$timestamp
## [1] "2015-09-14 21:00:43 EDT"
str(logs[1:3])
## List of 3
##  $ :List of 3
##   ..$ success  : logi TRUE
##   ..$ details  :List of 1
##   .. ..$ message: chr "check"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-14 19:01:07"
##  $ :List of 3
##   ..$ success  : logi TRUE
##   ..$ details  :List of 1
##   .. ..$ message: chr "all good"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-14 20:00:13"
##  $ :List of 3
##   ..$ success  : logi TRUE
##   ..$ details  :List of 1
##   .. ..$ message: chr "check"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-14 21:00:43"

Scanning Logs in R

# Print the structure of logs
# str(logs)

# Use list subsetting to print the 11th logs entry (single brackets return a one-element list)
logs[11]
## [[1]]
## [[1]]$success
## [1] TRUE
## 
## [[1]]$details
## [[1]]$details$message
## [1] "ok"
## 
## 
## [[1]]$timestamp
## [1] "2015-09-15 04:59:54 EDT"
# Print the class of the timestamp component of the first entry
class(logs[[1]]$timestamp)
## [1] "POSIXct" "POSIXt"

Understand the logs

## You can index into a list like this
logs[[2]][[2]][[1]]
## [1] "all good"
## You can also index recursively with a vector
logs[[c(2, 2, 1)]]
## [1] "all good"
## Note that if success == FALSE then there are two elements in the details

While: start easy

# Initialize the iterator i to be 1
i <- 1

# Print the index of each entry up to (not including) the first failure.
# The bounds check ends the loop cleanly when no entry has failed; without it
# logs[[i]] would be indexed past the end of the list and raise an error.
while (i <= length(logs) && isTRUE(logs[[i]]$success)) {
  print(i)
  i <- i + 1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9

Adapt the while loop

# Adapted while loop: print the message of each entry up to the first failure.
# The bounds check keeps i inside logs when every entry succeeded.
i <- 1
while (i <= length(logs) && isTRUE(logs[[i]]$success)) {
  print(logs[[i]]$details$message)
  i <- i + 1
}
## [1] "check"
## [1] "all good"
## [1] "check"
## [1] "check"
## [1] "ok"
## [1] "all good"
## [1] "check"
## [1] "all good"
## [1] "all good"

While: different approach

  • This approach sets a value to FALSE initially, then updates it when a specific condition is met, which then stops the loop
# Initialize i and found
i <- 1
found <- FALSE

# Scan until the first failure located in "waste". The bounds check ends the
# loop once every entry has been inspected, instead of indexing past the end
# of logs when no such failure exists. identical() also copes with a failed
# entry that carries no location element.
while (!found && i <= length(logs)) {
  if (!logs[[i]]$success && identical(logs[[i]]$details$location, "waste")) {
    print("found")
    found <- TRUE
  } else {
    print("still looking")
    i <- i + 1
  }
}
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "found"

The for loop

  • I shorten this up to just the first 10 logs
# Print the timestamp of each of the first 10 log entries
invisible(lapply(logs[1:10], function(entry) print(entry$timestamp)))
## [1] "2015-09-14 19:01:07 EDT"
## [1] "2015-09-14 20:00:13 EDT"
## [1] "2015-09-14 21:00:43 EDT"
## [1] "2015-09-14 22:01:18 EDT"
## [1] "2015-09-14 22:59:59 EDT"
## [1] "2015-09-15 00:01:08 EDT"
## [1] "2015-09-15 01:03:20 EDT"
## [1] "2015-09-15 01:59:25 EDT"
## [1] "2015-09-15 02:59:29 EDT"
## [1] "2015-09-15 04:00:53 EDT"

Going through the list

# Walk the first 20 entries and print the timestamp of the successful ones
invisible(lapply(logs[1:20], function(entry) {
  if (TRUE == entry$success) {
    print(entry$timestamp)
  }
}))
## [1] "2015-09-14 19:01:07 EDT"
## [1] "2015-09-14 20:00:13 EDT"
## [1] "2015-09-14 21:00:43 EDT"
## [1] "2015-09-14 22:01:18 EDT"
## [1] "2015-09-14 22:59:59 EDT"
## [1] "2015-09-15 00:01:08 EDT"
## [1] "2015-09-15 01:03:20 EDT"
## [1] "2015-09-15 01:59:25 EDT"
## [1] "2015-09-15 02:59:29 EDT"
## [1] "2015-09-15 04:59:54 EDT"
## [1] "2015-09-15 06:00:39 EDT"
## [1] "2015-09-15 07:03:18 EDT"
## [1] "2015-09-15 08:01:49 EDT"
## [1] "2015-09-15 09:01:54 EDT"
## [1] "2015-09-15 10:02:27 EDT"
## [1] "2015-09-15 11:06:20 EDT"
## [1] "2015-09-15 12:05:48 EDT"
## [1] "2015-09-15 13:07:10 EDT"

Adapt the logs list

# Add a date element to each entry.
# The date is taken from the timestamp as it is displayed (its own time zone):
# as.Date() on a POSIXct converts to UTC by default, which moves evening EDT
# timestamps onto the following day. The sample output recorded below was
# produced with that UTC default.
# seq_along() also makes the loop a no-op for an empty logs list.
for (i in seq_along(logs)) {
  logs[[i]]$date <- as.Date(format(logs[[i]]$timestamp, "%Y-%m-%d"))
}

# Print first 3 elements in logs
head(logs, 3)
## [[1]]
## [[1]]$success
## [1] TRUE
## 
## [[1]]$details
## [[1]]$details$message
## [1] "check"
## 
## 
## [[1]]$timestamp
## [1] "2015-09-14 19:01:07 EDT"
## 
## [[1]]$date
## [1] "2015-09-14"
## 
## 
## [[2]]
## [[2]]$success
## [1] TRUE
## 
## [[2]]$details
## [[2]]$details$message
## [1] "all good"
## 
## 
## [[2]]$timestamp
## [1] "2015-09-14 20:00:13 EDT"
## 
## [[2]]$date
## [1] "2015-09-15"
## 
## 
## [[3]]
## [[3]]$success
## [1] TRUE
## 
## [[3]]$details
## [[3]]$details$message
## [1] "check"
## 
## 
## [[3]]$timestamp
## [1] "2015-09-14 21:00:43 EDT"
## 
## [[3]]$date
## [1] "2015-09-15"

Collect all failures

# Collect every failed entry in a single pass.
# Filter() keeps the entries whose success flag is FALSE without re-copying
# the result list on each hit; unname() keeps the result unnamed, as the
# append-in-a-loop version produced.
failures <- unname(Filter(function(entry) !entry$success, logs))

# Display the structure of failures
str(failures)
## List of 7
##  $ :List of 4
##   ..$ success  : logi FALSE
##   ..$ details  :List of 2
##   .. ..$ message : chr "stack overflow"
##   .. ..$ location: chr "control room"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-15 04:00:53"
##   ..$ date     : Date[1:1], format: "2015-09-15"
##  $ :List of 4
##   ..$ success  : logi FALSE
##   ..$ details  :List of 2
##   .. ..$ message : chr "segmentation fault"
##   .. ..$ location: chr "waste"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-15 14:07:58"
##   ..$ date     : Date[1:1], format: "2015-09-15"
##  $ :List of 4
##   ..$ success  : logi FALSE
##   ..$ details  :List of 2
##   .. ..$ message : chr "human error"
##   .. ..$ location: chr "reactor"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-15 17:14:16"
##   ..$ date     : Date[1:1], format: "2015-09-15"
##  $ :List of 4
##   ..$ success  : logi FALSE
##   ..$ details  :List of 2
##   .. ..$ message : chr "stack overflow"
##   .. ..$ location: chr "tubes"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-16 21:31:51"
##   ..$ date     : Date[1:1], format: "2015-09-17"
##  $ :List of 4
##   ..$ success  : logi FALSE
##   ..$ details  :List of 2
##   .. ..$ message : chr "segmentation fault"
##   .. ..$ location: chr "waste"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-17 09:33:43"
##   ..$ date     : Date[1:1], format: "2015-09-17"
##  $ :List of 4
##   ..$ success  : logi FALSE
##   ..$ details  :List of 2
##   .. ..$ message : chr "human error"
##   .. ..$ location: chr "waste"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-17 11:34:22"
##   ..$ date     : Date[1:1], format: "2015-09-17"
##  $ :List of 4
##   ..$ success  : logi FALSE
##   ..$ details  :List of 2
##   .. ..$ message : chr "human error"
##   .. ..$ location: chr "waste"
##   ..$ timestamp: POSIXct[1:1], format: "2015-09-17 23:37:18"
##   ..$ date     : Date[1:1], format: "2015-09-18"

Without Loops

  • The looping code above is good practice but normally you should not use loops in R.
  • There are much better ways to work with data.
  • I will show how to pull this list data into a dataframe to make life much easier while doing analysis.
  • This is important because you can waste a lot of time not just in execution, but in writing code if you don’t learn dplyr and other libraries that are specifically made to crunch data.
  • Loops are good for other programming routines or low level operations but not for crunching data.

Pulling the list into a clean data frame

  • Ideally the log data would be consistent and not have different elements in different situations.
  • That makes it tricky to clean this up into a dataframe
  • But it's still possible, and once the data is clean it is much easier to work with!
  • Note: I am using the pipe operator (%>%), loaded with dplyr, to make these steps read well

Convert the variable level list to a dataframe:

  • First, I convert each log into a data frame
    • I want to apply the as.data.frame function to each log record
    • I can’t name the columns here because details.location is not always present.
    • So, I will clean that up later
  • Then you can use bind_rows (from dplyr package) to combine the list of dataframes
    • If the columns are missing in some of the dataframes (and they are) that is okay, it will just fill in with NA
  • Now I will clean up the column names so message and details.message are the same column
    • I just set details.message to the value of message whenever details.message is NA
    • I wish as.data.frame would always pull in the full name path from a list as the column name
    • Unfortunately, if there is just one nested record it will use only the last name as the column name, so we get message when it is the only thing in details, or details.message when there are multiple records under details
    • This is lame but it's how it works.
  • And last thing is to remove the original message column since we have moved it into details.message
# Turn the list of log entries into one data frame.
# lapply() always returns a list of data frames; sapply() only did so because
# the entries have differing column counts, and would simplify to a matrix
# (breaking bind_rows()) if every entry had the same shape.
# stringsAsFactors = FALSE keeps the text columns as character on every R
# version, so bind_rows() never has to reconcile factor levels.
df_logs <- logs %>%
  lapply(as.data.frame, stringsAsFactors = FALSE) %>%
  bind_rows() %>%
  mutate(
    # Entries with a single detail yield a `message` column instead of
    # `details.message`; fold that value into details.message
    details.message = ifelse(is.na(details.message), message, details.message)
  ) %>%
  select(-message)

head(df_logs, 12)
##    success           timestamp       date details.message details.location
## 1     TRUE 2015-09-14 19:01:07 2015-09-14           check             <NA>
## 2     TRUE 2015-09-14 20:00:13 2015-09-15        all good             <NA>
## 3     TRUE 2015-09-14 21:00:43 2015-09-15           check             <NA>
## 4     TRUE 2015-09-14 22:01:18 2015-09-15           check             <NA>
## 5     TRUE 2015-09-14 22:59:59 2015-09-15              ok             <NA>
## 6     TRUE 2015-09-15 00:01:08 2015-09-15        all good             <NA>
## 7     TRUE 2015-09-15 01:03:20 2015-09-15           check             <NA>
## 8     TRUE 2015-09-15 01:59:25 2015-09-15        all good             <NA>
## 9     TRUE 2015-09-15 02:59:29 2015-09-15        all good             <NA>
## 10   FALSE 2015-09-15 04:00:53 2015-09-15  stack overflow     control room
## 11    TRUE 2015-09-15 04:59:54 2015-09-15              ok             <NA>
## 12    TRUE 2015-09-15 06:00:39 2015-09-15              ok             <NA>

Analysing the data

  • Now we can see all the data so much easier.
  • Really, you always want your data in a dataframe like this.
  • These few lines of code pretty much cover all the tasks we just did with the loops above.
## keep only the failed entries
filter(df_logs, !success)
##   success           timestamp       date    details.message details.location
## 1   FALSE 2015-09-15 04:00:53 2015-09-15     stack overflow     control room
## 2   FALSE 2015-09-15 14:07:58 2015-09-15 segmentation fault            waste
## 3   FALSE 2015-09-15 17:14:16 2015-09-15        human error          reactor
## 4   FALSE 2015-09-16 21:31:51 2015-09-17     stack overflow            tubes
## 5   FALSE 2015-09-17 09:33:43 2015-09-17 segmentation fault            waste
## 6   FALSE 2015-09-17 11:34:22 2015-09-17        human error            waste
## 7   FALSE 2015-09-17 23:37:18 2015-09-18        human error            waste
df_logs %>% filter(!success, details.location == "waste")
##   success           timestamp       date    details.message details.location
## 1   FALSE 2015-09-15 14:07:58 2015-09-15 segmentation fault            waste
## 2   FALSE 2015-09-17 09:33:43 2015-09-17 segmentation fault            waste
## 3   FALSE 2015-09-17 11:34:22 2015-09-17        human error            waste
## 4   FALSE 2015-09-17 23:37:18 2015-09-18        human error            waste
## add date column
## The date is taken from the timestamp as displayed; as.Date() on a POSIXct
## defaults to UTC and shifts evening EDT timestamps to the next day (the
## sample output recorded below was produced with that UTC default).
df_logs %>%
  mutate(date = as.Date(format(timestamp, "%Y-%m-%d"))) %>%
  head(10)
##    success           timestamp       date details.message details.location
## 1     TRUE 2015-09-14 19:01:07 2015-09-14           check             <NA>
## 2     TRUE 2015-09-14 20:00:13 2015-09-15        all good             <NA>
## 3     TRUE 2015-09-14 21:00:43 2015-09-15           check             <NA>
## 4     TRUE 2015-09-14 22:01:18 2015-09-15           check             <NA>
## 5     TRUE 2015-09-14 22:59:59 2015-09-15              ok             <NA>
## 6     TRUE 2015-09-15 00:01:08 2015-09-15        all good             <NA>
## 7     TRUE 2015-09-15 01:03:20 2015-09-15           check             <NA>
## 8     TRUE 2015-09-15 01:59:25 2015-09-15        all good             <NA>
## 9     TRUE 2015-09-15 02:59:29 2015-09-15        all good             <NA>
## 10   FALSE 2015-09-15 04:00:53 2015-09-15  stack overflow     control room

   


Functions

Using functions

# Call max() on timestamps
max(timestamps)
## [1] "2015-09-17 23:37:18 EDT"
# What is the date of the latest timestamp?
# Format first so the date matches the timestamp as displayed; a bare
# as.Date() on a POSIXct converts to UTC and reports the following day for
# late-evening EDT times (the sample output recorded below is that UTC result).
as.Date(format(max(timestamps), "%Y-%m-%d"))
## [1] "2015-09-18"

Optional Arguments

# Print out timestamps
print(timestamps)
## [1] "2015-09-15 04:00:53 EDT" "2015-09-15 14:07:58 EDT" "2015-09-15 17:14:16 EDT"
## [4] "2015-09-16 21:31:51 EDT" "2015-09-17 09:33:43 EDT" "2015-09-17 11:34:22 EDT"
## [7] "2015-09-17 23:37:18 EDT"
# Call max() on timestamps, no additional arguments
max(timestamps)
## [1] "2015-09-17 23:37:18 EDT"
# Call max() on timestamps, specify na.rm
# (TRUE is spelled out: T is an ordinary variable that can be reassigned)
max(timestamps, na.rm = TRUE)
## [1] "2015-09-17 23:37:18 EDT"

Extracting log information (1)

# extract_info(): walk the entries of x with a for loop and return the
# timestamp of each one, in order, as a single vector (NULL when x is empty)
extract_info <- function(x) {
  info <- NULL
  for (idx in seq_along(x)) {
    # Append this entry's timestamp to what has been collected so far
    info <- c(info, x[[idx]]$timestamp)
  }
  return(info)
}

# Call extract_info() on logs
extract_info(logs)
##  [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
##  [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469

Extracting log information (2)

# extract_info(), adapted so the caller chooses which element to collect.
#   x:        list of log entries
#   property: name of the element to pull out of every entry
# Returns the collected values as one vector (NULL when x is empty).
extract_info <- function(x, property) {
  # Fold over the entries, appending the requested element of each in turn
  Reduce(function(collected, entry) c(collected, entry[[property]]), x, NULL)
}

# Call extract_info() on logs, set property to "timestamp"
extract_info(logs, "timestamp")
##  [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
##  [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469
# Call extract_info() on logs, set property to "success"
extract_info(logs, "success")
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [17]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [33]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [65] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

Extracting log information (3)

# extract_info() with a default: property falls back to "success" when the
# caller does not name an element.
extract_info <- function(x, property = "success") {
  # First pull the requested element out of every entry ...
  pieces <- lapply(x, function(entry) entry[[property]])

  # ... then concatenate the pieces in order
  info <- NULL
  for (piece in pieces) {
    info <- c(info, piece)
  }
  info
}

# Call extract_info() on logs, don't specify property
extract_info(logs)
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [17]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [33]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [65] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# Call extract_info() on logs, set property to "timestamp"
extract_info(logs, "timestamp")
##  [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
##  [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469

Extracting log information (4)

# extract_info(): collect one element from every log entry.
#   x:           list of log entries, each with a logical `success` element
#   property:    name of the element to collect (default "success")
#   include_all: TRUE collects from every entry, FALSE only from failures
# Returns the collected values as one vector (NULL when nothing is collected).
# The default is spelled TRUE rather than T, because T is an ordinary variable
# that can be reassigned.
extract_info <- function(x, property = "success", include_all = TRUE) {
  info <- c()
  for (log in x) {
    # Keep every entry when include_all is TRUE, otherwise only the failures
    if (include_all || !log$success) {
      info <- c(info, log[[property]])
    }
  }
  return(info)
}

# Call extract_info() on logs, no additional arguments
extract_info(logs)
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [17]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [33]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [65] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# Call extract_info() on logs, set include_all to FALSE
# (FALSE is spelled out: F is an ordinary variable that can be reassigned)
extract_info(logs, include_all = FALSE)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Extracting log information (5)

# Definition of the extract_info() function.
#   x:           list of log entries
#   property:    element to collect; a character vector indexes recursively
#   include_all: TRUE collects from every entry, FALSE only from failures
extract_info <- function(x, property = "success", include_all = TRUE) {
  info <- NULL
  for (entry in x) {
    # Skip successful entries unless everything was requested
    if (!include_all && entry$success) {
      next
    }
    info <- c(info, entry[[property]])
  }
  info
}

# Generate vector of messages
extract_info(logs, property = c('details', 'message'))
##  [1] "check"              "all good"           "check"              "check"             
##  [5] "ok"                 "all good"           "check"              "all good"          
##  [9] "all good"           "stack overflow"     "ok"                 "ok"                
## [13] "all good"           "check"              "check"              "ok"                
## [17] "check"              "check"              "all good"           "segmentation fault"
## [21] "check"              "all good"           "human error"        "check"             
## [25] "ok"                 "all good"           "check"              "ok"                
## [29] "ok"                 "all good"           "ok"                 "ok"                
## [33] "ok"                 "ok"                 "check"              "all good"          
## [37] "ok"                 "ok"                 "ok"                 "ok"                
## [41] "ok"                 "ok"                 "check"              "all good"          
## [45] "all good"           "check"              "check"              "check"             
## [49] "check"              "check"              "stack overflow"     "ok"                
## [53] "ok"                 "all good"           "check"              "ok"                
## [57] "check"              "ok"                 "check"              "ok"                
## [61] "ok"                 "ok"                 "segmentation fault" "check"             
## [65] "human error"        "ok"                 "ok"                 "all good"          
## [69] "ok"                 "all good"           "check"              "check"             
## [73] "ok"                 "check"              "all good"           "ok"                
## [77] "human error"        "ok"                 "check"              "all good"          
## [81] "check"              "check"              "check"              "check"             
## [85] "check"              "ok"                 "ok"                 "all good"          
## [89] "ok"                 "all good"           "all good"           "all good"          
## [93] "all good"           "ok"                 "check"              "check"
# Generate vector of locations for failed log entries
# (FALSE is spelled out: F is an ordinary variable that can be reassigned)
extract_info(logs, property = c("details", "location"), include_all = FALSE)
## [1] "control room" "waste"        "reactor"      "tubes"        "waste"        "waste"       
## [7] "waste"

Over to you

# compute_fail_pct(): percentage of failed entries in a list of logs.
#   logs: list of log entries, each with a logical `success` element
# Returns the share of entries whose success is FALSE, on a 0-100 scale.
# An empty list yields NaN, since no percentage is defined for it.
compute_fail_pct <- function(logs) {
  # vapply() visits each entry exactly once and guarantees one logical per
  # entry; it also copes with an empty list, where 1:length(logs) would
  # count 1, 0 and index a non-existent entry.
  is_fail <- vapply(logs, function(entry) entry$success == FALSE, logical(1))

  sum(is_fail) / length(is_fail) * 100
}


# Call compute_fail_pct on logs
compute_fail_pct(logs)
## [1] 7.291667

   


The apply family

lapply refresher

# Number of elements in each log entry (first six shown)
head(lapply(logs, length))
## [[1]]
## [1] 4
## 
## [[2]]
## [1] 4
## 
## [[3]]
## [1] 4
## 
## [[4]]
## [1] 4
## 
## [[5]]
## [1] 4
## 
## [[6]]
## [1] 4
# Class of each log entry (first six shown)
head(lapply(logs, class))
## [[1]]
## [1] "list"
## 
## [[2]]
## [1] "list"
## 
## [[3]]
## [1] "list"
## 
## [[4]]
## [1] "list"
## 
## [[5]]
## [1] "list"
## 
## [[6]]
## [1] "list"

lapply on logs (1)

# get_timestamp(): pull the timestamp element out of a single log entry
get_timestamp <- function(x) {
  x$timestamp
}

# Apply get_timestamp() over all elements in logs
lapply(logs, get_timestamp) %>% head()
## [[1]]
## [1] "2015-09-14 19:01:07 EDT"
## 
## [[2]]
## [1] "2015-09-14 20:00:13 EDT"
## 
## [[3]]
## [1] "2015-09-14 21:00:43 EDT"
## 
## [[4]]
## [1] "2015-09-14 22:01:18 EDT"
## 
## [[5]]
## [1] "2015-09-14 22:59:59 EDT"
## 
## [[6]]
## [1] "2015-09-15 00:01:08 EDT"

lapply on logs (2)

# Extract every timestamp with an anonymous function (first six shown)
head(lapply(logs, function(entry) entry$timestamp))
## [[1]]
## [1] "2015-09-14 19:01:07 EDT"
## 
## [[2]]
## [1] "2015-09-14 20:00:13 EDT"
## 
## [[3]]
## [1] "2015-09-14 21:00:43 EDT"
## 
## [[4]]
## [1] "2015-09-14 22:01:18 EDT"
## 
## [[5]]
## [1] "2015-09-14 22:59:59 EDT"
## 
## [[6]]
## [1] "2015-09-15 00:01:08 EDT"

lapply on logs (3)

# Same extraction, passing the `[[` operator itself as the function
head(lapply(logs, "[[", "timestamp"))
## [[1]]
## [1] "2015-09-14 19:01:07 EDT"
## 
## [[2]]
## [1] "2015-09-14 20:00:13 EDT"
## 
## [[3]]
## [1] "2015-09-14 21:00:43 EDT"
## 
## [[4]]
## [1] "2015-09-14 22:01:18 EDT"
## 
## [[5]]
## [1] "2015-09-14 22:59:59 EDT"
## 
## [[6]]
## [1] "2015-09-15 00:01:08 EDT"

sapply refresher

# Call length() on each element of logs using sapply()
sapply(logs, length)
##  [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [49] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
# get_timestamp(): return the timestamp stored in one log entry
get_timestamp <- function(x) {
  stamp <- x$timestamp
  stamp
}

# Get vector of log entries' timestamps
sapply(logs, get_timestamp)
##  [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
##  [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469

sapply on logs (1)

# Use sapply() to select the success element from each log: results
results <- sapply(logs, `[[`, 'success')

# Call mean() on results
mean(results)
## [1] 0.9270833
# Use sapply() to select the details element from each log
sapply(logs, `[[`, 'details') %>% head()
## [[1]]
## [[1]]$message
## [1] "check"
## 
## 
## [[2]]
## [[2]]$message
## [1] "all good"
## 
## 
## [[3]]
## [[3]]$message
## [1] "check"
## 
## 
## [[4]]
## [[4]]$message
## [1] "check"
## 
## 
## [[5]]
## [[5]]$message
## [1] "ok"
## 
## 
## [[6]]
## [[6]]$message
## [1] "all good"

sapply on logs (2)

# Implement function get_failure_loc
#
# Returns the failure location recorded in a single log entry.
#   x: one log entry, a list with a logical `success` element and a
#      `details` list (failed entries are expected to carry
#      `details$location`).
# Returns NULL for a successful entry, otherwise x$details$location
# (which is NULL when no location was recorded).
get_failure_loc <- function(x) {
  # isTRUE() rather than `== T`: `T` is an ordinary variable that can be
  # reassigned, and isTRUE() does not error on NA or a missing element;
  # anything other than a single TRUE is treated as a failure.
  if (isTRUE(x$success)) {
    return(NULL)
  }
  x$details$location
}

# Use sapply() to call get_failure_loc on logs
sapply(logs, get_failure_loc) %>% head(11)
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## [1] "control room"
## 
## [[11]]
## NULL

vapply refresher

# Convert the sapply call to vapply
vapply(logs, length, integer(1))
##  [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [49] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
# Convert the sapply call to vapply
vapply(logs, `[[`, "success", FUN.VALUE = logical(1))
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [17]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [33]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [65] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

vapply on logs (1)

# Convert the sapply() call to a vapply() or lapply() call
vapply(logs, `[[`, c("details", "message"), FUN.VALUE=character(1))
##  [1] "check"              "all good"           "check"              "check"             
##  [5] "ok"                 "all good"           "check"              "all good"          
##  [9] "all good"           "stack overflow"     "ok"                 "ok"                
## [13] "all good"           "check"              "check"              "ok"                
## [17] "check"              "check"              "all good"           "segmentation fault"
## [21] "check"              "all good"           "human error"        "check"             
## [25] "ok"                 "all good"           "check"              "ok"                
## [29] "ok"                 "all good"           "ok"                 "ok"                
## [33] "ok"                 "ok"                 "check"              "all good"          
## [37] "ok"                 "ok"                 "ok"                 "ok"                
## [41] "ok"                 "ok"                 "check"              "all good"          
## [45] "all good"           "check"              "check"              "check"             
## [49] "check"              "check"              "stack overflow"     "ok"                
## [53] "ok"                 "all good"           "check"              "ok"                
## [57] "check"              "ok"                 "check"              "ok"                
## [61] "ok"                 "ok"                 "segmentation fault" "check"             
## [65] "human error"        "ok"                 "ok"                 "all good"          
## [69] "ok"                 "all good"           "check"              "check"             
## [73] "ok"                 "check"              "all good"           "ok"                
## [77] "human error"        "ok"                 "check"              "all good"          
## [81] "check"              "check"              "check"              "check"             
## [85] "check"              "ok"                 "ok"                 "all good"          
## [89] "ok"                 "all good"           "all good"           "all good"          
## [93] "all good"           "ok"                 "check"              "check"
# Convert the sapply() call to a vapply() or lapply() call
lapply(logs, function(x) { x$details }) %>% head()
## [[1]]
## [[1]]$message
## [1] "check"
## 
## 
## [[2]]
## [[2]]$message
## [1] "all good"
## 
## 
## [[3]]
## [[3]]$message
## [1] "check"
## 
## 
## [[4]]
## [[4]]$message
## [1] "check"
## 
## 
## [[5]]
## [[5]]$message
## [1] "ok"
## 
## 
## [[6]]
## [[6]]$message
## [1] "all good"

loop it the way you want it

# Return vector with uppercase version of message elements in log entries
toupper(sapply(logs, `[[`, c('details','message')))
##  [1] "CHECK"              "ALL GOOD"           "CHECK"              "CHECK"             
##  [5] "OK"                 "ALL GOOD"           "CHECK"              "ALL GOOD"          
##  [9] "ALL GOOD"           "STACK OVERFLOW"     "OK"                 "OK"                
## [13] "ALL GOOD"           "CHECK"              "CHECK"              "OK"                
## [17] "CHECK"              "CHECK"              "ALL GOOD"           "SEGMENTATION FAULT"
## [21] "CHECK"              "ALL GOOD"           "HUMAN ERROR"        "CHECK"             
## [25] "OK"                 "ALL GOOD"           "CHECK"              "OK"                
## [29] "OK"                 "ALL GOOD"           "OK"                 "OK"                
## [33] "OK"                 "OK"                 "CHECK"              "ALL GOOD"          
## [37] "OK"                 "OK"                 "OK"                 "OK"                
## [41] "OK"                 "OK"                 "CHECK"              "ALL GOOD"          
## [45] "ALL GOOD"           "CHECK"              "CHECK"              "CHECK"             
## [49] "CHECK"              "CHECK"              "STACK OVERFLOW"     "OK"                
## [53] "OK"                 "ALL GOOD"           "CHECK"              "OK"                
## [57] "CHECK"              "OK"                 "CHECK"              "OK"                
## [61] "OK"                 "OK"                 "SEGMENTATION FAULT" "CHECK"             
## [65] "HUMAN ERROR"        "OK"                 "OK"                 "ALL GOOD"          
## [69] "OK"                 "ALL GOOD"           "CHECK"              "CHECK"             
## [73] "OK"                 "CHECK"              "ALL GOOD"           "OK"                
## [77] "HUMAN ERROR"        "OK"                 "CHECK"              "ALL GOOD"          
## [81] "CHECK"              "CHECK"              "CHECK"              "CHECK"             
## [85] "CHECK"              "OK"                 "OK"                 "ALL GOOD"          
## [89] "OK"                 "ALL GOOD"           "ALL GOOD"           "ALL GOOD"          
## [93] "ALL GOOD"           "OK"                 "CHECK"              "CHECK"

   


Utilities

Titanic

# Call dim on titanic
dim(titanic)
## [1] 891  12
# Generate histogram of Age column
hist(titanic$Age)

Exploratory queries

# Have a look at the structure of titanic
# (the total value of fares is printed with sum() right after this output)
str(titanic)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 354 273 16 555 516 625 413 577 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
sum(titanic$Fare)
## [1] 28693.95
# Print out proportion of passengers that survived
mean(titanic$Survived)
## [1] 0.3838384

Infer gender from names(1)

# Extract the name column from titanic
pass_names <- titanic$Name

# Create the logical vector is_man
is_man <- grepl(", Mr\\.", pass_names)

# Count the number of men
sum(is_man)
## [1] 517
# Count number of men based on gender
sum(titanic$Sex == "male")
## [1] 577

Infer gender from names(2)

# Extract the name column from titanic
pass_names <- titanic$Name

# Create titles
titles <- gsub("^.*, (.*?)\\..*$", "\\1", pass_names)

# Call unique() on titles
unique(titles)
##  [1] "Mr"           "Mrs"          "Miss"         "Master"       "Don"          "Rev"         
##  [7] "Dr"           "Mme"          "Ms"           "Major"        "Lady"         "Sir"         
## [13] "Mlle"         "Col"          "Capt"         "the Countess" "Jonkheer"

Infer gender from names(3)

pass_names <- titanic$Name
titles_male <- paste(",", c("Mr\\.", "Master", "Don", "Rev", "Dr\\.", "Major", "Sir", "Col", "Capt", "Jonkheer"))

# Finish the vapply() command
hits <- vapply(titles_male,
               FUN = grepl,
               FUN.VALUE = logical(length(pass_names)),
               pass_names)

# Calculate the sum() of hits
sum(hits)
## [1] 578
# Count number of men based on gender
sum(titanic$Sex == "male")
## [1] 577

Why is it off by one?

  • They don’t really address this in the class
  • But one function I use a lot is the table function
  • It easily shows what's off with the title assumptions and the gender
    • There is a female doctor on the boat but they are counting Dr as a male title
    • Definitely need to be careful with things like this. Other titles could go either way.
table(titles, titanic$Sex)
##               
## titles         female male
##   Capt              0    1
##   Col               0    2
##   Don               0    1
##   Dr                1    6
##   Jonkheer          0    1
##   Lady              1    0
##   Major             0    2
##   Master            0   40
##   Miss            182    0
##   Mlle              2    0
##   Mme               1    0
##   Mr                0  517
##   Mrs             125    0
##   Ms                1    0
##   Rev               0    6
##   Sir               0    1
##   the Countess      1    0

Reformat passenger names

  • this is some great regex
# pass_names is available in your workspace

# Reformat one passenger name.
#
#   name: a single passenger name such as "Braund, Mr. Owen Harris"
#         (a character string, or one element of the Name column).
#
# If the name contains a parenthesised part, the result is the text inside
# the parentheses; otherwise "Surname, Title. Given Names" is rearranged to
# "Given Names Surname". A name matching neither pattern is returned as is.
convert_name <- function(name) {
  # women: take name from inside parentheses
  if (grepl("\\(.*?\\)", name)) {
    # `\\s*$` tolerates whitespace after the closing parenthesis. With a bare
    # `\\)$` a name like "Hewlett, Mrs. (Mary D Kingcome) " (note the trailing
    # space) does not match and comes back unconverted.
    gsub("^.*?\\((.*?)\\)\\s*$", "\\1", name)
  # men: take name before comma and after title
  } else {
    # \\1 = surname (before the comma), \\2 = everything after the title
    gsub("^(.*?),\\s[a-zA-Z\\.]*?\\s(.*?)$", "\\2 \\1", name)
  }
}

# Call convert_name on name
clean_pass_names <- vapply(pass_names, FUN = convert_name,
                           FUN.VALUE = character(1), USE.NAMES = FALSE)

# Print out clean_pass_names
head(clean_pass_names, 20)
##  [1] "Owen Harris Braund"               "Florence Briggs Thayer"          
##  [3] "Laina Heikkinen"                  "Lily May Peel"                   
##  [5] "William Henry Allen"              "James Moran"                     
##  [7] "Timothy J McCarthy"               "Gosta Leonard Palsson"           
##  [9] "Elisabeth Vilhelmina Berg"        "Adele Achem"                     
## [11] "Marguerite Rut Sandstrom"         "Elizabeth Bonnell"               
## [13] "William Henry Saundercock"        "Anders Johan Andersson"          
## [15] "Hulda Amanda Adolfina Vestrom"    "Hewlett, Mrs. (Mary D Kingcome) "
## [17] "Eugene Rice"                      "Charles Eugene Williams"         
## [19] "Emelia Maria Vandemoortele"       "Fatima Masselmani"

Add birth dates

  • I’m adding the dob data manually

  • the strptime function documentation has all the codes for matching dates

# Have a look at head() of dob1 and dob2
head(dob1)
## [1] "1890-01-03" "1874-02-28" "1886-01-17" "1877-03-04" "1876-09-28" NA
head(dob2)
## [1] "November 26, 1872"  "September 09, 1885" "September 09, 1890" "April 01, 1884"    
## [5] "August 08, 1891"    "April 20, 1877"
# Convert dob1 to dob1d, convert dob2 to dob2d
# dob1 holds ISO-style dates ("1890-01-03"); dob2 spells the month out in full
# ("November 26, 1872"), so it is parsed with %B (full month name), matching
# the format used in the "Average age" chunk. Month names are matched in the
# current locale.
dob1d <- as.Date(dob1, format = "%Y-%m-%d")
dob2d <- as.Date(dob2, format = "%B %d, %Y")

# Combine dob1d and dob2d into single vector: birth_dates
birth_dates <- c(dob1d, dob2d)

Average age

  • I have not used subset in a while. Just use dplyr::filter now
# titanic, dob1 and dob2 are preloaded
dob1d <- as.Date(dob1)
dob2d <- as.Date(dob2, format = "%B %d, %Y")
birth_dates <- c(dob1d, dob2d)
disaster_date <- as.Date("1912-04-15")

# Add birth_dates to titanic (column Birth)
titanic$Birth <- birth_dates
str(titanic)
## 'data.frame':    891 obs. of  13 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 354 273 16 555 516 625 413 577 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
##  $ Birth      : Date, format: "1890-01-03" "1874-02-28" "1886-01-17" ...
# Create subset: survivors
survivors <- subset(titanic, Survived == TRUE)
str(survivors)
## 'data.frame':    342 obs. of  13 variables:
##  $ PassengerId: int  2 3 4 9 10 11 12 16 18 20 ...
##  $ Survived   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Pclass     : int  1 3 1 3 2 3 1 2 2 3 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 191 354 273 413 577 728 96 360 868 513 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 2 1 ...
##  $ Age        : num  38 26 35 27 14 4 58 55 NA NA ...
##  $ SibSp      : int  1 0 1 0 1 1 0 0 0 0 ...
##  $ Parch      : int  0 0 0 2 0 1 0 0 0 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 597 670 50 345 133 617 39 154 152 185 ...
##  $ Fare       : num  71.28 7.92 53.1 11.13 30.07 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 83 1 57 1 1 147 51 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 2 4 4 4 2 4 4 4 4 2 ...
##  $ Birth      : Date, format: "1874-02-28" "1886-01-17" "1877-03-04" ...
# Calculate average age of survivors
# Subtracting Dates gives a difftime, so the mean is reported in days.
# na.rm = TRUE (spelled out; `T` can be reassigned) skips missing birth dates.
mean(disaster_date - survivors$Birth, na.rm = TRUE)
## Time difference of 10532.41 days

Age in perspective

## How old was the youngest survivor when Apollo 11 landed on the moon on July 20, 1969?

# The smallest date difference belongs to the most recently born survivor;
# na.rm = TRUE (spelled out; `T` can be reassigned) skips missing birth dates.
min(as.Date("1969-07-20") - survivors$Birth, na.rm = TRUE)
## Time difference of 21285 days

Conclusion