This document was updated on 2017-07-04.
library(dplyr)
==
<
>
&&
||
!
if
else
## my grade in the class (hypothetically)
me
## [1] 89
## the other students grades
glimpse(other_199)
## num [1:188] 66 81 77 90 95 54 78 90 71 96 ...
## grades for students over the past 4 years
glimpse(previous_4)
## int [1:200, 1:4] 82 66 66 86 76 74 56 80 71 69 ...
head(previous_4)
## [,1] [,2] [,3] [,4]
## [1,] 82 89 53 70
## [2,] 66 89 66 76
## [3,] 66 65 85 73
## [4,] 86 79 65 64
## [5,] 76 83 75 56
## [6,] 74 73 69 71
# Merge me and other_199: my_class
my_class <- c(me, other_199)
head(my_class)
## [1] 89 66 81 77 90 95
# cbind() my_class and previous_4: last_5
last_5 <- cbind(my_class, previous_4)
head(last_5)
## my_class
## [1,] 89 82 89 53 70
## [2,] 66 66 89 66 76
## [3,] 81 66 65 85 73
## [4,] 77 86 79 65 64
## [5,] 90 76 83 75 56
## [6,] 95 74 73 69 71
# Name last_5 appropriately
nms <- paste0("year_", 1:5)
colnames(last_5) <- nms
head(last_5)
## year_1 year_2 year_3 year_4 year_5
## [1,] 89 82 89 53 70
## [2,] 66 66 89 66 76
## [3,] 81 66 65 85 73
## [4,] 77 86 79 65 64
## [5,] 90 76 83 75 56
## [6,] 95 74 73 69 71
# Build histogram of my_class
hist(my_class)
# Generate summary of last_5
summary(last_5)
## year_1 year_2 year_3 year_4 year_5
## Min. : 50.00 Min. : 50.00 Min. : 50.00 Min. : 50.00 Min. : 50.00
## 1st Qu.: 68.00 1st Qu.: 63.00 1st Qu.: 70.00 1st Qu.: 66.00 1st Qu.: 70.00
## Median : 75.50 Median : 71.50 Median : 77.00 Median : 73.50 Median : 78.00
## Mean : 75.78 Mean : 72.28 Mean : 76.24 Mean : 74.50 Mean : 77.69
## 3rd Qu.: 83.25 3rd Qu.: 80.00 3rd Qu.: 84.00 3rd Qu.: 82.25 3rd Qu.: 88.00
## Max. :100.00 Max. :100.00 Max. :100.00 Max. :100.00 Max. :100.00
# Build boxplot of last_5
boxplot(last_5)
# Is your grade equal to 72?
me == 72
## [1] FALSE
# Which grades in your class are higher than 75?
head(my_class > 75, 10)
## [1] TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
# Which grades in the last 5 years are below or equal to 64?
head(last_5 <= 64, 10)
## year_1 year_2 year_3 year_4 year_5
## [1,] FALSE FALSE FALSE TRUE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE TRUE
## [5,] FALSE FALSE FALSE FALSE TRUE
## [6,] FALSE FALSE FALSE FALSE FALSE
## [7,] TRUE TRUE FALSE FALSE FALSE
## [8,] FALSE FALSE FALSE FALSE FALSE
## [9,] FALSE FALSE TRUE FALSE FALSE
## [10,] FALSE FALSE FALSE FALSE FALSE
# How many grades in your class are higher than 75?
sum(my_class > 75)
## [1] 92
# How many students in your class scored strictly higher than you?
sum(my_class > me)
## [1] 26
# What's the proportion of grades below or equal to 64 in the last 5 years?
mean(last_5 <= 64)
## [1] 0.2
# Is your grade greater than 87 and smaller than or equal to 89?
me > 87 & me <= 89
## [1] TRUE
# Which grades in your class are below 60 or above 90?
head(my_class < 60 | my_class > 90, 20)
## [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [17] FALSE TRUE FALSE TRUE
# What's the proportion of grades in your class that is average?
mean(my_class >= 70 & my_class <= 85)
## [1] 0.5026455
# How many students in the last 5 years had a grade of 80 or 90?
sum(last_5 == 80 | last_5 == 90)
## [1] 86
if (me > 80) {
print("Good student!")
} else {
print("Better luck next year!")
}
## [1] "Good student!"
# Define n_smart
n_smart <- sum(my_class >= 80)
# Code the if-else construct
if (n_smart > 50) {
print("smart class")
} else {
print("rather average")
}
## [1] "smart class"
# Define prop_less
prop_less <- mean(my_class < me)
# Code the control construct
if(prop_less > .90) {
print("you're among the best 10 percent")
} else if (prop_less > .80) {
print("you're among the best 20 percent")
} else {
print("need more analysis")
}
## [1] "you're among the best 20 percent"
# Embedded control structure: fix the error
if (mean(my_class) < 75) {
if (mean(my_class) > me) {
print("average year, but still smarter than me")
} else {
print("average year, but I'm not that bad")
}
} else {
if (mean(my_class) > me) {
print("smart year, even smarter than me")
} else {
print("smart year, but I am smarter")
}
}
## [1] "smart year, but I am smarter"
# Create top_grades
top_grades <- my_class[my_class >= 85]
# Create worst_grades
worst_grades <- my_class[my_class < 65]
# Write conditional statement
if (length(top_grades) > length(worst_grades)) {
print("top grades prevail")
}
## [1] "top grades prevail"
I have loaded the data into the notebook manually
str()
of logs will print the structure of all entries. I shorten it here to save space.
length(logs)
## [1] 96
logs[1:3]
## [[1]]
## [[1]]$success
## [1] TRUE
##
## [[1]]$details
## [[1]]$details$message
## [1] "check"
##
##
## [[1]]$timestamp
## [1] "2015-09-14 19:01:07 EDT"
##
##
## [[2]]
## [[2]]$success
## [1] TRUE
##
## [[2]]$details
## [[2]]$details$message
## [1] "all good"
##
##
## [[2]]$timestamp
## [1] "2015-09-14 20:00:13 EDT"
##
##
## [[3]]
## [[3]]$success
## [1] TRUE
##
## [[3]]$details
## [[3]]$details$message
## [1] "check"
##
##
## [[3]]$timestamp
## [1] "2015-09-14 21:00:43 EDT"
str(logs[1:3])
## List of 3
## $ :List of 3
## ..$ success : logi TRUE
## ..$ details :List of 1
## .. ..$ message: chr "check"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-14 19:01:07"
## $ :List of 3
## ..$ success : logi TRUE
## ..$ details :List of 1
## .. ..$ message: chr "all good"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-14 20:00:13"
## $ :List of 3
## ..$ success : logi TRUE
## ..$ details :List of 1
## .. ..$ message: chr "check"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-14 21:00:43"
# Print the structure of logs
# str(logs)
# Use list subsetting to print the details part of 11th logs entry
logs[11]
## [[1]]
## [[1]]$success
## [1] TRUE
##
## [[1]]$details
## [[1]]$details$message
## [1] "ok"
##
##
## [[1]]$timestamp
## [1] "2015-09-15 04:59:54 EDT"
# Print the class of the timestamp component of the first entry
class(logs[[1]]$timestamp)
## [1] "POSIXct" "POSIXt"
## You can index into a list like this
logs[[2]][[2]][[1]]
## [1] "all good"
## You can also use an array to index
logs[[c(2, 2, 1)]]
## [1] "all good"
## Note that if success == FALSE then there are two elements in the details
# Initialize the iterator i to be 1
i <- 1
# Print the index of every entry that precedes the first failure.
# The bounds check ends the loop cleanly when no entry fails, instead of
# indexing past the end of `logs` ("subscript out of bounds").
while (i <= length(logs) && logs[[i]]$success == TRUE) {
  print(i)
  i <- i + 1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
# Adapt the while loop: print the message of every entry that precedes the
# first failure. The bounds check keeps the loop from running off the end of
# `logs` when every entry succeeded.
i <- 1
while (i <= length(logs) && logs[[i]]$success) {
  print(logs[[i]]$details$message)
  i <- i + 1
}
## [1] "check"
## [1] "all good"
## [1] "check"
## [1] "check"
## [1] "ok"
## [1] "all good"
## [1] "check"
## [1] "all good"
## [1] "all good"
# Initialize i and found
i <- 1
found <- FALSE
# Scan `logs` in order until the first failed entry located in "waste".
# `&&` short-circuits, so details$location is only read for failed entries
# (per the note above, failed entries carry two elements in details).
# The bounds check stops the scan when no such entry exists; `found` then
# stays FALSE instead of the loop erroring on an out-of-range index.
while (!found && i <= length(logs)) {
  if (logs[[i]]$success == FALSE && logs[[i]]$details$location == "waste") {
    print("found")
    found <- TRUE
  } else {
    print("still looking")
    i <- i + 1
  }
}
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "still looking"
## [1] "found"
# Code a for loop that prints the timestamp of each log
for (log in logs[1:10]) {
print(log$timestamp)
}
## [1] "2015-09-14 19:01:07 EDT"
## [1] "2015-09-14 20:00:13 EDT"
## [1] "2015-09-14 21:00:43 EDT"
## [1] "2015-09-14 22:01:18 EDT"
## [1] "2015-09-14 22:59:59 EDT"
## [1] "2015-09-15 00:01:08 EDT"
## [1] "2015-09-15 01:03:20 EDT"
## [1] "2015-09-15 01:59:25 EDT"
## [1] "2015-09-15 02:59:29 EDT"
## [1] "2015-09-15 04:00:53 EDT"
# Make the printout conditional: only if success
for (log in logs[1:20]) {
if (log$success == TRUE) {
print(log$timestamp)
}
}
## [1] "2015-09-14 19:01:07 EDT"
## [1] "2015-09-14 20:00:13 EDT"
## [1] "2015-09-14 21:00:43 EDT"
## [1] "2015-09-14 22:01:18 EDT"
## [1] "2015-09-14 22:59:59 EDT"
## [1] "2015-09-15 00:01:08 EDT"
## [1] "2015-09-15 01:03:20 EDT"
## [1] "2015-09-15 01:59:25 EDT"
## [1] "2015-09-15 02:59:29 EDT"
## [1] "2015-09-15 04:59:54 EDT"
## [1] "2015-09-15 06:00:39 EDT"
## [1] "2015-09-15 07:03:18 EDT"
## [1] "2015-09-15 08:01:49 EDT"
## [1] "2015-09-15 09:01:54 EDT"
## [1] "2015-09-15 10:02:27 EDT"
## [1] "2015-09-15 11:06:20 EDT"
## [1] "2015-09-15 12:05:48 EDT"
## [1] "2015-09-15 13:07:10 EDT"
# Finish the for loop: add date element for each entry.
# seq_along() yields an empty sequence for an empty list, whereas
# 1:length(logs) would iterate over c(1, 0) and fail.
for (i in seq_along(logs)) {
  # NOTE(review): the printed output shows "2015-09-14 20:00:13 EDT" mapped to
  # 2015-09-15, which suggests as.Date() converts in UTC here; pass `tz =` if
  # local calendar dates are intended -- confirm.
  logs[[i]]$date <- as.Date(logs[[i]]$timestamp)
}
# Print first 3 elements in logs
head(logs, 3)
## [[1]]
## [[1]]$success
## [1] TRUE
##
## [[1]]$details
## [[1]]$details$message
## [1] "check"
##
##
## [[1]]$timestamp
## [1] "2015-09-14 19:01:07 EDT"
##
## [[1]]$date
## [1] "2015-09-14"
##
##
## [[2]]
## [[2]]$success
## [1] TRUE
##
## [[2]]$details
## [[2]]$details$message
## [1] "all good"
##
##
## [[2]]$timestamp
## [1] "2015-09-14 20:00:13 EDT"
##
## [[2]]$date
## [1] "2015-09-15"
##
##
## [[3]]
## [[3]]$success
## [1] TRUE
##
## [[3]]$details
## [[3]]$details$message
## [1] "check"
##
##
## [[3]]$timestamp
## [1] "2015-09-14 21:00:43 EDT"
##
## [[3]]$date
## [1] "2015-09-15"
# Initialize failures: the list of every log entry whose success flag is FALSE.
# Filter() selects the entries in one pass, so the result list is not
# re-copied on every append and no global loop variable named `log` is left
# behind to shadow base::log(). unname() keeps the result unnamed, matching
# what repeated c(failures, list(entry)) would have produced.
failures <- unname(Filter(function(entry) entry$success == FALSE, logs))
# Display the structure of failures
str(failures)
## List of 7
## $ :List of 4
## ..$ success : logi FALSE
## ..$ details :List of 2
## .. ..$ message : chr "stack overflow"
## .. ..$ location: chr "control room"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-15 04:00:53"
## ..$ date : Date[1:1], format: "2015-09-15"
## $ :List of 4
## ..$ success : logi FALSE
## ..$ details :List of 2
## .. ..$ message : chr "segmentation fault"
## .. ..$ location: chr "waste"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-15 14:07:58"
## ..$ date : Date[1:1], format: "2015-09-15"
## $ :List of 4
## ..$ success : logi FALSE
## ..$ details :List of 2
## .. ..$ message : chr "human error"
## .. ..$ location: chr "reactor"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-15 17:14:16"
## ..$ date : Date[1:1], format: "2015-09-15"
## $ :List of 4
## ..$ success : logi FALSE
## ..$ details :List of 2
## .. ..$ message : chr "stack overflow"
## .. ..$ location: chr "tubes"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-16 21:31:51"
## ..$ date : Date[1:1], format: "2015-09-17"
## $ :List of 4
## ..$ success : logi FALSE
## ..$ details :List of 2
## .. ..$ message : chr "segmentation fault"
## .. ..$ location: chr "waste"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-17 09:33:43"
## ..$ date : Date[1:1], format: "2015-09-17"
## $ :List of 4
## ..$ success : logi FALSE
## ..$ details :List of 2
## .. ..$ message : chr "human error"
## .. ..$ location: chr "waste"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-17 11:34:22"
## ..$ date : Date[1:1], format: "2015-09-17"
## $ :List of 4
## ..$ success : logi FALSE
## ..$ details :List of 2
## .. ..$ message : chr "human error"
## .. ..$ location: chr "waste"
## ..$ timestamp: POSIXct[1:1], format: "2015-09-17 23:37:18"
## ..$ date : Date[1:1], format: "2015-09-18"
dplyr
and other libraries that are specifically made to crunch data.
Convert the variable-level list to a data frame:
- Apply the as.data.frame function to each log record. Note that details.location is not always present.
- Use bind_rows (from the dplyr package) to combine the list of data frames.
- message and details.message are the same column.
- Set details.message to be message when it's NA. as.data.frame does not always pull in the full name path from a list as the column name: it yields message when it's the only thing in details, or details.message when there are multiple records under details.
- Drop the message column since we have moved it into details.message.
# Flatten every log entry into a one-row data frame, stack the rows, and fold
# the two spellings of the message column into details.message.
# lapply() always returns a list of data frames. sapply() only does so when
# the entries differ in field count; if every entry had the same number of
# fields it would simplify the result to a matrix instead.
df_logs <- logs %>%
  lapply(as.data.frame) %>%
  bind_rows() %>%
  mutate(
    # Entries with a single details field expose it as `message`; copy it over.
    details.message = ifelse(is.na(details.message), message, details.message)
  ) %>%
  select(-message)
head(df_logs, 12)
## success timestamp date details.message details.location
## 1 TRUE 2015-09-14 19:01:07 2015-09-14 check <NA>
## 2 TRUE 2015-09-14 20:00:13 2015-09-15 all good <NA>
## 3 TRUE 2015-09-14 21:00:43 2015-09-15 check <NA>
## 4 TRUE 2015-09-14 22:01:18 2015-09-15 check <NA>
## 5 TRUE 2015-09-14 22:59:59 2015-09-15 ok <NA>
## 6 TRUE 2015-09-15 00:01:08 2015-09-15 all good <NA>
## 7 TRUE 2015-09-15 01:03:20 2015-09-15 check <NA>
## 8 TRUE 2015-09-15 01:59:25 2015-09-15 all good <NA>
## 9 TRUE 2015-09-15 02:59:29 2015-09-15 all good <NA>
## 10 FALSE 2015-09-15 04:00:53 2015-09-15 stack overflow control room
## 11 TRUE 2015-09-15 04:59:54 2015-09-15 ok <NA>
## 12 TRUE 2015-09-15 06:00:39 2015-09-15 ok <NA>
## subset the data
df_logs %>% filter(success==FALSE)
## success timestamp date details.message details.location
## 1 FALSE 2015-09-15 04:00:53 2015-09-15 stack overflow control room
## 2 FALSE 2015-09-15 14:07:58 2015-09-15 segmentation fault waste
## 3 FALSE 2015-09-15 17:14:16 2015-09-15 human error reactor
## 4 FALSE 2015-09-16 21:31:51 2015-09-17 stack overflow tubes
## 5 FALSE 2015-09-17 09:33:43 2015-09-17 segmentation fault waste
## 6 FALSE 2015-09-17 11:34:22 2015-09-17 human error waste
## 7 FALSE 2015-09-17 23:37:18 2015-09-18 human error waste
df_logs %>% filter(success==FALSE & details.location == 'waste')
## success timestamp date details.message details.location
## 1 FALSE 2015-09-15 14:07:58 2015-09-15 segmentation fault waste
## 2 FALSE 2015-09-17 09:33:43 2015-09-17 segmentation fault waste
## 3 FALSE 2015-09-17 11:34:22 2015-09-17 human error waste
## 4 FALSE 2015-09-17 23:37:18 2015-09-18 human error waste
## add date column
df_logs %>% mutate(date = as.Date(timestamp)) %>% head(10)
## success timestamp date details.message details.location
## 1 TRUE 2015-09-14 19:01:07 2015-09-14 check <NA>
## 2 TRUE 2015-09-14 20:00:13 2015-09-15 all good <NA>
## 3 TRUE 2015-09-14 21:00:43 2015-09-15 check <NA>
## 4 TRUE 2015-09-14 22:01:18 2015-09-15 check <NA>
## 5 TRUE 2015-09-14 22:59:59 2015-09-15 ok <NA>
## 6 TRUE 2015-09-15 00:01:08 2015-09-15 all good <NA>
## 7 TRUE 2015-09-15 01:03:20 2015-09-15 check <NA>
## 8 TRUE 2015-09-15 01:59:25 2015-09-15 all good <NA>
## 9 TRUE 2015-09-15 02:59:29 2015-09-15 all good <NA>
## 10 FALSE 2015-09-15 04:00:53 2015-09-15 stack overflow control room
timestamps
data into the notebook manually
# Call max() on timestamps
max(timestamps)
## [1] "2015-09-17 23:37:18 EDT"
# What is the date of the latest timestamp?
as.Date(max(timestamps))
## [1] "2015-09-18"
# Print out timestamps
print(timestamps)
## [1] "2015-09-15 04:00:53 EDT" "2015-09-15 14:07:58 EDT" "2015-09-15 17:14:16 EDT"
## [4] "2015-09-16 21:31:51 EDT" "2015-09-17 09:33:43 EDT" "2015-09-17 11:34:22 EDT"
## [7] "2015-09-17 23:37:18 EDT"
# Call max() on timestamps, no additional arguments
max(timestamps)
## [1] "2015-09-17 23:37:18 EDT"
# Call max() on timestamps, specify na.rm
max(timestamps, na.rm=T)
## [1] "2015-09-17 23:37:18 EDT"
# Build a function extract_info(): use for loop, add return statement
# Collect the `timestamp` element of every entry in `x` into one vector.
# Values are appended with c() onto a NULL seed, so class attributes such as
# POSIXct are dropped and the timestamps come back as plain numbers.
# Returns NULL when `x` is empty.
extract_info <- function(x) {
  Reduce(function(acc, entry) c(acc, entry$timestamp), x, NULL)
}
# Call extract_info() on logs
extract_info(logs)
## [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
## [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469
# Adapt the extract_info() function.
# Collect one property from every entry in `x`.
#   x        : list of log entries (each itself a list).
#   property : name passed to `[[`; a character vector of length > 1 indexes
#              recursively, e.g. c("details", "message").
# Returns the values combined with c(), or NULL when `x` is empty.
extract_info <- function(x, property) {
  gathered <- NULL
  for (idx in seq_along(x)) {
    gathered <- c(gathered, x[[idx]][[property]])
  }
  gathered
}
# Call extract_info() on logs, set property to "timestamp"
extract_info(logs, "timestamp")
## [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
## [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469
# Call extract_info() on logs, set property to "success"
extract_info(logs, "success")
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [17] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [33] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [65] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [81] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Add default value for property argument
# Collect one property from every entry in `x`.
#   x        : list of log entries (each itself a list).
#   property : name passed to `[[`; defaults to "success".
# Returns the values combined with c(), or NULL when `x` is empty.
extract_info <- function(x, property = "success") {
  append_property <- function(acc, entry) c(acc, entry[[property]])
  Reduce(append_property, x, NULL)
}
# Call extract_info() on logs, don't specify property
extract_info(logs)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [17] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [33] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [65] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [81] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Call extract_info() on logs, set property to "timestamp"
extract_info(logs, "timestamp")
## [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
## [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469
# Adapt extract_info():
# - add argument with default value
# - change function body
# Collect one property from the entries in `x`.
#   x           : list of log entries (each itself a list with `success`).
#   property    : name passed to `[[`; defaults to "success".
#   include_all : when TRUE (default) every entry is used; when FALSE only
#                 entries whose `success` is FALSE contribute.
# Returns the values combined with c(), or NULL when nothing is selected.
# The default is spelled TRUE rather than T because T is an ordinary variable
# that can be reassigned.
extract_info <- function(x, property = "success", include_all = TRUE) {
  info <- c()
  for (log in x) {
    # `||` short-circuits: log$success is only inspected when filtering.
    if (include_all || !log$success) {
      info <- c(info, log[[property]])
    }
  }
  info
}
# Call extract_info() on logs, no additional arguments
extract_info(logs)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [17] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [33] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [65] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [81] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Call extract_info() on logs, set include_all to FALSE
extract_info(logs, include_all=F)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Definition of the extract_info() function
# Collect one property from the entries in `x`.
#   x           : list of log entries (each itself a list with `success`).
#   property    : name passed to `[[`; a longer character vector indexes
#                 recursively, e.g. c("details", "message").
#   include_all : TRUE uses every entry; FALSE uses only failed entries.
# Returns the values combined with c(), or NULL when nothing is selected.
extract_info <- function(x, property = "success", include_all = TRUE) {
  keep_entry <- function(entry) include_all || !entry$success
  collected <- NULL
  for (entry in x) {
    if (!keep_entry(entry)) next
    collected <- c(collected, entry[[property]])
  }
  collected
}
# Generate vector of messages
extract_info(logs, property = c('details', 'message'))
## [1] "check" "all good" "check" "check"
## [5] "ok" "all good" "check" "all good"
## [9] "all good" "stack overflow" "ok" "ok"
## [13] "all good" "check" "check" "ok"
## [17] "check" "check" "all good" "segmentation fault"
## [21] "check" "all good" "human error" "check"
## [25] "ok" "all good" "check" "ok"
## [29] "ok" "all good" "ok" "ok"
## [33] "ok" "ok" "check" "all good"
## [37] "ok" "ok" "ok" "ok"
## [41] "ok" "ok" "check" "all good"
## [45] "all good" "check" "check" "check"
## [49] "check" "check" "stack overflow" "ok"
## [53] "ok" "all good" "check" "ok"
## [57] "check" "ok" "check" "ok"
## [61] "ok" "ok" "segmentation fault" "check"
## [65] "human error" "ok" "ok" "all good"
## [69] "ok" "all good" "check" "check"
## [73] "ok" "check" "all good" "ok"
## [77] "human error" "ok" "check" "all good"
## [81] "check" "check" "check" "check"
## [85] "check" "ok" "ok" "all good"
## [89] "ok" "all good" "all good" "all good"
## [93] "all good" "ok" "check" "check"
# Generate vector of locations for failed log entries
extract_info(logs, property = c('details','location'), include_all = F)
## [1] "control room" "waste" "reactor" "tubes" "waste" "waste"
## [7] "waste"
# Write the function compute_fail_pct
# Percentage (0-100) of entries in `logs` whose `success` flag is FALSE.
#   logs : list of log entries, each a list with a length-one `success`.
# Returns a single number; NaN for an empty list (0 failures out of 0 entries).
# vapply() replaces the 1:length(logs) index loop, which iterated over c(1, 0)
# on empty input and failed with "subscript out of bounds".
compute_fail_pct <- function(logs) {
  is_fail <- vapply(logs, function(entry) entry$success == FALSE, logical(1))
  sum(is_fail) / length(is_fail) * 100
}
# Call compute_fail_pct on logs
compute_fail_pct(logs)
## [1] 7.291667
map
function or something like the apply
functions.
# Call length() on each element of logs
lapply(logs,length) %>% head()
## [[1]]
## [1] 4
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 4
##
## [[4]]
## [1] 4
##
## [[5]]
## [1] 4
##
## [[6]]
## [1] 4
# Call class() on each element of logs
lapply(logs,class) %>% head()
## [[1]]
## [1] "list"
##
## [[2]]
## [1] "list"
##
## [[3]]
## [1] "list"
##
## [[4]]
## [1] "list"
##
## [[5]]
## [1] "list"
##
## [[6]]
## [1] "list"
# Define get_timestamp()
# Return the `timestamp` element of a single log entry `x`
# (NULL when the entry has no such element).
get_timestamp <- function(x) {
  x$timestamp
}
# Apply get_timestamp() over all elements in logs
lapply(logs, get_timestamp) %>% head()
## [[1]]
## [1] "2015-09-14 19:01:07 EDT"
##
## [[2]]
## [1] "2015-09-14 20:00:13 EDT"
##
## [[3]]
## [1] "2015-09-14 21:00:43 EDT"
##
## [[4]]
## [1] "2015-09-14 22:01:18 EDT"
##
## [[5]]
## [1] "2015-09-14 22:59:59 EDT"
##
## [[6]]
## [1] "2015-09-15 00:01:08 EDT"
# Have lapply() use an anonymous function
lapply(logs, function(x) {x$timestamp}) %>% head()
## [[1]]
## [1] "2015-09-14 19:01:07 EDT"
##
## [[2]]
## [1] "2015-09-14 20:00:13 EDT"
##
## [[3]]
## [1] "2015-09-14 21:00:43 EDT"
##
## [[4]]
## [1] "2015-09-14 22:01:18 EDT"
##
## [[5]]
## [1] "2015-09-14 22:59:59 EDT"
##
## [[6]]
## [1] "2015-09-15 00:01:08 EDT"
# Replace the anonymous function with `[[`
lapply(logs, `[[`, 'timestamp') %>% head()
## [[1]]
## [1] "2015-09-14 19:01:07 EDT"
##
## [[2]]
## [1] "2015-09-14 20:00:13 EDT"
##
## [[3]]
## [1] "2015-09-14 21:00:43 EDT"
##
## [[4]]
## [1] "2015-09-14 22:01:18 EDT"
##
## [[5]]
## [1] "2015-09-14 22:59:59 EDT"
##
## [[6]]
## [1] "2015-09-15 00:01:08 EDT"
# Call length() on each element of logs using sapply()
sapply(logs, length)
## [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [49] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
# Definition of get_timestamp
# Return the `timestamp` element of a single log entry `x`
# (NULL when the entry has no such element).
get_timestamp <- function(x) {
  stamp <- x$timestamp
  stamp
}
# Get vector of log entries' timestamps
sapply(logs, get_timestamp)
## [1] 1442271667 1442275213 1442278843 1442282479 1442286000 1442289669 1442293400 1442296766
## [9] 1442300369 1442304054 1442307595 1442311239 1442314999 1442318509 1442322115 1442325748
## [17] 1442329581 1442333148 1442336831 1442340478 1442344104 1442347809 1442351656 1442355230
## [25] 1442358897 1442362565 1442366247 1442369825 1442373613 1442377367 1442381036 1442384587
## [33] 1442388127 1442391716 1442395201 1442398873 1442402487 1442406138 1442409894 1442413503
## [41] 1442416980 1442420588 1442424320 1442427945 1442431540 1442435144 1442438737 1442442497
## [49] 1442446129 1442449737 1442453512 1442457227 1442460737 1442464501 1442468239 1442471721
## [57] 1442475370 1442478869 1442482442 1442486000 1442489681 1442493222 1442496823 1442500519
## [65] 1442504062 1442507676 1442511301 1442514920 1442518449 1442522024 1442525547 1442529154
## [73] 1442532866 1442536496 1442540274 1442543775 1442547438 1442550882 1442554480 1442558131
## [81] 1442561734 1442565493 1442569141 1442572776 1442576398 1442580077 1442583672 1442587330
## [89] 1442591011 1442594690 1442598327 1442602124 1442605778 1442609424 1442613054 1442616469
# Use sapply() to select the success element from each log: results
results <- sapply(logs, `[[`, 'success')
# Call mean() on results
mean(results)
## [1] 0.9270833
# Use sapply() to select the details element from each log
sapply(logs, `[[`, 'details') %>% head()
## [[1]]
## [[1]]$message
## [1] "check"
##
##
## [[2]]
## [[2]]$message
## [1] "all good"
##
##
## [[3]]
## [[3]]$message
## [1] "check"
##
##
## [[4]]
## [[4]]$message
## [1] "check"
##
##
## [[5]]
## [[5]]$message
## [1] "ok"
##
##
## [[6]]
## [[6]]$message
## [1] "all good"
# Implement function get_failure_loc
# Location of a failed log entry.
#   x : one log entry, a list with `success` and `details`.
# Returns NULL for a successful entry, otherwise x$details$location.
# The comparison uses TRUE rather than T, because T is an ordinary variable
# that can be reassigned.
get_failure_loc <- function(x) {
  if (x$success == TRUE) {
    return(NULL)
  }
  x$details$location
}
# Use sapply() to call get_failure_loc on logs
sapply(logs, get_failure_loc) %>% head(11)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## [1] "control room"
##
## [[11]]
## NULL
# Convert the sapply call to vapply
vapply(logs, length, integer(1))
## [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [49] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
# Convert the sapply call to vapply
vapply(logs, `[[`, "success", FUN.VALUE = logical(1))
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [17] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [33] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [65] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [81] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Convert the sapply() call to a vapply() or lapply() call
vapply(logs, `[[`, c("details", "message"), FUN.VALUE=character(1))
## [1] "check" "all good" "check" "check"
## [5] "ok" "all good" "check" "all good"
## [9] "all good" "stack overflow" "ok" "ok"
## [13] "all good" "check" "check" "ok"
## [17] "check" "check" "all good" "segmentation fault"
## [21] "check" "all good" "human error" "check"
## [25] "ok" "all good" "check" "ok"
## [29] "ok" "all good" "ok" "ok"
## [33] "ok" "ok" "check" "all good"
## [37] "ok" "ok" "ok" "ok"
## [41] "ok" "ok" "check" "all good"
## [45] "all good" "check" "check" "check"
## [49] "check" "check" "stack overflow" "ok"
## [53] "ok" "all good" "check" "ok"
## [57] "check" "ok" "check" "ok"
## [61] "ok" "ok" "segmentation fault" "check"
## [65] "human error" "ok" "ok" "all good"
## [69] "ok" "all good" "check" "check"
## [73] "ok" "check" "all good" "ok"
## [77] "human error" "ok" "check" "all good"
## [81] "check" "check" "check" "check"
## [85] "check" "ok" "ok" "all good"
## [89] "ok" "all good" "all good" "all good"
## [93] "all good" "ok" "check" "check"
# Convert the sapply() call to a vapply() or lapply() call
lapply(logs, function(x) { x$details }) %>% head()
## [[1]]
## [[1]]$message
## [1] "check"
##
##
## [[2]]
## [[2]]$message
## [1] "all good"
##
##
## [[3]]
## [[3]]$message
## [1] "check"
##
##
## [[4]]
## [[4]]$message
## [1] "check"
##
##
## [[5]]
## [[5]]$message
## [1] "ok"
##
##
## [[6]]
## [[6]]$message
## [1] "all good"
# Return vector with uppercase version of message elements in log entries
toupper(sapply(logs, `[[`, c('details','message')))
## [1] "CHECK" "ALL GOOD" "CHECK" "CHECK"
## [5] "OK" "ALL GOOD" "CHECK" "ALL GOOD"
## [9] "ALL GOOD" "STACK OVERFLOW" "OK" "OK"
## [13] "ALL GOOD" "CHECK" "CHECK" "OK"
## [17] "CHECK" "CHECK" "ALL GOOD" "SEGMENTATION FAULT"
## [21] "CHECK" "ALL GOOD" "HUMAN ERROR" "CHECK"
## [25] "OK" "ALL GOOD" "CHECK" "OK"
## [29] "OK" "ALL GOOD" "OK" "OK"
## [33] "OK" "OK" "CHECK" "ALL GOOD"
## [37] "OK" "OK" "OK" "OK"
## [41] "OK" "OK" "CHECK" "ALL GOOD"
## [45] "ALL GOOD" "CHECK" "CHECK" "CHECK"
## [49] "CHECK" "CHECK" "STACK OVERFLOW" "OK"
## [53] "OK" "ALL GOOD" "CHECK" "OK"
## [57] "CHECK" "OK" "CHECK" "OK"
## [61] "OK" "OK" "SEGMENTATION FAULT" "CHECK"
## [65] "HUMAN ERROR" "OK" "OK" "ALL GOOD"
## [69] "OK" "ALL GOOD" "CHECK" "CHECK"
## [73] "OK" "CHECK" "ALL GOOD" "OK"
## [77] "HUMAN ERROR" "OK" "CHECK" "ALL GOOD"
## [81] "CHECK" "CHECK" "CHECK" "CHECK"
## [85] "CHECK" "OK" "OK" "ALL GOOD"
## [89] "OK" "ALL GOOD" "ALL GOOD" "ALL GOOD"
## [93] "ALL GOOD" "OK" "CHECK" "CHECK"
# Call dim on titanic
dim(titanic)
## [1] 891 12
# Generate histogram of Age column
hist(titanic$Age)
# Print out total value of fares
str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 354 273 16 555 516 625 413 577 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
sum(titanic$Fare)
## [1] 28693.95
# Print out proportion of passengers that survived
mean(titanic$Survived)
## [1] 0.3838384
# Extract the name column from titanic
pass_names <- titanic$Name
# Create the logical vector is_man
is_man <- grepl(", Mr\\.", pass_names)
# Count the number of men
sum(is_man)
## [1] 517
# Count number of men based on gender
sum(titanic$Sex == "male")
## [1] 577
# Extract the name column from titanic
pass_names <- titanic$Name
# Create titles
titles <- gsub("^.*, (.*?)\\..*$", "\\1", pass_names)
# Call unique() on titles
unique(titles)
## [1] "Mr" "Mrs" "Miss" "Master" "Don" "Rev"
## [7] "Dr" "Mme" "Ms" "Major" "Lady" "Sir"
## [13] "Mlle" "Col" "Capt" "the Countess" "Jonkheer"
pass_names <- titanic$Name
titles_male <- paste(",", c("Mr\\.", "Master", "Don", "Rev", "Dr\\.", "Major", "Sir", "Col", "Capt", "Jonkheer"))
# Finish the vapply() command
hits <- vapply(titles_male,
FUN = grepl,
FUN.VALUE = logical(length(pass_names)),
pass_names)
# Calculate the sum() of hits
sum(hits)
## [1] 578
# Count number of men based on gender
sum(titanic$Sex == "male")
## [1] 577
table
function
table(titles, titanic$Sex)
##
## titles female male
## Capt 0 1
## Col 0 2
## Don 0 1
## Dr 1 6
## Jonkheer 0 1
## Lady 1 0
## Major 0 2
## Master 0 40
## Miss 182 0
## Mlle 2 0
## Mme 1 0
## Mr 0 517
## Mrs 125 0
## Ms 1 0
## Rev 0 6
## Sir 0 1
## the Countess 1 0
# pass_names is available in your workspace
# Rewrite a passenger name of the form "Last, Title. First ..." as a plain
# full name.
#   name : a single name string.
# If the name contains a parenthesised part, that part is returned on its own;
# otherwise the result is "<text after the title> <text before the comma>".
# Trailing whitespace after the closing parenthesis is tolerated: anchoring on
# "\\)$" alone left names such as "Hewlett, Mrs. (Mary D Kingcome) "
# unconverted.
convert_name <- function(name) {
  if (grepl("\\(.*?\\)", name)) {
    # Parenthesised name present: keep only the text inside the parentheses.
    gsub("^.*?\\((.*?)\\)\\s*$", "\\1", name)
  } else {
    # No parentheses: drop the title and swap "Last, First" into "First Last".
    gsub("^(.*?),\\s[a-zA-Z\\.]*?\\s(.*?)$", "\\2 \\1", name)
  }
}
# Call convert_name on name
clean_pass_names <- vapply(pass_names, FUN = convert_name,
FUN.VALUE = character(1), USE.NAMES = FALSE)
# Print out clean_pass_names
head(clean_pass_names, 20)
## [1] "Owen Harris Braund" "Florence Briggs Thayer"
## [3] "Laina Heikkinen" "Lily May Peel"
## [5] "William Henry Allen" "James Moran"
## [7] "Timothy J McCarthy" "Gosta Leonard Palsson"
## [9] "Elisabeth Vilhelmina Berg" "Adele Achem"
## [11] "Marguerite Rut Sandstrom" "Elizabeth Bonnell"
## [13] "William Henry Saundercock" "Anders Johan Andersson"
## [15] "Hulda Amanda Adolfina Vestrom" "Hewlett, Mrs. (Mary D Kingcome) "
## [17] "Eugene Rice" "Charles Eugene Williams"
## [19] "Emelia Maria Vandemoortele" "Fatima Masselmani"
I’m adding the dob data manually.
The strptime function documentation lists all the format codes for matching dates.
# Inspect the first few raw date-of-birth strings in each vector
head(dob1)
## [1] "1890-01-03" "1874-02-28" "1886-01-17" "1877-03-04" "1876-09-28" NA
head(dob2)
## [1] "November 26, 1872" "September 09, 1885" "September 09, 1890" "April 01, 1884"
## [5] "August 08, 1891" "April 20, 1877"
# Parse both vectors into Date objects: dob1 is year-month-day, dob2 is
# "<month name> <day>, <year>" (on input, %b also matches full month names)
dob1d <- as.Date(dob1, format = "%Y-%m-%d")
dob2d <- as.Date(dob2, format = "%b %d, %Y")
# Concatenate the two Date vectors into one: birth_dates
birth_dates <- c(dob1d, dob2d)
I would use dplyr::filter for subsetting now.
# titanic, dob1 and dob2 are preloaded
# Reference date of the disaster, used later to compute ages
disaster_date <- as.Date("1912-04-15")
# Parse the birth dates: dob1 via as.Date()'s default formats, dob2 as
# "<full month name> <day>, <year>"
dob1d <- as.Date(dob1)
dob2d <- as.Date(dob2, format = "%B %d, %Y")
# Combine both Date vectors and store them as column Birth of titanic
birth_dates <- c(dob1d, dob2d)
titanic$Birth <- birth_dates
# Inspect the structure of the updated data frame
str(titanic)
## 'data.frame': 891 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 354 273 16 555 516 625 413 577 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## $ Birth : Date, format: "1890-01-03" "1874-02-28" "1886-01-17" ...
# Keep only the rows whose Survived value equals TRUE (i.e. 1); rows where
# the comparison is NA are left out
survivors <- titanic[which(titanic$Survived == TRUE), , drop = FALSE]
# Inspect the structure of the survivors data frame
str(survivors)
## 'data.frame': 342 obs. of 13 variables:
## $ PassengerId: int 2 3 4 9 10 11 12 16 18 20 ...
## $ Survived : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Pclass : int 1 3 1 3 2 3 1 2 2 3 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 191 354 273 413 577 728 96 360 868 513 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 2 1 ...
## $ Age : num 38 26 35 27 14 4 58 55 NA NA ...
## $ SibSp : int 1 0 1 0 1 1 0 0 0 0 ...
## $ Parch : int 0 0 0 2 0 1 0 0 0 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 597 670 50 345 133 617 39 154 152 185 ...
## $ Fare : num 71.28 7.92 53.1 11.13 30.07 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 83 1 57 1 1 147 51 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 2 4 4 4 2 4 4 4 4 2 ...
## $ Birth : Date, format: "1874-02-28" "1886-01-17" "1877-03-04" ...
# Calculate average age of survivors on the disaster date.
# The result is a difftime in days; missing birth dates are ignored.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mean(disaster_date - survivors$Birth, na.rm = TRUE)
## Time difference of 10532.41 days
## How old was the youngest survivor when Apollo 11 landed on the moon on July 20, 1969?
# The smallest elapsed time belongs to the most recently born survivor
min(as.Date("1969-07-20") - survivors$Birth, na.rm = TRUE)
## Time difference of 21285 days