library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('tidyr')
library("stringr")
library("knitr")
library("psych")
  1. Load Data Frame from website
l <- read.csv(
              "/Users/scottkarr/IS607Spring2016/hw6/more/untidy-data.csv",
              sep=",",
              na.strings = "",
              blank.lines.skip = TRUE,
              col.names = c("Airline", "Time",  "City1", "City2", "City3", "City4","City5"),
              stringsAsFactors=FALSE
    )
df = data.frame(l)
kable(head(df), align = 'l')
Airline Time City1 City2 City3 City4 City5
ALASKA on time 497 221 212 503 1841
NA delayed 62 12 20 102 305
NA NA NA NA NA NA NA
AM WEST on time 694 4840 383 320 201
NA delayed 117 415 65 129 61
  1. Remove Null Row
df_staging <- df[is.na(df$Airline) %in% FALSE | is.na(df$Time) %in% FALSE, ]
kable(head(df_staging))
Airline Time City1 City2 City3 City4 City5
1 ALASKA on time 497 221 212 503 1841
2 NA delayed 62 12 20 102 305
4 AM WEST on time 694 4840 383 320 201
5 NA delayed 117 415 65 129 61
  1. Fill Missing Airline data
# adds an Airline lag column
df_staging <- mutate(df_staging, lag = lag(Airline))
# indexes Airline missing data and assigns from lag column
df_staging$Airline[is.na(df_staging$Airline) == TRUE] <- df_staging$lag[is.na(df_staging$Airline) == TRUE]
kable(head(df_staging), align = 'l')
Airline Time City1 City2 City3 City4 City5 lag
ALASKA on time 497 221 212 503 1841 NA
ALASKA delayed 62 12 20 102 305 ALASKA
AM WEST on time 694 4840 383 320 201 NA
AM WEST delayed 117 415 65 129 61 AM WEST
  1. Spread and combine city data *Note, gather may be more effective here but not on the first go-round
# Keep it simple, process 1 city at a time
df_city1 <- df_staging[,1:3]
df_city2 <- df_staging[,c(1:2,4)]
df_city3 <- df_staging[,c(1:2,5)]
df_city4 <- df_staging[,c(1:2,6)]
df_city5 <- df_staging[,c(1:2,7)]

# De-normalize the repeating groups
df_city1 <- spread(df_city1, Time, City1)
df_city1["City"] <- "City1"
df_city2 <- spread(df_city2, Time, City2)
df_city2["City"] <- "City2"
df_city3 <- spread(df_city3, Time, City3)
df_city3["City"] <- "City3"
df_city4 <- spread(df_city4, Time, City4)
df_city4["City"] <- "City4"
df_city5 <- spread(df_city5, Time, City5)
df_city5["City"] <- "City5"

# Combine longform for output
df_Output <- rbind(df_city1,df_city2,df_city3,df_city4,df_city5)
colnames(df_Output)[3] <- "OnTime"
df_Output <- df_Output[c(1,4,2,3)]

# Subset numeric columns to compute statistics
kable(df_Output, align = 'l')
Airline City delayed OnTime
ALASKA City1 62 497
AM WEST City1 117 694
ALASKA City2 12 221
AM WEST City2 415 4840
ALASKA City3 20 212
AM WEST City3 65 383
ALASKA City4 102 503
AM WEST City4 129 320
ALASKA City5 305 1841
AM WEST City5 61 201
kable(df_staging, align = 'l')
Airline Time City1 City2 City3 City4 City5 lag
ALASKA on time 497 221 212 503 1841 NA
ALASKA delayed 62 12 20 102 305 ALASKA
AM WEST on time 694 4840 383 320 201 NA
AM WEST delayed 117 415 65 129 61 AM WEST
  1. OK, so now that we have our arrival/departure data in long form, let’s compute summary statistics for on-time arrivals to see how the cities compare . . .
df_Output <- mutate(df_Output, OnTimePct = (OnTime/(delayed + OnTime)) * 100 )
nums <- sapply(df_Output, is.numeric)
df_stats <- df_Output[ , nums]
df_summary <- describe(df_stats)
kable(df_summary,align='l')
vars n mean sd median trimmed mad min max range skew kurtosis se
delayed 1 10 128.80000 130.138217 83.50000 107.6250 58.562700 12.00000 415.00000 403.0000 1.1407134 -0.1907822 41.153318
OnTime 2 10 971.20000 1443.280045 440.00000 583.8750 331.361100 201.00000 4840.00000 4639.0000 1.8805608 2.2085754 456.405224
OnTimePct 3 10 85.52201 7.168528 85.68044 86.1376 6.617732 71.26949 94.84979 23.5803 -0.6120945 -0.8013766 2.266888
boxplot(split(df_Output$OnTimePct,df_Output$City),main='On Time Arrivals By City')