library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('tidyr')
library("stringr")
library("knitr")
library("psych")
- Load Data Frame from website
l <- read.csv(
"/Users/scottkarr/IS607Spring2016/hw6/more/untidy-data.csv",
sep=",",
na.strings = "",
blank.lines.skip = TRUE,
col.names = c("Airline", "Time", "City1", "City2", "City3", "City4","City5"),
stringsAsFactors=FALSE
)
df = data.frame(l)
kable(head(df), align = 'l')
ALASKA |
on time |
497 |
221 |
212 |
503 |
1841 |
NA |
delayed |
62 |
12 |
20 |
102 |
305 |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
NA |
delayed |
117 |
415 |
65 |
129 |
61 |
- Remove Null Row
df_staging <- df[is.na(df$Airline) %in% FALSE | is.na(df$Time) %in% FALSE, ]
kable(head(df_staging))
1 |
ALASKA |
on time |
497 |
221 |
212 |
503 |
1841 |
2 |
NA |
delayed |
62 |
12 |
20 |
102 |
305 |
4 |
AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
5 |
NA |
delayed |
117 |
415 |
65 |
129 |
61 |
- Fill Missing Airline data
# adds an Airline lag column
df_staging <- mutate(df_staging, lag = lag(Airline))
# indexes Airline missing data and assigns from lag column
df_staging$Airline[is.na(df_staging$Airline) == TRUE] <- df_staging$lag[is.na(df_staging$Airline) == TRUE]
kable(head(df_staging), align = 'l')
ALASKA |
on time |
497 |
221 |
212 |
503 |
1841 |
NA |
ALASKA |
delayed |
62 |
12 |
20 |
102 |
305 |
ALASKA |
AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
NA |
AM WEST |
delayed |
117 |
415 |
65 |
129 |
61 |
AM WEST |
- Spread and combine city data *Note, gather may be more effective here but not on the first go-round
# Keep it simple, process 1 city at a time
df_city1 <- df_staging[,1:3]
df_city2 <- df_staging[,c(1:2,4)]
df_city3 <- df_staging[,c(1:2,5)]
df_city4 <- df_staging[,c(1:2,6)]
df_city5 <- df_staging[,c(1:2,7)]
# De-normalize the repeating groups
df_city1 <- spread(df_city1, Time, City1)
df_city1["City"] <- "City1"
df_city2 <- spread(df_city2, Time, City2)
df_city2["City"] <- "City2"
df_city3 <- spread(df_city3, Time, City3)
df_city3["City"] <- "City3"
df_city4 <- spread(df_city4, Time, City4)
df_city4["City"] <- "City4"
df_city5 <- spread(df_city5, Time, City5)
df_city5["City"] <- "City5"
# Combine longform for output
df_Output <- rbind(df_city1,df_city2,df_city3,df_city4,df_city5)
colnames(df_Output)[3] <- "OnTime"
df_Output <- df_Output[c(1,4,2,3)]
# Subset numeric columns to compute statistics
kable(df_Output, align = 'l')
ALASKA |
City1 |
62 |
497 |
AM WEST |
City1 |
117 |
694 |
ALASKA |
City2 |
12 |
221 |
AM WEST |
City2 |
415 |
4840 |
ALASKA |
City3 |
20 |
212 |
AM WEST |
City3 |
65 |
383 |
ALASKA |
City4 |
102 |
503 |
AM WEST |
City4 |
129 |
320 |
ALASKA |
City5 |
305 |
1841 |
AM WEST |
City5 |
61 |
201 |
kable(df_staging, align = 'l')
ALASKA |
on time |
497 |
221 |
212 |
503 |
1841 |
NA |
ALASKA |
delayed |
62 |
12 |
20 |
102 |
305 |
ALASKA |
AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
NA |
AM WEST |
delayed |
117 |
415 |
65 |
129 |
61 |
AM WEST |
- OK, so now that we have our arrival/departure data in long form, let’s compute summary statistics for on-time arrivals to see how the cities compare . . .
- add on time percentage column so we’re comparing apples with apples
- show summary statistics for on time percentage as well as nominal delay and on-time counts
- use boxplot to show on time percentages by city comparitively
df_Output <- mutate(df_Output, OnTimePct = (OnTime/(delayed + OnTime)) * 100 )
nums <- sapply(df_Output, is.numeric)
df_stats <- df_Output[ , nums]
df_summary <- describe(df_stats)
kable(df_summary,align='l')
delayed |
1 |
10 |
128.80000 |
130.138217 |
83.50000 |
107.6250 |
58.562700 |
12.00000 |
415.00000 |
403.0000 |
1.1407134 |
-0.1907822 |
41.153318 |
OnTime |
2 |
10 |
971.20000 |
1443.280045 |
440.00000 |
583.8750 |
331.361100 |
201.00000 |
4840.00000 |
4639.0000 |
1.8805608 |
2.2085754 |
456.405224 |
OnTimePct |
3 |
10 |
85.52201 |
7.168528 |
85.68044 |
86.1376 |
6.617732 |
71.26949 |
94.84979 |
23.5803 |
-0.6120945 |
-0.8013766 |
2.266888 |
boxplot(split(df_Output$OnTimePct,df_Output$City),main='On Time Arrivals By City')
