# pull Airline Delays JPG file from github - only usable for HTML knit -- will not work for PDF knit
airline_delays_pic_URL <- "https://raw.githubusercontent.com/myampol/MY607/master/Airline_Delays.JPG"
### Determine whether we are knitting to PDF ("latex") or HTML
whichknit <- knitr::opts_knit$get("rmarkdown.pandoc.to") # Works only if knit() is called via render()
if (!is.null(whichknit)) { # can't test the below items if whichknit returned NULL
if (whichknit=="latex") {
knitr::include_graphics('Airline_Delays.JPG') ### PDF knit requires that the file be local
} else if (whichknit=="html") {
knitr::include_graphics(airline_delays_pic_URL) ### HTML knit will accept URL to insert picture
}
}tidyr and dplyr as needed to tidy and transform your data.##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
#setwd("C:/Users/Michael/Dropbox/priv/CUNY/MSDS/201909-Fall/DATA607_Tati_Andy/20190929_Week05")
### Don't pull file from local drive
#inputfile <- "InputFlightData.csv"
### Pull the input data file from github, rather than local drive
inputfile <- "https://raw.githubusercontent.com/myampol/MY607/master/InputFlightData.csv"
rawflights <- read.csv(inputfile,stringsAsFactors = F)
rawflights %>% kable() %>% kable_styling(c("striped", "bordered"))| X | X.1 | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
| delayed | 62 | 12 | 20 | 102 | 305 | |
| NA | NA | NA | NA | NA | ||
| AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
| delayed | 117 | 415 | 65 | 129 | 61 |
rf1 <- rename(.data = rawflights, Airline=X, Status=`X.1`,
LosAngeles=`Los.Angeles`, SanDiego=`San.Diego`, SanFrancisco=`San.Francisco`)
rf1 %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | Status | LosAngeles | Phoenix | SanDiego | SanFrancisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
| delayed | 62 | 12 | 20 | 102 | 305 | |
| NA | NA | NA | NA | NA | ||
| AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
| delayed | 117 | 415 | 65 | 129 | 61 |
for each (Airline,City) pair.
rf2 <- gather(data = rf1 , key = City, value = NumFlights, ... = LosAngeles:Seattle, na.rm = T)
rf2 %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | Status | City | NumFlights | |
|---|---|---|---|---|
| 1 | ALASKA | on time | LosAngeles | 497 |
| 2 | delayed | LosAngeles | 62 | |
| 4 | AM WEST | on time | LosAngeles | 694 |
| 5 | delayed | LosAngeles | 117 | |
| 6 | ALASKA | on time | Phoenix | 221 |
| 7 | delayed | Phoenix | 12 | |
| 9 | AM WEST | on time | Phoenix | 4840 |
| 10 | delayed | Phoenix | 415 | |
| 11 | ALASKA | on time | SanDiego | 212 |
| 12 | delayed | SanDiego | 20 | |
| 14 | AM WEST | on time | SanDiego | 383 |
| 15 | delayed | SanDiego | 65 | |
| 16 | ALASKA | on time | SanFrancisco | 503 |
| 17 | delayed | SanFrancisco | 102 | |
| 19 | AM WEST | on time | SanFrancisco | 320 |
| 20 | delayed | SanFrancisco | 129 | |
| 21 | ALASKA | on time | Seattle | 1841 |
| 22 | delayed | Seattle | 305 | |
| 24 | AM WEST | on time | Seattle | 201 |
| 25 | delayed | Seattle | 61 |
rf3 <- mutate(.data = rf2, Airline= ifelse(Airline=="", lag(Airline), Airline))
rf3 %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | Status | City | NumFlights |
|---|---|---|---|
| ALASKA | on time | LosAngeles | 497 |
| ALASKA | delayed | LosAngeles | 62 |
| AM WEST | on time | LosAngeles | 694 |
| AM WEST | delayed | LosAngeles | 117 |
| ALASKA | on time | Phoenix | 221 |
| ALASKA | delayed | Phoenix | 12 |
| AM WEST | on time | Phoenix | 4840 |
| AM WEST | delayed | Phoenix | 415 |
| ALASKA | on time | SanDiego | 212 |
| ALASKA | delayed | SanDiego | 20 |
| AM WEST | on time | SanDiego | 383 |
| AM WEST | delayed | SanDiego | 65 |
| ALASKA | on time | SanFrancisco | 503 |
| ALASKA | delayed | SanFrancisco | 102 |
| AM WEST | on time | SanFrancisco | 320 |
| AM WEST | delayed | SanFrancisco | 129 |
| ALASKA | on time | Seattle | 1841 |
| ALASKA | delayed | Seattle | 305 |
| AM WEST | on time | Seattle | 201 |
| AM WEST | delayed | Seattle | 61 |
rf4 <- spread(data = rf3, key = Status, value = NumFlights)
rf4 %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | City | delayed | on time |
|---|---|---|---|
| ALASKA | LosAngeles | 62 | 497 |
| ALASKA | Phoenix | 12 | 221 |
| ALASKA | SanDiego | 20 | 212 |
| ALASKA | SanFrancisco | 102 | 503 |
| ALASKA | Seattle | 305 | 1841 |
| AM WEST | LosAngeles | 117 | 694 |
| AM WEST | Phoenix | 415 | 4840 |
| AM WEST | SanDiego | 65 | 383 |
| AM WEST | SanFrancisco | 129 | 320 |
| AM WEST | Seattle | 61 | 201 |
rf5 <- rename(.data = rf4, NumFlightsDelayed=delayed, NumFlightsOnTime=`on time`)
rf5 %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | City | NumFlightsDelayed | NumFlightsOnTime |
|---|---|---|---|
| ALASKA | LosAngeles | 62 | 497 |
| ALASKA | Phoenix | 12 | 221 |
| ALASKA | SanDiego | 20 | 212 |
| ALASKA | SanFrancisco | 102 | 503 |
| ALASKA | Seattle | 305 | 1841 |
| AM WEST | LosAngeles | 117 | 694 |
| AM WEST | Phoenix | 415 | 4840 |
| AM WEST | SanDiego | 65 | 383 |
| AM WEST | SanFrancisco | 129 | 320 |
| AM WEST | Seattle | 61 | 201 |
rf6 <- mutate(.data = rf5, NumFlightsTotal = NumFlightsDelayed + NumFlightsOnTime)
rf6 %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | City | NumFlightsDelayed | NumFlightsOnTime | NumFlightsTotal |
|---|---|---|---|---|
| ALASKA | LosAngeles | 62 | 497 | 559 |
| ALASKA | Phoenix | 12 | 221 | 233 |
| ALASKA | SanDiego | 20 | 212 | 232 |
| ALASKA | SanFrancisco | 102 | 503 | 605 |
| ALASKA | Seattle | 305 | 1841 | 2146 |
| AM WEST | LosAngeles | 117 | 694 | 811 |
| AM WEST | Phoenix | 415 | 4840 | 5255 |
| AM WEST | SanDiego | 65 | 383 | 448 |
| AM WEST | SanFrancisco | 129 | 320 | 449 |
| AM WEST | Seattle | 61 | 201 | 262 |
rf7 <- mutate(.data = rf6, PctFlightsDelayed=NumFlightsDelayed/NumFlightsTotal,
PctFlightsOnTime=NumFlightsOnTime/NumFlightsTotal)
rf7 %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | City | NumFlightsDelayed | NumFlightsOnTime | NumFlightsTotal | PctFlightsDelayed | PctFlightsOnTime |
|---|---|---|---|---|---|---|
| ALASKA | LosAngeles | 62 | 497 | 559 | 0.110912 | 0.889088 |
| ALASKA | Phoenix | 12 | 221 | 233 | 0.051502 | 0.948498 |
| ALASKA | SanDiego | 20 | 212 | 232 | 0.086207 | 0.913793 |
| ALASKA | SanFrancisco | 102 | 503 | 605 | 0.168595 | 0.831405 |
| ALASKA | Seattle | 305 | 1841 | 2146 | 0.142125 | 0.857875 |
| AM WEST | LosAngeles | 117 | 694 | 811 | 0.144266 | 0.855734 |
| AM WEST | Phoenix | 415 | 4840 | 5255 | 0.078972 | 0.921028 |
| AM WEST | SanDiego | 65 | 383 | 448 | 0.145089 | 0.854911 |
| AM WEST | SanFrancisco | 129 | 320 | 449 | 0.287305 | 0.712695 |
| AM WEST | Seattle | 61 | 201 | 262 | 0.232824 | 0.767176 |
Using the pipe connector “%>%”, all the above steps can be specified in a single chain:
tidy_flights <- rawflights %>%
rename(.data = ., Airline=X, Status=`X.1`, LosAngeles=`Los.Angeles`,
SanDiego=`San.Diego`,
SanFrancisco=`San.Francisco`) %>%
gather( data = ., key = City, value = NumFlights, ... = LosAngeles:Seattle, na.rm = T) %>%
mutate(.data = ., Airline= ifelse(Airline=="", lag(Airline), Airline)) %>%
spread( data = ., key = Status, value = NumFlights) %>%
rename(.data = ., NumFlightsDelayed=delayed, NumFlightsOnTime=`on time`) %>%
mutate(.data = ., NumFlightsTotal = NumFlightsDelayed + NumFlightsOnTime) %>%
mutate(.data = ., PctFlightsDelayed=NumFlightsDelayed/NumFlightsTotal,
PctFlightsOnTime=NumFlightsOnTime/NumFlightsTotal)
tidy_flights %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | City | NumFlightsDelayed | NumFlightsOnTime | NumFlightsTotal | PctFlightsDelayed | PctFlightsOnTime |
|---|---|---|---|---|---|---|
| ALASKA | LosAngeles | 62 | 497 | 559 | 0.110912 | 0.889088 |
| ALASKA | Phoenix | 12 | 221 | 233 | 0.051502 | 0.948498 |
| ALASKA | SanDiego | 20 | 212 | 232 | 0.086207 | 0.913793 |
| ALASKA | SanFrancisco | 102 | 503 | 605 | 0.168595 | 0.831405 |
| ALASKA | Seattle | 305 | 1841 | 2146 | 0.142125 | 0.857875 |
| AM WEST | LosAngeles | 117 | 694 | 811 | 0.144266 | 0.855734 |
| AM WEST | Phoenix | 415 | 4840 | 5255 | 0.078972 | 0.921028 |
| AM WEST | SanDiego | 65 | 383 | 448 | 0.145089 | 0.854911 |
| AM WEST | SanFrancisco | 129 | 320 | 449 | 0.287305 | 0.712695 |
| AM WEST | Seattle | 61 | 201 | 262 | 0.232824 | 0.767176 |
| Airline | City | NumFlightsDelayed | NumFlightsOnTime | NumFlightsTotal | PctFlightsDelayed | PctFlightsOnTime |
|---|---|---|---|---|---|---|
| ALASKA | LosAngeles | 62 | 497 | 559 | 0.110912 | 0.889088 |
| AM WEST | LosAngeles | 117 | 694 | 811 | 0.144266 | 0.855734 |
| ALASKA | Phoenix | 12 | 221 | 233 | 0.051502 | 0.948498 |
| AM WEST | Phoenix | 415 | 4840 | 5255 | 0.078972 | 0.921028 |
| ALASKA | SanDiego | 20 | 212 | 232 | 0.086207 | 0.913793 |
| AM WEST | SanDiego | 65 | 383 | 448 | 0.145089 | 0.854911 |
| ALASKA | SanFrancisco | 102 | 503 | 605 | 0.168595 | 0.831405 |
| AM WEST | SanFrancisco | 129 | 320 | 449 | 0.287305 | 0.712695 |
| ALASKA | Seattle | 305 | 1841 | 2146 | 0.142125 | 0.857875 |
| AM WEST | Seattle | 61 | 201 | 262 | 0.232824 | 0.767176 |
ALASKA_Phoenix_delays <- filter(.data=tidy_flights, Airline=="ALASKA" & City=="Phoenix") %>%
select(PctFlightsDelayed)
AMWEST_Phoenix_delays <- filter(.data=tidy_flights, Airline=="AM WEST" & City=="Phoenix") %>%
select(PctFlightsDelayed)
ALASKA_SanFrancisco_delays <- filter(.data=tidy_flights, Airline=="ALASKA" & City=="SanFrancisco") %>%
select(PctFlightsDelayed)
AMWEST_SanFrancisco_delays <- filter(.data=tidy_flights, Airline=="AM WEST" & City=="SanFrancisco") %>%
select(PctFlightsDelayed)Pct_Delays_by_City <- tidy_flights %>%
select(.data = ., -NumFlightsDelayed, -NumFlightsOnTime,
-NumFlightsTotal, -PctFlightsOnTime) %>%
spread(data = . , key = Airline, value = PctFlightsDelayed)
Pct_Delays_by_City %>% kable() %>% kable_styling(c("striped", "bordered"))| City | ALASKA | AM WEST |
|---|---|---|
| LosAngeles | 0.110912 | 0.144266 |
| Phoenix | 0.051502 | 0.078972 |
| SanDiego | 0.086207 | 0.145089 |
| SanFrancisco | 0.168595 | 0.287305 |
| Seattle | 0.142125 | 0.232824 |
Pct_Delays_by_City %>% gather(data = ., key = Airline,
value = Pct_Delays_by_City,...=ALASKA:`AM WEST`) %>%
ggplot(data = ., aes(factor(City), Pct_Delays_by_City, fill = Airline)) +
geom_bar(stat="identity", position = "dodge") +
theme_minimal()+
labs( x="City", y="Percentage of flights which are Delayed") +
ggtitle("Percentage of flights which are Delayed, by City and Airline")ResultsByAirline <- group_by(tidy_flights,Airline) %>% summarize(
TotalDelays=sum(NumFlightsDelayed),
TotalOnTime=sum(NumFlightsOnTime),
TotalFlights=sum(NumFlightsTotal),
PctDelayed=TotalDelays/TotalFlights,
PctOnTime=TotalOnTime/TotalFlights)
ResultsByAirline %>% kable() %>% kable_styling(c("striped", "bordered"))| Airline | TotalDelays | TotalOnTime | TotalFlights | PctDelayed | PctOnTime |
|---|---|---|---|---|---|
| ALASKA | 501 | 3274 | 3775 | 0.132715 | 0.867285 |
| AM WEST | 787 | 6438 | 7225 | 0.108927 | 0.891073 |
ALASKAdelays <- filter(.data = ResultsByAirline, Airline=="ALASKA") %>% select(PctDelayed)
AMWESTdelays <- filter(.data = ResultsByAirline, Airline=="AM WEST") %>% select(PctDelayed)
ALASKAtotals <- filter(.data = ResultsByAirline, Airline=="ALASKA") %>% select(TotalFlights)
AMWESTtotals <- filter(.data = ResultsByAirline, Airline=="AM WEST") %>% select(TotalFlights)ResultsByAirline %>% select(.data = ., Airline, PctDelayed) %>%
ggplot(data = ., aes(factor(Airline), PctDelayed, fill = Airline)) +
geom_bar(stat="identity", position = "dodge") +
theme_minimal()+
labs( x="Airline", y="Percentage of flights which are Delayed") +
ggtitle("Percentage of flights which are Delayed, by Airline")ResultsByCity <- group_by(tidy_flights,City) %>% summarize(
TotalDelays=sum(NumFlightsDelayed),
TotalOnTime=sum(NumFlightsOnTime),
TotalFlights=sum(NumFlightsTotal),
PctDelayed=TotalDelays/TotalFlights,
PctOnTime=TotalOnTime/TotalFlights)
ResultsByCity %>% kable() %>% kable_styling(c("striped", "bordered"))| City | TotalDelays | TotalOnTime | TotalFlights | PctDelayed | PctOnTime |
|---|---|---|---|---|---|
| LosAngeles | 179 | 1191 | 1370 | 0.130657 | 0.869343 |
| Phoenix | 427 | 5061 | 5488 | 0.077806 | 0.922194 |
| SanDiego | 85 | 595 | 680 | 0.125000 | 0.875000 |
| SanFrancisco | 231 | 823 | 1054 | 0.219165 | 0.780835 |
| Seattle | 366 | 2042 | 2408 | 0.151993 | 0.848007 |
| City | TotalDelays | TotalOnTime | TotalFlights | PctDelayed | PctOnTime |
|---|---|---|---|---|---|
| Phoenix | 427 | 5061 | 5488 | 0.077806 | 0.922194 |
| SanDiego | 85 | 595 | 680 | 0.125000 | 0.875000 |
| LosAngeles | 179 | 1191 | 1370 | 0.130657 | 0.869343 |
| Seattle | 366 | 2042 | 2408 | 0.151993 | 0.848007 |
| SanFrancisco | 231 | 823 | 1054 | 0.219165 | 0.780835 |
select(.data = ResultsByCity, City, TotalDelays) %>%
ggplot(data = ., aes(factor(City), TotalDelays, fill = City)) +
geom_bar(stat="identity", position = "dodge") +
theme_minimal()+
labs( x="", y="Total Number of Delays") +
ggtitle("Total Number of Delays, by city") select(.data = ResultsByCity, City, TotalFlights) %>%
ggplot(data = ., aes(factor(City), TotalFlights, fill = City)) +
geom_bar(stat="identity", position = "dodge") +
theme_minimal()+
labs( x="", y="Total Number of Flights") +
ggtitle("Total Number of Flights, by city") select(.data = ResultsByCity, City, PctDelayed) %>%
ggplot(data = ., aes(factor(City), PctDelayed, fill = City)) +
geom_bar(stat="identity", position = "dodge") +
theme_minimal()+
labs( x="", y="Percentage of Delayed Flights") +
ggtitle("Percentage of Delayed Flights, by city")