Import the flights csv file which describes arrivals and delays for two airlines across five destinations. The CSV file was created and exported in MySQL Workbench 6.3 CE.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(dplyr)
library(knitr)
flights <- read_csv("https://raw.githubusercontent.com/LilesB/DATA-607-5WK-Assignment/master/flights.csv")
## Parsed with column specification:
## cols(
## AID = col_integer(),
## Airline = col_character(),
## FlightStatus = col_character(),
## LosAngeles = col_integer(),
## Phoenix = col_integer(),
## SanDiego = col_integer(),
## SanFrancisco = col_integer(),
## Seattle = col_integer()
## )
glimpse(flights)
## Observations: 4
## Variables: 8
## $ AID <int> 1, 2, 3, 4
## $ Airline <chr> "ALASKA", "ALASKA", "AM WEST", "AM WEST"
## $ FlightStatus <chr> "on time", "delayed", "on time", "delayed"
## $ LosAngeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ SanDiego <int> 212, 20, 383, 65
## $ SanFrancisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
The flights tibble has 4 observations and 8 variables. In order to create a clean environment, we will remove the AID primary key variable and then use the kable function to display the contents of the tibble.
flights <- subset(flights,select=-AID)
kable(flights)
| Airline | FlightStatus | LosAngeles | Phoenix | SanDiego | SanFrancisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
| ALASKA | delayed | 62 | 12 | 20 | 102 | 305 |
| AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
| AM WEST | delayed | 117 | 415 | 65 | 129 | 61 |
Use the gather() function to condense the original 4 observations & 7 variable tibble to a tibble with 20 observations & 4 variables and create a tibble entitled flights_gather
flights_gather <- flights %>% gather(Destination,NumFlights,-Airline,-FlightStatus)
kable(flights_gather)
| Airline | FlightStatus | Destination | NumFlights |
|---|---|---|---|
| ALASKA | on time | LosAngeles | 497 |
| ALASKA | delayed | LosAngeles | 62 |
| AM WEST | on time | LosAngeles | 694 |
| AM WEST | delayed | LosAngeles | 117 |
| ALASKA | on time | Phoenix | 221 |
| ALASKA | delayed | Phoenix | 12 |
| AM WEST | on time | Phoenix | 4840 |
| AM WEST | delayed | Phoenix | 415 |
| ALASKA | on time | SanDiego | 212 |
| ALASKA | delayed | SanDiego | 20 |
| AM WEST | on time | SanDiego | 383 |
| AM WEST | delayed | SanDiego | 65 |
| ALASKA | on time | SanFrancisco | 503 |
| ALASKA | delayed | SanFrancisco | 102 |
| AM WEST | on time | SanFrancisco | 320 |
| AM WEST | delayed | SanFrancisco | 129 |
| ALASKA | on time | Seattle | 1841 |
| ALASKA | delayed | Seattle | 305 |
| AM WEST | on time | Seattle | 201 |
| AM WEST | delayed | Seattle | 61 |
Next, we use the spread() function to condense flights_long to a tibble with 10 observations & 4 variables named flights_spread
flights_spread <- flights_gather %>% spread(FlightStatus,NumFlights)
kable(flights_spread)
| Airline | Destination | delayed | on time |
|---|---|---|---|
| ALASKA | LosAngeles | 62 | 497 |
| ALASKA | Phoenix | 12 | 221 |
| ALASKA | SanDiego | 20 | 212 |
| ALASKA | SanFrancisco | 102 | 503 |
| ALASKA | Seattle | 305 | 1841 |
| AM WEST | LosAngeles | 117 | 694 |
| AM WEST | Phoenix | 415 | 4840 |
| AM WEST | SanDiego | 65 | 383 |
| AM WEST | SanFrancisco | 129 | 320 |
| AM WEST | Seattle | 61 | 201 |
In order to analyze the data first the glimpse function was used in order to see if the delayed and on time variables were integers.
After utilizing the spread() function we see that on time needs to be renamed which was done immediately after the glimpse function.
glimpse(flights_spread)
## Observations: 10
## Variables: 4
## $ Airline <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", ...
## $ Destination <chr> "LosAngeles", "Phoenix", "SanDiego", "SanFrancisco...
## $ delayed <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ `on time` <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
colnames (flights_spread) <- c("Airline", "Destination", "Delayed","OnTime")
names(flights_spread)
## [1] "Airline" "Destination" "Delayed" "OnTime"
Next we create a variable entitled TotFlights which adds the Delayed and OnTime variables.
flights_spread <- flights_spread %>% mutate(TotFlights = Delayed + OnTime)
flights_spread %>% select(Airline,Destination,TotFlights)
## # A tibble: 10 x 3
## Airline Destination TotFlights
## <chr> <chr> <int>
## 1 ALASKA LosAngeles 559
## 2 ALASKA Phoenix 233
## 3 ALASKA SanDiego 232
## 4 ALASKA SanFrancisco 605
## 5 ALASKA Seattle 2146
## 6 AM WEST LosAngeles 811
## 7 AM WEST Phoenix 5255
## 8 AM WEST SanDiego 448
## 9 AM WEST SanFrancisco 449
## 10 AM WEST Seattle 262
Next we create a variable entitled DelayedRatio which divides the Delayed variable by TotFlights
flights_spread <- flights_spread %>% mutate(DelayedRatio = Delayed/TotFlights)
flights_spread %>% select(Airline,Destination,DelayedRatio)
## # A tibble: 10 x 3
## Airline Destination DelayedRatio
## <chr> <chr> <dbl>
## 1 ALASKA LosAngeles 0.111
## 2 ALASKA Phoenix 0.0515
## 3 ALASKA SanDiego 0.0862
## 4 ALASKA SanFrancisco 0.169
## 5 ALASKA Seattle 0.142
## 6 AM WEST LosAngeles 0.144
## 7 AM WEST Phoenix 0.0790
## 8 AM WEST SanDiego 0.145
## 9 AM WEST SanFrancisco 0.287
## 10 AM WEST Seattle 0.233
Next we create a variable entitled OnTimeRatio which divides the OnTime variable by TotFlights
flights_spread <- flights_spread %>% mutate(OnTimeRatio = OnTime/TotFlights)
flights_spread %>% select(Airline,Destination,OnTimeRatio)
## # A tibble: 10 x 3
## Airline Destination OnTimeRatio
## <chr> <chr> <dbl>
## 1 ALASKA LosAngeles 0.889
## 2 ALASKA Phoenix 0.948
## 3 ALASKA SanDiego 0.914
## 4 ALASKA SanFrancisco 0.831
## 5 ALASKA Seattle 0.858
## 6 AM WEST LosAngeles 0.856
## 7 AM WEST Phoenix 0.921
## 8 AM WEST SanDiego 0.855
## 9 AM WEST SanFrancisco 0.713
## 10 AM WEST Seattle 0.767
LowDel5 <- arrange(flights_spread, Delayed)
LowDel5 %>% select(Airline:Delayed) %>% slice(1:5)
## # A tibble: 5 x 3
## Airline Destination Delayed
## <chr> <chr> <int>
## 1 ALASKA Phoenix 12
## 2 ALASKA SanDiego 20
## 3 AM WEST Seattle 61
## 4 ALASKA LosAngeles 62
## 5 AM WEST SanDiego 65
TopDel5 <- arrange(flights_spread,desc (Delayed))
TopDel5 %>% select(Airline:Delayed) %>% slice(1:5)
## # A tibble: 5 x 3
## Airline Destination Delayed
## <chr> <chr> <int>
## 1 AM WEST Phoenix 415
## 2 ALASKA Seattle 305
## 3 AM WEST SanFrancisco 129
## 4 AM WEST LosAngeles 117
## 5 ALASKA SanFrancisco 102
LowOnTime5 <- arrange(flights_spread, OnTime)
LowOnTime5 %>% select(Airline,Destination,OnTime) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination OnTime
## <chr> <chr> <int>
## 1 AM WEST Seattle 201
## 2 ALASKA SanDiego 212
## 3 ALASKA Phoenix 221
## 4 AM WEST SanFrancisco 320
## 5 AM WEST SanDiego 383
TopOnTime5 <- arrange(flights_spread, desc(OnTime))
TopOnTime5 %>% select(Airline,Destination,OnTime) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination OnTime
## <chr> <chr> <int>
## 1 AM WEST Phoenix 4840
## 2 ALASKA Seattle 1841
## 3 AM WEST LosAngeles 694
## 4 ALASKA SanFrancisco 503
## 5 ALASKA LosAngeles 497
LowTotFlights5 <- arrange(flights_spread, TotFlights)
LowTotFlights5 %>% select(Airline,Destination,TotFlights) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination TotFlights
## <chr> <chr> <int>
## 1 ALASKA SanDiego 232
## 2 ALASKA Phoenix 233
## 3 AM WEST Seattle 262
## 4 AM WEST SanDiego 448
## 5 AM WEST SanFrancisco 449
TopTotFlights5 <- arrange(flights_spread, desc(TotFlights))
TopTotFlights5 %>% select(Airline,Destination,TotFlights) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination TotFlights
## <chr> <chr> <int>
## 1 AM WEST Phoenix 5255
## 2 ALASKA Seattle 2146
## 3 AM WEST LosAngeles 811
## 4 ALASKA SanFrancisco 605
## 5 ALASKA LosAngeles 559
LowDelRats5 <- arrange(flights_spread, DelayedRatio)
LowDelRats5 %>% select(Airline,Destination,DelayedRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination DelayedRatio
## <chr> <chr> <dbl>
## 1 ALASKA Phoenix 0.0515
## 2 AM WEST Phoenix 0.0790
## 3 ALASKA SanDiego 0.0862
## 4 ALASKA LosAngeles 0.111
## 5 ALASKA Seattle 0.142
TopDelRats5 <- arrange(flights_spread, desc(DelayedRatio))
TopDelRats5 %>% select(Airline,Destination,DelayedRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination DelayedRatio
## <chr> <chr> <dbl>
## 1 AM WEST SanFrancisco 0.287
## 2 AM WEST Seattle 0.233
## 3 ALASKA SanFrancisco 0.169
## 4 AM WEST SanDiego 0.145
## 5 AM WEST LosAngeles 0.144
LowOnTimeRat5 <- arrange(flights_spread, OnTimeRatio)
LowOnTimeRat5 %>% select(Airline,Destination,OnTimeRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination OnTimeRatio
## <chr> <chr> <dbl>
## 1 AM WEST SanFrancisco 0.713
## 2 AM WEST Seattle 0.767
## 3 ALASKA SanFrancisco 0.831
## 4 AM WEST SanDiego 0.855
## 5 AM WEST LosAngeles 0.856
TopOnTimeRat5 <- arrange(flights_spread, desc(OnTimeRatio))
TopOnTimeRat5 %>% select(Airline,Destination,OnTimeRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
## Airline Destination OnTimeRatio
## <chr> <chr> <dbl>
## 1 ALASKA Phoenix 0.948
## 2 AM WEST Phoenix 0.921
## 3 ALASKA SanDiego 0.914
## 4 ALASKA LosAngeles 0.889
## 5 ALASKA Seattle 0.858
delayed <- flights_spread %>% group_by(Airline) %>% summarise(DelayedMean=mean(Delayed), DelayedSD=sd(Delayed), DelayedMed=median(Delayed))
delayed
## # A tibble: 2 x 4
## Airline DelayedMean DelayedSD DelayedMed
## <chr> <dbl> <dbl> <int>
## 1 ALASKA 100 120 62
## 2 AM WEST 157 147 117
boxplot(flights_spread$Delayed ~ flights_spread$Airline, main = "Delayed Flights Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Delayed Flights", col = c("yellow","beige"))
barplot(delayed$DelayedMean, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Delayed Flights Mean", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")
on_time <- flights_spread %>% group_by(Airline) %>% summarise(OnTimeMean=mean(OnTime), OnTimeSD=sd(OnTime), OnTimeMed=median(OnTime))
on_time
## # A tibble: 2 x 4
## Airline OnTimeMean OnTimeSD OnTimeMed
## <chr> <dbl> <dbl> <int>
## 1 ALASKA 655 678 497
## 2 AM WEST 1288 1994 383
boxplot(flights_spread$OnTime ~ flights_spread$Airline, main = "On Time Flights Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of On Time Flights", col = c("yellow","beige"))
barplot(on_time$OnTimeMed, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: On Time Flights Median", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")
total_flights <- flights_spread %>% group_by(Airline) %>% summarise(TotFlightsMean=mean(TotFlights), TotFlightsSD=sd(TotFlights), TotFlightsMed=median(TotFlights))
total_flights
## # A tibble: 2 x 4
## Airline TotFlightsMean TotFlightsSD TotFlightsMed
## <chr> <dbl> <dbl> <int>
## 1 ALASKA 755 797 559
## 2 AM WEST 1445 2139 449
boxplot(flights_spread$TotFlights ~ flights_spread$Airline, main = "Total Flights Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Flights", col = c("yellow","beige"))
barplot(total_flights$TotFlightsMed, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Total Number of Flights Median", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")
delayed_ratio <- flights_spread %>% group_by(Airline) %>% summarise(DelRatMean=mean(DelayedRatio), DelRatSD=sd(DelayedRatio), DelRatMed=median(DelayedRatio))
delayed_ratio
## # A tibble: 2 x 4
## Airline DelRatMean DelRatSD DelRatMed
## <chr> <dbl> <dbl> <dbl>
## 1 ALASKA 0.112 0.0459 0.111
## 2 AM WEST 0.178 0.0821 0.145
boxplot(flights_spread$DelayedRatio ~ flights_spread$Airline, main = "Delayed Ratio Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Flights", col = c("yellow","beige"))
round(delayed_ratio$DelRatMean,digits = 2)
## [1] 0.11 0.18
barplot(delayed_ratio$DelRatMean, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Delayed Ratio", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")
ontime_ratio <- flights_spread %>% group_by(Airline) %>% summarise(OnTRatMean=mean(OnTimeRatio), OnTRatSD=sd(OnTimeRatio), OnTRatMed=median(OnTimeRatio))
ontime_ratio
## # A tibble: 2 x 4
## Airline OnTRatMean OnTRatSD OnTRatMed
## <chr> <dbl> <dbl> <dbl>
## 1 ALASKA 0.888 0.0459 0.889
## 2 AM WEST 0.822 0.0821 0.855
boxplot(flights_spread$OnTimeRatio ~ flights_spread$Airline, main = "On Time Ratio Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Flights", col = c("yellow","beige"))
round(ontime_ratio$OnTRatMed,digits = 2)
## [1] 0.89 0.85
barplot(ontime_ratio$OnTRatMed, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Delayed Ratio Median", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")
Creating the TotFlights,DelayedRatio, and OnTimeRatio allows us to explore deeper and see how the the airlines have a close median of on time and delayed flights.