library("tidyr")
library(dplyr)
library(ggplot2)
filename <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/flight_dataset.csv")
flight_dataset_df <- read.csv(filename,stringsAsFactors = FALSE)
flight_dataset_df
## Airline Arrival_Status Los.Angeles Phoenix San.Diego San.Franciso
## 1 ALASKA On time 497 221 212 503
## 2 ALASKA Delayed 62 12 20 102
## 3 AMWEST On time 694 4840 383 320
## 4 AMWEST Delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 3 201
## 4 61
Flight_tidy <- tbl_df(flight_dataset_df)
#Narrow down the table structure
flight_freq <- Flight_tidy %>%
gather (State,frequency,-Airline,-Arrival_Status)
flight_freq
## # A tibble: 20 × 4
## Airline Arrival_Status State frequency
## <chr> <chr> <chr> <int>
## 1 ALASKA On time Los.Angeles 497
## 2 ALASKA Delayed Los.Angeles 62
## 3 AMWEST On time Los.Angeles 694
## 4 AMWEST Delayed Los.Angeles 117
## 5 ALASKA On time Phoenix 221
## 6 ALASKA Delayed Phoenix 12
## 7 AMWEST On time Phoenix 4840
## 8 AMWEST Delayed Phoenix 415
## 9 ALASKA On time San.Diego 212
## 10 ALASKA Delayed San.Diego 20
## 11 AMWEST On time San.Diego 383
## 12 AMWEST Delayed San.Diego 65
## 13 ALASKA On time San.Franciso 503
## 14 ALASKA Delayed San.Franciso 102
## 15 AMWEST On time San.Franciso 320
## 16 AMWEST Delayed San.Franciso 129
## 17 ALASKA On time Seattle 1841
## 18 ALASKA Delayed Seattle 305
## 19 AMWEST On time Seattle 201
## 20 AMWEST Delayed Seattle 61
on_time_ds<- select (
filter(flight_freq, Arrival_Status=='On time' )
,Airline,State,On_time_freq=frequency
)
delayed_ds <-select (
filter(flight_freq, Arrival_Status=='Delayed' )
,Airline,State,Delayed_freq=frequency
)
flight_ds <-inner_join(on_time_ds, delayed_ds, by = c("Airline","State"))
flight_ds
## # A tibble: 10 × 4
## Airline State On_time_freq Delayed_freq
## <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 497 62
## 2 AMWEST Los.Angeles 694 117
## 3 ALASKA Phoenix 221 12
## 4 AMWEST Phoenix 4840 415
## 5 ALASKA San.Diego 212 20
## 6 AMWEST San.Diego 383 65
## 7 ALASKA San.Franciso 503 102
## 8 AMWEST San.Franciso 320 129
## 9 ALASKA Seattle 1841 305
## 10 AMWEST Seattle 201 61
Ratio_delay_ds <- mutate(flight_ds, Total_freq= On_time_freq + Delayed_freq,
Average_Delays = Delayed_freq/Total_freq)
arrange(Ratio_delay_ds,State,Average_Delays)
## # A tibble: 10 × 6
## Airline State On_time_freq Delayed_freq Total_freq
## <chr> <chr> <int> <int> <int>
## 1 ALASKA Los.Angeles 497 62 559
## 2 AMWEST Los.Angeles 694 117 811
## 3 ALASKA Phoenix 221 12 233
## 4 AMWEST Phoenix 4840 415 5255
## 5 ALASKA San.Diego 212 20 232
## 6 AMWEST San.Diego 383 65 448
## 7 ALASKA San.Franciso 503 102 605
## 8 AMWEST San.Franciso 320 129 449
## 9 ALASKA Seattle 1841 305 2146
## 10 AMWEST Seattle 201 61 262
## # ... with 1 more variables: Average_Delays <dbl>
ggplot(Ratio_delay_ds,aes(x = State, y =Average_Delays))+
geom_point(aes(color=Airline), # colour depends on cond2
size=3)
Flight_graph <-mutate(Ratio_delay_ds, Airline_State= paste(Airline, State, sep ="_"))
ggplot(Flight_graph, aes(x=State,y=Average_Delays)) + geom_boxplot(aes(color=Airline))
Flight_summary <- Ratio_delay_ds %>%
group_by(Airline) %>%
summarise(Total_average_Delay = sum(Average_Delays))
Flight_summary
## # A tibble: 2 × 2
## Airline Total_average_Delay
## <chr> <dbl>
## 1 ALASKA 0.5593413
## 2 AMWEST 0.8884576
ggplot(Flight_summary, aes(y=Total_average_Delay,x=Airline)) + geom_boxplot(aes(color= Airline),size =3)
City_summary <- Ratio_delay_ds %>%
group_by(State) %>%
summarise(Total_average_Delay = sum(Average_Delays))
arrange(City_summary,Total_average_Delay)
## # A tibble: 5 × 2
## State Total_average_Delay
## <chr> <dbl>
## 1 Phoenix 0.1304746
## 2 San.Diego 0.2312962
## 3 Los.Angeles 0.2551787
## 4 Seattle 0.3749493
## 5 San.Franciso 0.4559002
ggplot(City_summary, aes(y=Total_average_Delay,x=State)) + geom_boxplot(aes(color= State),size =3)
Simpson Paradox we can see that the trend reverses after we aggregate the result set to Airline level
sp<- Ratio_delay_ds %>%
group_by(Airline) %>%
summarise(Total_ontime_freq =sum(On_time_freq), Total_Delay_freq = sum(Delayed_freq), Total_entire_freq = sum(Total_freq)) %>%
mutate(Delay_total_avg = Total_Delay_freq/Total_entire_freq)
sp
## # A tibble: 2 × 5
## Airline Total_ontime_freq Total_Delay_freq Total_entire_freq
## <chr> <int> <int> <int>
## 1 ALASKA 3274 501 3775
## 2 AMWEST 6438 787 7225
## # ... with 1 more variables: Delay_total_avg <dbl>
We can see that the AMWEST airline had a better performance overall while ALASKA had more delays overall.
ggplot(sp, aes(y=Airline,x=Delay_total_avg)) + geom_boxplot(aes(color= Airline),size =3)