The
chart above describes arrival delays for two airlines across five
destinations. Your task is to:
(1) Create a .CSV file (or
optionally, a MySQL database!) that includes all of the information
above. You’re encouraged to use a “wide” structure similar to how the
information appears above, so that you can practice tidying and
transformations as described below.
(2) Read the information from
your .CSV file into R, and use tidyr and dplyr as needed to tidy and
transform your data.
(3) Perform analysis to compare the arrival
delays for the two airlines.
(4) Your code should be in an R
Markdown file, posted to rpubs.com, and should include narrative
descriptions of your data cleanup work, analysis, and conclusions.
Please include in your homework submission:
The URL to the .Rmd
file in your GitHub repository. and The URL for your rpubs.com web
page.
Flight_Data <- read_csv('https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-DATA607/main/Flight_Details.csv',
show_col_types = FALSE)
## New names:
## • `` -> `...1`
## • `` -> `...2`
Flight_Data
## # A tibble: 5 × 7
## ...1 ...2 los_angeles phoenix san_diego san_francisco seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Flight_data <- Flight_Data[!apply(is.na(Flight_Data[1:7]),1,all), ]
Flight_data <- data.frame(Flight_data)
colnames(Flight_data) <- c("AIRLINE", "FLIGHT STATUS", "LOS ANGELES", "PHOENIX", "SAN DIEGO", "SAN FRANCISCO", "SEATTLE")
Flight_data
## AIRLINE FLIGHT STATUS LOS ANGELES PHOENIX SAN DIEGO SAN FRANCISCO SEATTLE
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
Flight_data[Flight_data==""]<-NA
Flight_data <- fill(Flight_data, AIRLINE)
Flight_data
## AIRLINE FLIGHT STATUS LOS ANGELES PHOENIX SAN DIEGO SAN FRANCISCO SEATTLE
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
new_flight_df <- Flight_data %>%
pivot_longer(!c("AIRLINE", "FLIGHT STATUS"),
names_to = "DESTINATION", values_to = "FLIGHT TIMES")
new_flight_df
## # A tibble: 20 × 4
## AIRLINE `FLIGHT STATUS` DESTINATION `FLIGHT TIMES`
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time LOS ANGELES 497
## 2 ALASKA on time PHOENIX 221
## 3 ALASKA on time SAN DIEGO 212
## 4 ALASKA on time SAN FRANCISCO 503
## 5 ALASKA on time SEATTLE 1841
## 6 ALASKA delayed LOS ANGELES 62
## 7 ALASKA delayed PHOENIX 12
## 8 ALASKA delayed SAN DIEGO 20
## 9 ALASKA delayed SAN FRANCISCO 102
## 10 ALASKA delayed SEATTLE 305
## 11 AM WEST on time LOS ANGELES 694
## 12 AM WEST on time PHOENIX 4840
## 13 AM WEST on time SAN DIEGO 383
## 14 AM WEST on time SAN FRANCISCO 320
## 15 AM WEST on time SEATTLE 201
## 16 AM WEST delayed LOS ANGELES 117
## 17 AM WEST delayed PHOENIX 415
## 18 AM WEST delayed SAN DIEGO 65
## 19 AM WEST delayed SAN FRANCISCO 129
## 20 AM WEST delayed SEATTLE 61
flight_delay <- spread(new_flight_df, `FLIGHT STATUS` , `FLIGHT TIMES`)
flight_delay <- flight_delay|>
mutate(`Total Flights`= (delayed + `on time`))|>
select(AIRLINE,DESTINATION,delayed,`on time`, `Total Flights`)|>
group_by(AIRLINE)
flight_delay
## # A tibble: 10 × 5
## # Groups: AIRLINE [2]
## AIRLINE DESTINATION delayed `on time` `Total Flights`
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 ALASKA LOS ANGELES 62 497 559
## 2 ALASKA PHOENIX 12 221 233
## 3 ALASKA SAN DIEGO 20 212 232
## 4 ALASKA SAN FRANCISCO 102 503 605
## 5 ALASKA SEATTLE 305 1841 2146
## 6 AM WEST LOS ANGELES 117 694 811
## 7 AM WEST PHOENIX 415 4840 5255
## 8 AM WEST SAN DIEGO 65 383 448
## 9 AM WEST SAN FRANCISCO 129 320 449
## 10 AM WEST SEATTLE 61 201 262
Delayed_DF <- new_flight_df%>% filter(str_detect(`FLIGHT STATUS`,"delayed"))
Delayed_DF
## # A tibble: 10 × 4
## AIRLINE `FLIGHT STATUS` DESTINATION `FLIGHT TIMES`
## <chr> <chr> <chr> <dbl>
## 1 ALASKA delayed LOS ANGELES 62
## 2 ALASKA delayed PHOENIX 12
## 3 ALASKA delayed SAN DIEGO 20
## 4 ALASKA delayed SAN FRANCISCO 102
## 5 ALASKA delayed SEATTLE 305
## 6 AM WEST delayed LOS ANGELES 117
## 7 AM WEST delayed PHOENIX 415
## 8 AM WEST delayed SAN DIEGO 65
## 9 AM WEST delayed SAN FRANCISCO 129
## 10 AM WEST delayed SEATTLE 61
Plot_Delayed <- ggplot(data=Delayed_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, fill=AIRLINE)) + labs(title = "AIRLINE FLIGHTS DELAYED", x= "DESTINATION", y = "DELAYED TIMES") + theme(plot.title = element_text(hjust = 0.5)) + scale_fill_brewer(palette="Paired") +
geom_bar(stat="identity", position=position_dodge()) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
Plot_Delayed
ontime_DF <- new_flight_df%>% filter(str_detect(`FLIGHT STATUS`,"on time"))
ontime_DF
## # A tibble: 10 × 4
## AIRLINE `FLIGHT STATUS` DESTINATION `FLIGHT TIMES`
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time LOS ANGELES 497
## 2 ALASKA on time PHOENIX 221
## 3 ALASKA on time SAN DIEGO 212
## 4 ALASKA on time SAN FRANCISCO 503
## 5 ALASKA on time SEATTLE 1841
## 6 AM WEST on time LOS ANGELES 694
## 7 AM WEST on time PHOENIX 4840
## 8 AM WEST on time SAN DIEGO 383
## 9 AM WEST on time SAN FRANCISCO 320
## 10 AM WEST on time SEATTLE 201
Plot_ontime <- ggplot(data=ontime_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, fill=AIRLINE)) + labs(title = "AIRLINE FLIGHTS THAT ARRIVED ONTIME", x= "DESTINATION", y = "ON-TIME TIMES") + theme(plot.title = element_text(hjust = 0.5)) + scale_fill_brewer(palette="Paired") +
geom_bar(stat="identity", position=position_dodge()) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
Plot_ontime
### Perform analysis to compare the delay and on time arrivals for the
Alaska airlines.
Alaska <- new_flight_df%>% filter(str_detect(`AIRLINE`,"ALASKA"))
compare_plot <- ggplot(NULL, aes(x, y)) +
geom_line(data=Alaska, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="green", size = 1) +
geom_line(data=ontime_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="purple", size = 1)+
labs(title = "DELAYED & ON-TIME FLIGHT - ALASKA", x= "DESTINATION", y = "FLIGHT TIMES", ylim=c(100,1000), breaks = 10) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))+
scale_color_manual(name = "FLIGHT STATUS")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
# display the plot
compare_plot
### Perform analysis to compare the delay and on time arrivals for the
AM West airlines.
Alaska <- new_flight_df%>% filter(str_detect(`AIRLINE`,"AM WEST"))
compare_plot <- ggplot(NULL, aes(x, y)) +
geom_line(data=Alaska, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="green", size = 1) +
geom_line(data=ontime_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="purple", size = 1)+
labs(title = "DELAYED & ON-TIME FLIGHT - AM WEST", x= "DESTINATION", y = "FLIGHT TIMES", ylim=c(100,1000), breaks = 10) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))+
scale_color_manual(name = "FLIGHT STATUS")
# display the plot
compare_plot
The Airline AM West have both more on-time and delayed flights than Alaska airline.
Phoenix Airport had a considerable amount of flights over the other airlines. This airport also had more on-time and delayed flights.