## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## Used Rbind to generate the reproducible data
flights <- rbind(c(NA, NA, "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
c("ALASKA", "on time", 497, 221, 212, 503, 1841),
c(NA, "delayed", 62, 12, 20, 102, 305),
c(NA, NA, NA, NA, NA, NA, NA),
c("AM WEST", "on time", 694, 4840, 383, 320, 201),
c(NA, "delayed", 117, 415, 65, 129, 61))
write.table(flights, file = "C:\\cuny education material\\Data607_projects_git\\flight.csv", sep = ",", col.names=F, row.names=F)#display data in tabular format using kable
flights <-read.csv(paste("https://raw.githubusercontent.com/yathdeep/data607_labs/main/flight.csv"))
#fill the airline value in the NA place in 1 column,2nd and 4 row
#and also remove space between on time value
flights[2,1]<-flights[1,1]
flights[5,1]<-flights[4,1]
flights[,2] <- sapply(flights[,2], str_replace, " ", "_")
kable(flights) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%", height = "700px") | NA. | NA..1 | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | on_time | 497 | 221 | 212 | 503 | 1841 |
| ALASKA | delayed | 62 | 12 | 20 | 102 | 305 |
| NA | NA | NA | NA | NA | NA | NA |
| AM WEST | on_time | 694 | 4840 | 383 | 320 | 201 |
| AM WEST | delayed | 117 | 415 | 65 | 129 | 61 |
##rename NA. column with Airlines and NA.1 column with Status
#gather() will be used to transform dataset from its untidy form
#to a normalized tidy long form to fulfill the rule to convert variables into
#observation. Here, variables for destinations like Los.Angeles, San.Francisco
#etc. Seattle became observations.
tidy_airlines_data<-flights%>%na.omit()%>%rename("Airlines"="NA.","Status"="NA..1")%>%gather("Destination","Number of Flights",3:7)
tidy_airlines_data<-tidy_airlines_data%>%arrange(Airlines,desc(`Number of Flights`))
max_numberofflightsindicator<-which.max(tidy_airlines_data$`Number of Flights`)
kable(tidy_airlines_data) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(max_numberofflightsindicator, background = "pink") %>%
scroll_box(width = "100%", height = "700px")| Airlines | Status | Destination | Number of Flights |
|---|---|---|---|
| ALASKA | on_time | Seattle | 1841 |
| ALASKA | on_time | San.Francisco | 503 |
| ALASKA | on_time | Los.Angeles | 497 |
| ALASKA | delayed | Seattle | 305 |
| ALASKA | on_time | Phoenix | 221 |
| ALASKA | on_time | San.Diego | 212 |
| ALASKA | delayed | San.Francisco | 102 |
| ALASKA | delayed | Los.Angeles | 62 |
| ALASKA | delayed | San.Diego | 20 |
| ALASKA | delayed | Phoenix | 12 |
| AM WEST | on_time | Phoenix | 4840 |
| AM WEST | on_time | Los.Angeles | 694 |
| AM WEST | delayed | Phoenix | 415 |
| AM WEST | on_time | San.Diego | 383 |
| AM WEST | on_time | San.Francisco | 320 |
| AM WEST | on_time | Seattle | 201 |
| AM WEST | delayed | San.Francisco | 129 |
| AM WEST | delayed | Los.Angeles | 117 |
| AM WEST | delayed | San.Diego | 65 |
| AM WEST | delayed | Seattle | 61 |
#spread is being used to transform data to wide form for better analysis.
wide_tidy_flights<- tidy_airlines_data %>%
spread(key = Status, value = `Number of Flights`)
kable(wide_tidy_flights) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%", height = "700px")| Airlines | Destination | delayed | on_time |
|---|---|---|---|
| ALASKA | Los.Angeles | 62 | 497 |
| ALASKA | Phoenix | 12 | 221 |
| ALASKA | San.Diego | 20 | 212 |
| ALASKA | San.Francisco | 102 | 503 |
| ALASKA | Seattle | 305 | 1841 |
| AM WEST | Los.Angeles | 117 | 694 |
| AM WEST | Phoenix | 415 | 4840 |
| AM WEST | San.Diego | 65 | 383 |
| AM WEST | San.Francisco | 129 | 320 |
| AM WEST | Seattle | 61 | 201 |
flights_with_otherstats <-wide_tidy_flights%>%
mutate(total_flights=delayed + on_time,
on_time_percent = round(on_time/total_flights*100, 2),
delayed_percent = round(delayed/total_flights*100, 2))
kable(flights_with_otherstats) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%", height = "700px")| Airlines | Destination | delayed | on_time | total_flights | on_time_percent | delayed_percent |
|---|---|---|---|---|---|---|
| ALASKA | Los.Angeles | 62 | 497 | 559 | 88.91 | 11.09 |
| ALASKA | Phoenix | 12 | 221 | 233 | 94.85 | 5.15 |
| ALASKA | San.Diego | 20 | 212 | 232 | 91.38 | 8.62 |
| ALASKA | San.Francisco | 102 | 503 | 605 | 83.14 | 16.86 |
| ALASKA | Seattle | 305 | 1841 | 2146 | 85.79 | 14.21 |
| AM WEST | Los.Angeles | 117 | 694 | 811 | 85.57 | 14.43 |
| AM WEST | Phoenix | 415 | 4840 | 5255 | 92.10 | 7.90 |
| AM WEST | San.Diego | 65 | 383 | 448 | 85.49 | 14.51 |
| AM WEST | San.Francisco | 129 | 320 | 449 | 71.27 | 28.73 |
| AM WEST | Seattle | 61 | 201 | 262 | 76.72 | 23.28 |
#arrange the flights to show most delays -in the order of airlines
flight_delays<-flights_with_otherstats%>%arrange(Airlines,desc(delayed_percent))
kable(flight_delays) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%", height = "700px")| Airlines | Destination | delayed | on_time | total_flights | on_time_percent | delayed_percent |
|---|---|---|---|---|---|---|
| ALASKA | San.Francisco | 102 | 503 | 605 | 83.14 | 16.86 |
| ALASKA | Seattle | 305 | 1841 | 2146 | 85.79 | 14.21 |
| ALASKA | Los.Angeles | 62 | 497 | 559 | 88.91 | 11.09 |
| ALASKA | San.Diego | 20 | 212 | 232 | 91.38 | 8.62 |
| ALASKA | Phoenix | 12 | 221 | 233 | 94.85 | 5.15 |
| AM WEST | San.Francisco | 129 | 320 | 449 | 71.27 | 28.73 |
| AM WEST | Seattle | 61 | 201 | 262 | 76.72 | 23.28 |
| AM WEST | San.Diego | 65 | 383 | 448 | 85.49 | 14.51 |
| AM WEST | Los.Angeles | 117 | 694 | 811 | 85.57 | 14.43 |
| AM WEST | Phoenix | 415 | 4840 | 5255 | 92.10 | 7.90 |
ontimeflights<-flights_with_otherstats%>%arrange(desc(on_time_percent))
kable(ontimeflights) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%", height = "700px")| Airlines | Destination | delayed | on_time | total_flights | on_time_percent | delayed_percent |
|---|---|---|---|---|---|---|
| ALASKA | Phoenix | 12 | 221 | 233 | 94.85 | 5.15 |
| AM WEST | Phoenix | 415 | 4840 | 5255 | 92.10 | 7.90 |
| ALASKA | San.Diego | 20 | 212 | 232 | 91.38 | 8.62 |
| ALASKA | Los.Angeles | 62 | 497 | 559 | 88.91 | 11.09 |
| ALASKA | Seattle | 305 | 1841 | 2146 | 85.79 | 14.21 |
| AM WEST | Los.Angeles | 117 | 694 | 811 | 85.57 | 14.43 |
| AM WEST | San.Diego | 65 | 383 | 448 | 85.49 | 14.51 |
| ALASKA | San.Francisco | 102 | 503 | 605 | 83.14 | 16.86 |
| AM WEST | Seattle | 61 | 201 | 262 | 76.72 | 23.28 |
| AM WEST | San.Francisco | 129 | 320 | 449 | 71.27 | 28.73 |
ggplot(flights_with_otherstats, aes(x=Destination, y=delayed_percent, fill=Airlines))+
geom_bar(stat="identity", position=position_dodge()) +
theme_classic()+
geom_text(aes(label=delayed_percent), hjust=1,position= position_dodge(width=0.9), color="black") +
ggtitle("Flights Delayed by Airline by City %") +
xlab("Destination") + ylab("Flights Delayed %") +
coord_flip() ## More comparison
airlines_compared<-flights_with_otherstats%>%group_by(Airlines)%>%
summarise(flight_on_time = sum(on_time), flights_delayed = sum(delayed),
percent_on_time = round(flight_on_time/sum(flight_on_time,flights_delayed )*100,2),
percent_delay = round(flights_delayed/sum(flight_on_time,flights_delayed)*100,2))%>%
select(Airlines,percent_on_time,percent_delay)## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Airlines percent_on_time percent_delay
## <chr> <dbl> <dbl>
## 1 AM WEST 89.1 10.9
## 2 ALASKA 86.7 13.3
#view the above comparison on the chart
ggplot(airlines_compared, aes(x=Airlines, y=percent_delay , fill=Airlines))+
geom_bar(stat="identity", position=position_dodge(),width = 0.3) +
theme_classic()+
ggtitle("Overall % Of Delayed Flights by Airlines") +
xlab("AirLines") + ylab("Delayed %")