airflight_chart<-read.csv('https://raw.githubusercontent.com/vitugo23/DATA607/main/airflight_chart.csv', stringsAsFactors = FALSE)
###loading the libraries required for the assignment. ###
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
colnames(airflight_chart)[1] = "Airline_name"
colnames(airflight_chart)[2] = "Arrival_Status"
destination_names <- colnames(airflight_chart[3:length(airflight_chart)])
destination_names
## [1] "Los.Angeles" "Phoenix" "San.Diego" "San.Fransisco"
## [5] "Seattle"
airflight_chart <- airflight_chart %>% na.omit()
print(airflight_chart)
## Airline_name Arrival_Status Los.Angeles Phoenix San.Diego San.Fransisco
## 1 ALASKA on time 497 221 212 503
## 2 delayed 62 12 20 102
## 4 AM WEST on time 694 4,840 383 320
## 5 delayed 117 415 65 129
## Seattle
## 1 1,841
## 2 305
## 4 201
## 5 61
airflight_chart[destination_names] <- airflight_chart[destination_names] %>% apply(MARGIN = 2, FUN = function(x) as.numeric(str_remove(x,",")))
print(airflight_chart)
## Airline_name Arrival_Status Los.Angeles Phoenix San.Diego San.Fransisco
## 1 ALASKA on time 497 221 212 503
## 2 delayed 62 12 20 102
## 4 AM WEST on time 694 4840 383 320
## 5 delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 4 201
## 5 61
l_data <- airflight_chart %>% pivot_longer(cols=destination_names,names_to = "Destination_name", values_to = "Num_of_flights")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(destination_names)
##
## # Now:
## data %>% select(all_of(destination_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(l_data)
## # A tibble: 20 × 4
## Airline_name Arrival_Status Destination_name Num_of_flights
## <chr> <chr> <chr> <dbl>
## 1 "ALASKA" on time Los.Angeles 497
## 2 "ALASKA" on time Phoenix 221
## 3 "ALASKA" on time San.Diego 212
## 4 "ALASKA" on time San.Fransisco 503
## 5 "ALASKA" on time Seattle 1841
## 6 "" delayed Los.Angeles 62
## 7 "" delayed Phoenix 12
## 8 "" delayed San.Diego 20
## 9 "" delayed San.Fransisco 102
## 10 "" delayed Seattle 305
## 11 "AM WEST" on time Los.Angeles 694
## 12 "AM WEST" on time Phoenix 4840
## 13 "AM WEST" on time San.Diego 383
## 14 "AM WEST" on time San.Fransisco 320
## 15 "AM WEST" on time Seattle 201
## 16 "" delayed Los.Angeles 117
## 17 "" delayed Phoenix 415
## 18 "" delayed San.Diego 65
## 19 "" delayed San.Fransisco 129
## 20 "" delayed Seattle 61
l_data <- l_data %>% mutate(Airline_name = ifelse(Airline_name == "", NA, Airline_name))
print(l_data)
## # A tibble: 20 × 4
## Airline_name Arrival_Status Destination_name Num_of_flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Fransisco 503
## 5 ALASKA on time Seattle 1841
## 6 <NA> delayed Los.Angeles 62
## 7 <NA> delayed Phoenix 12
## 8 <NA> delayed San.Diego 20
## 9 <NA> delayed San.Fransisco 102
## 10 <NA> delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Fransisco 320
## 15 AM WEST on time Seattle 201
## 16 <NA> delayed Los.Angeles 117
## 17 <NA> delayed Phoenix 415
## 18 <NA> delayed San.Diego 65
## 19 <NA> delayed San.Fransisco 129
## 20 <NA> delayed Seattle 61
l_data <- l_data %>% fill(Airline_name)
print(l_data)
## # A tibble: 20 × 4
## Airline_name Arrival_Status Destination_name Num_of_flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Fransisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San.Diego 20
## 9 ALASKA delayed San.Fransisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Fransisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los.Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San.Diego 65
## 19 AM WEST delayed San.Fransisco 129
## 20 AM WEST delayed Seattle 61
l_data <- l_data %>% janitor::clean_names()
print(l_data)
## # A tibble: 20 × 4
## airline_name arrival_status destination_name num_of_flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Fransisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San.Diego 20
## 9 ALASKA delayed San.Fransisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Fransisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los.Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San.Diego 65
## 19 AM WEST delayed San.Fransisco 129
## 20 AM WEST delayed Seattle 61
t_data <- l_data %>% spread(key=arrival_status, value=num_of_flights)
print(t_data)
## # A tibble: 10 × 4
## airline_name destination_name delayed `on time`
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Fransisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Fransisco 129 320
## 10 AM WEST Seattle 61 201
t_data <- t_data %>% janitor::clean_names()
print(t_data)
## # A tibble: 10 × 4
## airline_name destination_name delayed on_time
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Fransisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Fransisco 129 320
## 10 AM WEST Seattle 61 201
t_data <- t_data %>% mutate(destination_name = str_replace(destination_name, "\\.", " "))
print(t_data)
## # A tibble: 10 × 4
## airline_name destination_name delayed on_time
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Fransisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Fransisco 129 320
## 10 AM WEST Seattle 61 201
#Analisys# ### I am adding a number of flights, delayed, and on time fields to analized data between the two airlines.###
airflight_data <- t_data
airflight_data <- airflight_data %>% mutate(num_of_flights = delayed+on_time,
pct_delayed = delayed/num_of_flights,
pct_ontime = on_time/num_of_flights)
print(airflight_data)
## # A tibble: 10 × 7
## airline_name destination_name delayed on_time num_of_flights pct_delayed
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497 559 0.111
## 2 ALASKA Phoenix 12 221 233 0.0515
## 3 ALASKA San Diego 20 212 232 0.0862
## 4 ALASKA San Fransisco 102 503 605 0.169
## 5 ALASKA Seattle 305 1841 2146 0.142
## 6 AM WEST Los Angeles 117 694 811 0.144
## 7 AM WEST Phoenix 415 4840 5255 0.0790
## 8 AM WEST San Diego 65 383 448 0.145
## 9 AM WEST San Fransisco 129 320 449 0.287
## 10 AM WEST Seattle 61 201 262 0.233
## # ℹ 1 more variable: pct_ontime <dbl>
airline_summary <- airflight_data %>% group_by(airline_name) %>% summarize(total_delayed = sum(delayed),
total_ontime = sum(on_time),
total_flights = sum(num_of_flights),
pct_delayed = (total_delayed/total_flights),
pct_ontime = (total_ontime/total_flights))
print(airline_summary)
## # A tibble: 2 × 6
## airline_name total_delayed total_ontime total_flights pct_delayed pct_ontime
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 501 3274 3775 0.133 0.867
## 2 AM WEST 787 6438 7225 0.109 0.891
library(ggplot2)
ggplot(data=airline_summary, aes(x=airline_name, y=total_delayed, fill=airline_name)) +
geom_bar(stat='identity') +
labs(title = "\n Number of Delayed Flights by Airline", y="Number of Delayed Flights", fill="Airline Name") +
theme(axis.title.x = element_blank()) +
geom_text(aes(label=total_delayed), vjust = 1.2,position = position_dodge(.8), size = 4)