A CSV was created to match the picture in the assignment using excel. The file was then put into github. The csv was scraped using R and put into a data frame. The data frame was put into a tidy format. Some basic information on the delays between the two airlines was conducted.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RCurl)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
Libraries Loaded
c <- getURL("https://raw.githubusercontent.com/division-zero/Data607/main/Week%205%20Assignment/airline.csv")
#read in the raw file
airline_info_df <- data.frame(read.csv(text = c ))
#put the csv into a dataframe
head(airline_info_df)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
CSV from github was scraped.
airline_info_df[2,1] <- "ALASKA"
airline_info_df[5,1] <- "AM WEST"
#add airlines to their corresponding rows
airline_info_df <- airline_info_df[-c(3), ]
#remove blank row
head(airline_info_df)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
The missing information was added to the dataframe. There
were two rows that did not mention which airline the data was for. The
blank row was deleted.
airline_info_df <- airline_info_df |> rename(Airline = 1, Time = 2)
# rename the columns to something useful
airline_info_df <- airline_info_df |> pivot_longer(Los.Angeles:Seattle, values_to = "frequency", values_drop_na = TRUE)
#pivot the data to a long format by destination. The number of flights were put into "frequency"
airline_info_df <- airline_info_df |> rename(Destination = 3)
#column was renamed to destination
head(airline_info_df)
## # A tibble: 6 × 4
## Airline Time Destination frequency
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
Data was converted into a long format using pivot by
destination.
head(airline_info_df)
## # A tibble: 6 × 4
## Airline Time Destination frequency
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
airline_info_df <- pivot_wider(airline_info_df,
names_from = Time,
values_from = frequency)
# in order to make the data more in line with a tidy format the on time and delay flights were converted into columns
head(airline_info_df)
## # A tibble: 6 × 4
## Airline Destination `on time` delayed
## <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 497 62
## 2 ALASKA Phoenix 221 12
## 3 ALASKA San.Diego 212 20
## 4 ALASKA San.Francisco 503 102
## 5 ALASKA Seattle 1841 305
## 6 AM WEST Los.Angeles 694 117
Data was pivoted again to include the on time flight and
delay flights numbers for each airline and destination. This is the data
frame that was written as a csv at the end.
airline_info_df_analysis <- mutate(airline_info_df, '%_delays' = airline_info_df$delayed / (airline_info_df$`on time`+airline_info_df$delayed) *100 )
airline_info_df_analysis %>% tbl_df %>% print(n=Inf)
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## ℹ Please use `tibble::as_tibble()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 10 × 5
## Airline Destination `on time` delayed `%_delays`
## <chr> <chr> <int> <int> <dbl>
## 1 ALASKA Los.Angeles 497 62 11.1
## 2 ALASKA Phoenix 221 12 5.15
## 3 ALASKA San.Diego 212 20 8.62
## 4 ALASKA San.Francisco 503 102 16.9
## 5 ALASKA Seattle 1841 305 14.2
## 6 AM WEST Los.Angeles 694 117 14.4
## 7 AM WEST Phoenix 4840 415 7.90
## 8 AM WEST San.Diego 383 65 14.5
## 9 AM WEST San.Francisco 320 129 28.7
## 10 AM WEST Seattle 201 61 23.3
#what percent of flights are delayed for each destination?
alaska_only_df <- airline_info_df_analysis |>
filter(Airline == 'ALASKA')
#df with only Alaska airline in it
mean(alaska_only_df$`%_delays`)
## [1] 11.18683
#what is the average percent delay for Alaska?
am_west_only_df <- airline_info_df_analysis |>
filter(Airline == 'AM WEST')
#df with only AM WEST info
mean(am_west_only_df$`%_delays`)
## [1] 17.76915
#what is the average percent delay for AM WEST?
AM_W_On_time_total <- sum(am_west_only_df$`on time`)
AM_W_delayed_total <- sum(am_west_only_df$delayed)
ALASKA_On_time_total <- sum(alaska_only_df$`on time`)
ALASKA_delayed_total <- sum(alaska_only_df$delayed)
AM_W_delay_percent <- AM_W_delayed_total/(AM_W_delayed_total+AM_W_On_time_total)*100
ALASKA_delay_percent <- ALASKA_delayed_total/(ALASKA_delayed_total+ALASKA_On_time_total)*100
#computing the average percent delay adding all flight together for each airline.
print('Average delay % across all flights for ALASKA')
## [1] "Average delay % across all flights for ALASKA"
print(ALASKA_delay_percent)
## [1] 13.27152
print('Average delay % across all flights for AM WEST')
## [1] "Average delay % across all flights for AM WEST"
print(AM_W_delay_percent)
## [1] 10.89273
ggplot( data = airline_info_df_analysis, aes(x = Airline,y = `%_delays`)) + geom_boxplot()
write.csv(airline_info_df, 'tidy_airline.csv')
Alaska has smaller percentage of delays to each destination.
AM West has a smaller percentage of delays overall for all flights
ignoring destination. This discrepancy is explained by the fact that AM
west has the most flights to Phoenix, which has a low % delay.
airline_info_df_analysis shows the percent delay for each destination
for each airline.