library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(stringr)
#read the csv file without any header row
load <- read.csv('https://raw.githubusercontent.com/datanerddhanya/DATA607/main/Arrival_delays.csv',sep = ",", header= FALSE,stringsAsFactors=FALSE, quote = "\"")
# provide a name to the columns
load <- rename(load,
Airline = V1 ,
Arrivaltype = V2,
Los_Angeles = V3,
Phoenix = V4,
San_Diego = V5 ,
San_Francisco = V6,
Seattle = V7
)
#delete the blank row and the row with text showing destination names
load<- load[grep("^[a-z]" , load$Arrivaltype),]
#update the two rows with missing airlines
load$Airline[2]= "ALASKA"
load$Airline[4]= "AMWEST"
#pivot the data.frame longer
load_final <- load |>
pivot_longer(
cols = !(Airline:Arrivaltype) ,
names_to = "Destinations",
values_to = "flight_count" ,
values_drop_na = TRUE
)
#remove comma from the flight count values
load_final$flight_count<- as.numeric(gsub(",","",load_final$flight_count))
AMWEST has more delayed flights than ALASKA Airlines. However for Seattle destination, the arrival delays are more for Alaska Airlines than AMWEST.
load_final %>%
group_by(Airline) %>%
filter(Arrivaltype == "delayed") %>%
summarize(flight_count = sum(flight_count) )
## # A tibble: 2 × 2
## Airline flight_count
## <chr> <dbl>
## 1 ALASKA 501
## 2 AMWEST 787
load_final %>%
group_by(Destinations,Airline) %>%
filter(Arrivaltype == "delayed") %>%
summarize(flight_count = sum(flight_count), )
## `summarise()` has grouped output by 'Destinations'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 3
## # Groups: Destinations [5]
## Destinations Airline flight_count
## <chr> <chr> <dbl>
## 1 Los_Angeles ALASKA 62
## 2 Los_Angeles AMWEST 117
## 3 Phoenix ALASKA 12
## 4 Phoenix AMWEST 415
## 5 San_Diego ALASKA 20
## 6 San_Diego AMWEST 65
## 7 San_Francisco ALASKA 102
## 8 San_Francisco AMWEST 129
## 9 Seattle ALASKA 305
## 10 Seattle AMWEST 61
By plotting the data, i see that Alaska has recorded greater proportion of delayed flights
library(ggplot2)
ggplot( data = load_final , aes( x=Airline , y=flight_count, fill = Arrivaltype )) +
geom_bar(stat="identity")
# geom_line(aes(y= girl_ratio), color = "BLACK") +
# labs( title = "Arrival delays for 2 Airlines", subtitle = "proportion of boys(blue) / girls(black) over time " ,x= "years", y= "propotion of boys/girls")