# Loading the required packages
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Downloading the url file and reading it into a variable called "airline data"
url_file<- "https://raw.githubusercontent.com/ursulapodosenin/DAT-607/main/airline_data.csv"
airline_data <- read.table(url_file, header=TRUE, sep=",", na.strings = c("", "NA"))
airline_data
## X X.1 Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA <NA> NA NA <NA>
## 4 AM WEST on time 694 4,840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
# Extracting the information from the variable "airline_data" and organizing it as well as creating factors out of the cities
long_airline_data <- gather(airline_data, City, Num_Flights, Los.Angeles:Seattle, factor_key=TRUE)
long_airline_data
## X X.1 City Num_Flights
## 1 ALASKA on time Los.Angeles 497
## 2 <NA> delayed Los.Angeles 62
## 3 <NA> <NA> Los.Angeles <NA>
## 4 AM WEST on time Los.Angeles 694
## 5 <NA> delayed Los.Angeles 117
## 6 ALASKA on time Pheonix 221
## 7 <NA> delayed Pheonix 12
## 8 <NA> <NA> Pheonix <NA>
## 9 AM WEST on time Pheonix 4,840
## 10 <NA> delayed Pheonix 415
## 11 ALASKA on time San.Diego 212
## 12 <NA> delayed San.Diego 20
## 13 <NA> <NA> San.Diego <NA>
## 14 AM WEST on time San.Diego 383
## 15 <NA> delayed San.Diego 65
## 16 ALASKA on time San.Francisco 503
## 17 <NA> delayed San.Francisco 102
## 18 <NA> <NA> San.Francisco <NA>
## 19 AM WEST on time San.Francisco 320
## 20 <NA> delayed San.Francisco 129
## 21 ALASKA on time Seattle 1,841
## 22 <NA> delayed Seattle 305
## 23 <NA> <NA> Seattle <NA>
## 24 AM WEST on time Seattle 201
## 25 <NA> delayed Seattle 61
# Removing NA values and filling in the data
long_airline_data <-
long_airline_data|>
mutate(X = as.character(na_if(X, 'NA')))|>
fill(X, .direction = 'down')
# Removing the empty row
long_airline_data <- long_airline_data[-c(3, 8, 13, 18, 23), ]
# Replacing character values with integer values
long_airline_data[long_airline_data == "4,840"] <- 4840
long_airline_data[long_airline_data == "1,841"] <- 1841
# Turning the column of chracters into integers
long_airline_data <- transform(long_airline_data,
Num_Flights = as.integer(Num_Flights))
# Obtaining summary statistics
long_airline_data|>
group_by(X)|>
summarise(max = max(Num_Flights),
min=min(Num_Flights),
mean=mean(Num_Flights),
median=median(Num_Flights),
standard_deviation=sd(Num_Flights))
## # A tibble: 2 × 6
## X max min mean median standard_deviation
## <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 ALASKA 1841 12 378. 216. 544.
## 2 AM WEST 4840 61 722. 260. 1460.
# Calculating the total number of flights by airline
flight_counts <- long_airline_data |>
group_by(X) |>
summarise(total_flights = sum(Num_Flights))
flight_counts
## # A tibble: 2 × 2
## X total_flights
## <chr> <int>
## 1 ALASKA 3775
## 2 AM WEST 7225
# Alaska had 3775 flights, while AM West had 7225 flights, which is 3450 more than Alaska
# Calculating the proportion of flights that were delayed for each airline
delayed_proportions <- long_airline_data |>
group_by(X) |>
summarise(delayed_proportion = sum(Num_Flights[X.1 == "delayed"]) / sum(Num_Flights))
delayed_proportions
## # A tibble: 2 × 2
## X delayed_proportion
## <chr> <dbl>
## 1 ALASKA 0.133
## 2 AM WEST 0.109
# Roughly 13% of Alaska's flights were delayed, while only 11% of AM West's flights were delayed
# Visualizing the number of flights for each city using a bar plot
ggplot(long_airline_data, aes(x = City, y = Num_Flights)) +
geom_bar(stat = "identity") +
labs(title = "Number of Flights by City", x = "City", y = "Number of Flights")

# Based on this data, I would choose to fly with AM West, as they have more flights going out than Alaska, and have a lower percentage of delays.