UP_Assignment

# Loading the required packages
library(tidyr)

## Warning: package 'tidyr' was built under R version 4.3.2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Downloading the url file and reading it into a variable called "airline data"
url_file<- "https://raw.githubusercontent.com/ursulapodosenin/DAT-607/main/airline_data.csv"
airline_data <- read.table(url_file, header=TRUE, sep=",", na.strings = c("", "NA"))
airline_data

##         X     X.1 Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503   1,841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA    <NA>        NA            NA    <NA>
## 4 AM WEST on time         694   4,840       383           320     201
## 5    <NA> delayed         117     415        65           129      61

# Extracting the information from the variable "airline_data" and organizing it as well as creating factors out of the cities 
long_airline_data <- gather(airline_data, City, Num_Flights, Los.Angeles:Seattle, factor_key=TRUE)
long_airline_data

##          X     X.1          City Num_Flights
## 1   ALASKA on time   Los.Angeles         497
## 2     <NA> delayed   Los.Angeles          62
## 3     <NA>    <NA>   Los.Angeles        <NA>
## 4  AM WEST on time   Los.Angeles         694
## 5     <NA> delayed   Los.Angeles         117
## 6   ALASKA on time       Pheonix         221
## 7     <NA> delayed       Pheonix          12
## 8     <NA>    <NA>       Pheonix        <NA>
## 9  AM WEST on time       Pheonix       4,840
## 10    <NA> delayed       Pheonix         415
## 11  ALASKA on time     San.Diego         212
## 12    <NA> delayed     San.Diego          20
## 13    <NA>    <NA>     San.Diego        <NA>
## 14 AM WEST on time     San.Diego         383
## 15    <NA> delayed     San.Diego          65
## 16  ALASKA on time San.Francisco         503
## 17    <NA> delayed San.Francisco         102
## 18    <NA>    <NA> San.Francisco        <NA>
## 19 AM WEST on time San.Francisco         320
## 20    <NA> delayed San.Francisco         129
## 21  ALASKA on time       Seattle       1,841
## 22    <NA> delayed       Seattle         305
## 23    <NA>    <NA>       Seattle        <NA>
## 24 AM WEST on time       Seattle         201
## 25    <NA> delayed       Seattle          61

# Removing NA values and filling in the data
long_airline_data <- 
  long_airline_data|>
    mutate(X = as.character(na_if(X, 'NA')))|>
      fill(X, .direction = 'down')

# Removing the empty row 
long_airline_data <- long_airline_data[-c(3, 8, 13, 18, 23), ]

# Replacing character values with integer values
long_airline_data[long_airline_data == "4,840"] <- 4840
long_airline_data[long_airline_data == "1,841"] <- 1841

# Turning the column of chracters into integers
long_airline_data <- transform(long_airline_data,
                             Num_Flights = as.integer(Num_Flights))

# Obtaining summary statistics 
long_airline_data|>
        group_by(X)|> 
          summarise(max = max(Num_Flights),
             min=min(Num_Flights),
             mean=mean(Num_Flights),  
             median=median(Num_Flights),
             standard_deviation=sd(Num_Flights))

## # A tibble: 2 × 6
##   X         max   min  mean median standard_deviation
##   <chr>   <int> <int> <dbl>  <dbl>              <dbl>
## 1 ALASKA   1841    12  378.   216.               544.
## 2 AM WEST  4840    61  722.   260.              1460.

# Calculating the total number of flights by airline
flight_counts <- long_airline_data |>
  group_by(X) |>
  summarise(total_flights = sum(Num_Flights))
flight_counts

## # A tibble: 2 × 2
##   X       total_flights
##   <chr>           <int>
## 1 ALASKA           3775
## 2 AM WEST          7225

# Alaska had 3775 flights, while AM West had 7225 flights, which is 3450 more than Alaska

# Calculating the proportion of flights that were delayed for each airline
delayed_proportions <- long_airline_data |>
  group_by(X) |>
  summarise(delayed_proportion = sum(Num_Flights[X.1 == "delayed"]) / sum(Num_Flights))
delayed_proportions

## # A tibble: 2 × 2
##   X       delayed_proportion
##   <chr>                <dbl>
## 1 ALASKA               0.133
## 2 AM WEST              0.109

# Roughly 13% of Alaska's flights were delayed, while only 11% of AM West's flights were delayed

# Visualizing the number of flights for each city using a bar plot
ggplot(long_airline_data, aes(x = City, y = Num_Flights)) +
  geom_bar(stat = "identity") +
  labs(title = "Number of Flights by City", x = "City", y = "Number of Flights")

# Based on this data, I would choose to fly with AM West, as they have more flights going out than Alaska, and have a lower percentage of delays.

UP_Assignment_Four

Ursula Podosenin

2024-02-24