knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

urlfile="https://raw.githubusercontent.com/Nhodgkinson/607A4/main/Airline%20Info.csv"

mydata<-read_csv(url(urlfile))
## New names:
## • `` -> `...1`
## • `` -> `...2`
## Rows: 5 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, ...2
## dbl (3): Los Angeles, San Diego, San Francisco
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df<-gather(mydata, "location", "n", 3:7)

df<-df[!is.na(df$n),] #Remove NA values from the "n" column

df<-df %>% fill(...1) #fill in NA rows in column 1 with value above

df$x <- paste(df$...1,df$...2)#Concatenating column 1 and 2 into a new column to use later

adf<-subset(df, select= c(3,5,4))#new dataframe that is a subset of relevant data. Location, n, and x

adf<-spread(adf, x, n)# this takes the x column and tidy's it into columns using n as the value
#using group_by and summarise to output mean and sum by location
df %>% group_by(location) %>%
  summarise(mean= mean(n), sum =sum(n))
## # A tibble: 5 × 3
##   location       mean   sum
##   <chr>         <dbl> <dbl>
## 1 Los Angeles    342.  1370
## 2 Phoenix       1372   5488
## 3 San Diego      170    680
## 4 San Francisco  264.  1054
## 5 Seattle        602   2408
#using the tidy df and creating new columns 
adf$total <- rowSums(adf[2:5])
adf$avg <- rowMeans(adf[2:5])
adf
## # A tibble: 5 × 7
##   location      `ALASKA delayed` `ALASKA on time` AM WEST …¹ AM WE…² total   avg
##   <chr>                    <dbl>            <dbl>      <dbl>   <dbl> <dbl> <dbl>
## 1 Los Angeles                 62              497        117     694  1370  342.
## 2 Phoenix                     12              221        415    4840  5488 1372 
## 3 San Diego                   20              212         65     383   680  170 
## 4 San Francisco              102              503        129     320  1054  264.
## 5 Seattle                    305             1841         61     201  2408  602 
## # … with abbreviated variable names ¹​`AM WEST delayed`, ²​`AM WEST on time`
#Here I have the mean of Alaska and AM West columns being displayed individually
print("Number of delayed Alaska flights: ") 
## [1] "Number of delayed Alaska flights: "
mean(adf$`ALASKA delayed`)
## [1] 100.2
print("Number of on time Alaska flights: ") 
## [1] "Number of on time Alaska flights: "
mean(adf$`ALASKA on time`)
## [1] 654.8
print("Number of delayed AM West flights: ") 
## [1] "Number of delayed AM West flights: "
mean(adf$`AM WEST delayed`)
## [1] 157.4
print("Number of on time AM West flights: ") 
## [1] "Number of on time AM West flights: "
mean(adf$`AM WEST on time`)
## [1] 1287.6
#I would like to know which airline has a higher ratio of on time flights

adf$AR <-  (AR = adf$`ALASKA on time`/adf$`ALASKA delayed`)
adf$AMR <- (AMR = adf$`AM WEST on time`/adf$`AM WEST delayed`)
adf$avg_AR<-mean(adf$AR)
adf$avg_AMR<-mean(adf$AMR)

#Alaska has the higher ratio of on time flights with a ratio of 9.6 compared to AM West with a ratio of 5.8