knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
urlfile="https://raw.githubusercontent.com/Nhodgkinson/607A4/main/Airline%20Info.csv"
mydata<-read_csv(url(urlfile))
## New names:
## • `` -> `...1`
## • `` -> `...2`
## Rows: 5 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, ...2
## dbl (3): Los Angeles, San Diego, San Francisco
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df<-gather(mydata, "location", "n", 3:7)
df<-df[!is.na(df$n),] #Remove NA values from the "n" column
df<-df %>% fill(...1) #fill in NA rows in column 1 with value above
df$x <- paste(df$...1,df$...2)#Concatenating column 1 and 2 into a new column to use later
adf<-subset(df, select= c(3,5,4))#new dataframe that is a subset of relevant data. Location, n, and x
adf<-spread(adf, x, n)# this takes the x column and tidy's it into columns using n as the value
#using group_by and summarise to output mean and sum by location
df %>% group_by(location) %>%
summarise(mean= mean(n), sum =sum(n))
## # A tibble: 5 × 3
## location mean sum
## <chr> <dbl> <dbl>
## 1 Los Angeles 342. 1370
## 2 Phoenix 1372 5488
## 3 San Diego 170 680
## 4 San Francisco 264. 1054
## 5 Seattle 602 2408
#using the tidy df and creating new columns
adf$total <- rowSums(adf[2:5])
adf$avg <- rowMeans(adf[2:5])
adf
## # A tibble: 5 × 7
## location `ALASKA delayed` `ALASKA on time` AM WEST …¹ AM WE…² total avg
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Los Angeles 62 497 117 694 1370 342.
## 2 Phoenix 12 221 415 4840 5488 1372
## 3 San Diego 20 212 65 383 680 170
## 4 San Francisco 102 503 129 320 1054 264.
## 5 Seattle 305 1841 61 201 2408 602
## # … with abbreviated variable names ¹`AM WEST delayed`, ²`AM WEST on time`
#Here I have the mean of Alaska and AM West columns being displayed individually
print("Number of delayed Alaska flights: ")
## [1] "Number of delayed Alaska flights: "
mean(adf$`ALASKA delayed`)
## [1] 100.2
print("Number of on time Alaska flights: ")
## [1] "Number of on time Alaska flights: "
mean(adf$`ALASKA on time`)
## [1] 654.8
print("Number of delayed AM West flights: ")
## [1] "Number of delayed AM West flights: "
mean(adf$`AM WEST delayed`)
## [1] 157.4
print("Number of on time AM West flights: ")
## [1] "Number of on time AM West flights: "
mean(adf$`AM WEST on time`)
## [1] 1287.6
#I would like to know which airline has a higher ratio of on time flights
adf$AR <- (AR = adf$`ALASKA on time`/adf$`ALASKA delayed`)
adf$AMR <- (AMR = adf$`AM WEST on time`/adf$`AM WEST delayed`)
adf$avg_AR<-mean(adf$AR)
adf$avg_AMR<-mean(adf$AMR)
#Alaska has the higher ratio of on time flights with a ratio of 9.6 compared to AM West with a ratio of 5.8