First, I load the appropriate packages.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ readr 1.3.1
## ✔ tibble 2.0.1 ✔ purrr 0.3.0
## ✔ tidyr 0.8.3 ✔ stringr 1.3.1
## ✔ ggplot2 3.1.0 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyr)
Then, I load the data.
raw_data <- read.csv("https://raw.githubusercontent.com/miasiracusa/Data607/master/assignment5/data607%20week5.csv", na.strings = "")
raw_data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Then, I rename the columns and remove the empty row.
airport.data <- (raw_data)
names(airport.data)[c(1, 2, 3, 5, 6)] <- c("airline", "flight status", "Los Angeles", "San Diego", "San Francisco")
airport.data[5,1] <- c("ALASKA")
airport.data[2,1] <- c("AM WEST")
airport.data <- airport.data[-3,]
airport.data
## airline flight status Los Angeles Phoenix San Diego San Francisco
## 1 ALASKA on time 497 221 212 503
## 2 AM WEST delayed 62 12 20 102
## 4 AM WEST on time 694 4840 383 320
## 5 ALASKA delayed 117 415 65 129
## Seattle
## 1 1841
## 2 305
## 4 201
## 5 61
Then, I transform the data using the tidyr and dplyr packages.
airport.data <- gather(airport.data, "airport", "flights", 3:7)
airport.data %>% mutate(flights = as.integer(flights))
## airline flight status airport flights
## 1 ALASKA on time Los Angeles 497
## 2 AM WEST delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 ALASKA delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 AM WEST delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 ALASKA delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 AM WEST delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 ALASKA delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 AM WEST delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 ALASKA delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 AM WEST delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 ALASKA delayed Seattle 61
airport.data <- spread(airport.data, "flight status", "flights")
Then, I perform an analysis using the dplyr package.
analysis <- airport.data %>%
group_by(airline) %>%
summarise(
total.delayed = sum(delayed),
total.ontime = sum(`on time`),
total.flights = sum(delayed + `on time`),
percent.delayed = (total.delayed / total.flights)
)
analysis
## # A tibble: 2 x 5
## airline total.delayed total.ontime total.flights percent.delayed
## <fct> <int> <int> <int> <dbl>
## 1 ALASKA 787 3274 4061 0.194
## 2 AM WEST 501 6438 6939 0.0722
We see that ALASKA airlines had proportionally more delayed flights than AM WEST.