## From, Arrival, Los Angeles,Phoenix,San Diego,San Francisco,Seattle
## ALASKA,on time,497,221,212,503,1841
## ALASKA,delayed,62,12,20,102,305
## AM WEST,on time,694,4840,383,320,201
## AM WEST,delayed,117,415,65,129,61
# use tidyr and dplyr as needed to tidy and transform your data.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Load the airline arrival counts from the CSV shown at the top of the file.
arrivals <- read.csv(file = "../data/numbersense.csv")
# Preview the first rows of the raw, wide-format table.
head(x = arrivals)
## From Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
# Keep only the rows that count delayed arrivals, then split them by airline.
arrival_delayed <- arrivals %>%
  filter(Arrival == "delayed")
arrival_delayed_ALASKA <- arrival_delayed %>%
  filter(From == "ALASKA")
arrival_delayed_AM_WEST <- arrival_delayed %>%
  filter(From == "AM WEST")
# Summarise the delayed-arrival counts for a single destination city.
#
# delayed: data frame holding the "delayed" rows, one numeric column per city.
# city:    name of the city column to summarise, given as a string.
#
# Returns a one-row data frame. avg_delay / min_delay / max_delay / sd describe
# the counts of delayed flights found in the `city` column (they are counts,
# not delay durations); `total` is the number of rows summarised.
summarise_city_delays <- function(delayed, city) {
  delayed %>%
    summarise(
      # `.data[[city]]` looks the column up by its string name.
      avg_delay = mean(.data[[city]]),
      min_delay = min(.data[[city]]),
      max_delay = max(.data[[city]]),
      sd = sd(.data[[city]]),
      total = n()
    )
}

# One summary per destination; the variable names are kept so later code that
# prints them keeps working.
arrival_delayed_LA <- summarise_city_delays(arrival_delayed, "Los.Angeles")
arrival_delayed_SD <- summarise_city_delays(arrival_delayed, "San.Diego")
arrival_delayed_SF <- summarise_city_delays(arrival_delayed, "San.Francisco")
arrival_delayed_SEA <- summarise_city_delays(arrival_delayed, "Seattle")
arrival_delayed_PHX <- summarise_city_delays(arrival_delayed, "Phoenix")
# Print every per-city summary exactly once (LA, SF, SD, PHX, SEA).
# The San Diego summary replaces a duplicated Los Angeles print.
arrival_delayed_LA; arrival_delayed_SF; arrival_delayed_SD; arrival_delayed_PHX; arrival_delayed_SEA
## avg_delay min_delay max_delay sd total
## 1 89.5 62 117 38.89087 2
## avg_delay min_delay max_delay sd total
## 1 115.5 102 129 19.09188 2
## avg_delay min_delay max_delay sd total
## 1 42.5 20 65 31.81981 2
## avg_delay min_delay max_delay sd total
## 1 213.5 12 415 284.964 2
## avg_delay min_delay max_delay sd total
## 1 183 61 305 172.5341 2
# Reshape each airline's delayed row from wide to long: the five city columns
# are stacked into a key column (the city name) and a value column,
# `all_cities_delayed` (the delayed-flight count for that city).
# NOTE(review): the key column is named after a data frame and differs between
# the two airlines (`arrival_delayed_ALASKA` vs `arrival_delayed_AM_WEST_all`);
# both names are kept as-is so the summaries printed later are unchanged.
arrival_delayed_ALASKA_all <- gather(
  arrival_delayed_ALASKA,
  key = arrival_delayed_ALASKA,
  value = all_cities_delayed,
  Los.Angeles, San.Diego, San.Francisco, Phoenix, Seattle
)
arrival_delayed_AM_WEST_all <- gather(
  arrival_delayed_AM_WEST,
  key = arrival_delayed_AM_WEST_all,
  value = all_cities_delayed,
  Los.Angeles, San.Diego, San.Francisco, Phoenix, Seattle
)
# Summarise one airline's long-format delayed counts across all cities.
#
# delayed_long: data frame with a numeric `all_cities_delayed` column holding
#               one delayed-flight count per row.
#
# Returns a one-row data frame with the mean, minimum, maximum, standard
# deviation and sum of `all_cities_delayed`, plus `total`, the number of rows
# summarised.
summarise_airline_delays <- function(delayed_long) {
  delayed_long %>%
    summarise(
      avg_delay = mean(all_cities_delayed),
      min_delay = min(all_cities_delayed),
      max_delay = max(all_cities_delayed),
      sd = sd(all_cities_delayed),
      sum = sum(all_cities_delayed),
      total = n()
    )
}

# Same statistics for both airlines, so the two results are directly comparable.
arrival_delayed_AM_WEST_all_stats <-
  summarise_airline_delays(arrival_delayed_AM_WEST_all)
arrival_delayed_ALASKA_all_stats <-
  summarise_airline_delays(arrival_delayed_ALASKA_all)
# Column-wise summaries of each airline's long-format delayed counts.
summary(arrival_delayed_ALASKA_all)
summary(arrival_delayed_AM_WEST_all)
## From Arrival arrival_delayed_ALASKA all_cities_delayed
## ALASKA :5 delayed:5 Length:5 Min. : 12.0
## AM WEST:0 on time:0 Class :character 1st Qu.: 20.0
## Mode :character Median : 62.0
## Mean :100.2
## 3rd Qu.:102.0
## Max. :305.0
## From Arrival arrival_delayed_AM_WEST_all all_cities_delayed
## ALASKA :0 delayed:5 Length:5 Min. : 61.0
## AM WEST:5 on time:0 Class :character 1st Qu.: 65.0
## Mode :character Median :117.0
## Mean :157.4
## 3rd Qu.:129.0
## Max. :415.0
# Show the across-city statistics: AM WEST first, then ALASKA.
print(arrival_delayed_AM_WEST_all_stats)
print(arrival_delayed_ALASKA_all_stats)
## avg_delay min_delay max_delay sd sum total
## 1 157.4 61 415 147.1625 787 5
## avg_delay min_delay max_delay sd sum total
## 1 100.2 12 305 120.0175 501 5
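After comparing both summaries and the basic statistics for the delayed sample set, it can be inferred that delays were more substantial for AM WEST than for ALASKA: its per-city counts of delayed flights have a higher minimum, mean, maximum, and standard deviation (787 delayed flights in total versus 501). Note that these figures are raw counts of delayed flights and do not account for how many flights each airline operated to each city.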