1. question

source: Numbersense, Kaiser Fung, McGraw HIll, 2013

source: Numbersense, Kaiser Fung, McGraw HIll, 2013

The chart above describes arrival delays for two airlines across five destinations. Your task is to:
(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
(2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
(3) Perform analysis to compare the arrival delays for the two airlines.
(4) Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:
  • The URL to the .Rmd file in your GitHub repository. and
  • The URL for your rpubs.com web page.

2. methodology

1. build original table in csv
2. load and melt

2-1. data loading

# read csv into R
raw_data <- read.csv("airline_delays.csv", sep = ",")#, check.names = FALSE)
raw_data
##         X     X.1 los.angeles phoenix san.diego san.francisco seattle
## 1  alaska on time         497     221       212           503    1841
## 2  alaska delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 am west on time         694    4840       383           320     201
## 5 am west delayed         117     415        65           129      61

2-2. observations

this dataset contains four variables:
  • airline, stored in the rows
  • status, stored in the rows
  • city, spread across the column names,
  • count, stored in the cell values
# modify column names for easier reference and drop row with blank values 
headers.raw_data <- raw_data %>% 
  rename(airline = 1, status = 2) %>%
  drop_na()
headers.raw_data
##   airline  status los.angeles phoenix san.diego san.francisco seattle
## 1  alaska on time         497     221       212           503    1841
## 2  alaska delayed          62      12        20           102     305
## 4 am west on time         694    4840       383           320     201
## 5 am west delayed         117     415        65           129      61

2-3. tidying

data cleaning requires that:
  • the city names are gathered into a column against counts
  • the status are spread against count
  • city values have spaces instead of periods
  • descriptive calculations are available for analysis and visualization
# melt city and count
molten.raw_data <- headers.raw_data %>% 
  pivot_longer(-c(airline, status), names_to = "city", values_to = "count")
head(molten.raw_data)
## # A tibble: 6 x 4
##   airline status  city          count
##   <fct>   <fct>   <chr>         <int>
## 1 alaska  on time los.angeles     497
## 2 alaska  on time phoenix         221
## 3 alaska  on time san.diego       212
## 4 alaska  on time san.francisco   503
## 5 alaska  on time seattle        1841
## 6 alaska  delayed los.angeles      62
# cast status and count
casted.raw_data <- molten.raw_data %>%
  pivot_wider(names_from = "status", values_from = "count")
head(casted.raw_data)
## # A tibble: 6 x 4
##   airline city          `on time` delayed
##   <fct>   <chr>             <int>   <int>
## 1 alaska  los.angeles         497      62
## 2 alaska  phoenix             221      12
## 3 alaska  san.diego           212      20
## 4 alaska  san.francisco       503     102
## 5 alaska  seattle            1841     305
## 6 am west los.angeles         694     117
# replace period with spaces for values in column "city"
casted.raw_data$city <- str_replace_all(casted.raw_data$city, "\\.", " ")
head(casted.raw_data)
## # A tibble: 6 x 4
##   airline city          `on time` delayed
##   <fct>   <chr>             <int>   <int>
## 1 alaska  los angeles         497      62
## 2 alaska  phoenix             221      12
## 3 alaska  san diego           212      20
## 4 alaska  san francisco       503     102
## 5 alaska  seattle            1841     305
## 6 am west los angeles         694     117
# new column for total flights
casted.raw_data$flights <- casted.raw_data$`on time` + casted.raw_data$delayed

# new column for percentage of flights that are delayed
casted.raw_data$delayed_rate <- casted.raw_data$delayed / casted.raw_data$flights
casted.raw_data
## # A tibble: 10 x 6
##    airline city          `on time` delayed flights delayed_rate
##    <fct>   <chr>             <int>   <int>   <int>        <dbl>
##  1 alaska  los angeles         497      62     559       0.111 
##  2 alaska  phoenix             221      12     233       0.0515
##  3 alaska  san diego           212      20     232       0.0862
##  4 alaska  san francisco       503     102     605       0.169 
##  5 alaska  seattle            1841     305    2146       0.142 
##  6 am west los angeles         694     117     811       0.144 
##  7 am west phoenix            4840     415    5255       0.0790
##  8 am west san diego           383      65     448       0.145 
##  9 am west san francisco       320     129     449       0.287 
## 10 am west seattle             201      61     262       0.233

3. analysis

# calculate average rate of delay for all "alaska" flights
delayed_rate_alaska <- sum(subset(casted.raw_data, airline == "alaska")$delayed_rate) / sum(casted.raw_data$airline == "alaska") 

# calculate average rate of delay for all "am west" flights
delayed_rate_am_west <- sum(subset(casted.raw_data, airline == "am west")$delayed_rate) / sum(casted.raw_data$airline == "alaska") 

# dataframe storing average rate of delay for each airline
delays <- data.frame(c("alaska", "am west"), c(delayed_rate_alaska, delayed_rate_am_west))
colnames(delays) <- c("airline", "delay")
delays
##   airline     delay
## 1  alaska 0.1118683
## 2 am west 0.1776915
# plot delayed rate per airline
ggplot(casted.raw_data, aes(x = city, y = delayed_rate, fill = city)) +
   geom_bar(stat = 'identity', position = position_dodge()) + # get bar graph of count
   facet_grid(~airline) + # group by airline
   theme(axis.text.x = element_text(angle = 90)) + # rotate x-axis labels
   labs(y="count", title="rate of flight delays by airline and city", subtitle="alaska vs. am west") # labels

# plot delays stacked by flights per airline
ggplot(casted.raw_data, aes(x = city, y = delayed, fill = flights)) +
   geom_bar(stat = 'identity', position = position_stack()) + # get bar graph of count
   facet_grid(~airline) + # group by airline
   theme(axis.text.x = element_text(angle = 90)) + # rotate x-axis labels
   labs(y="delays", title="count of flight delays by airline and city", subtitle="alaska vs. am west") # labels

4. conclusions