Creating the table

The first task is to create a *.CSV file that includes specific untidy information on two airlines’ arrival delays across five destinations. Let’s get started on building the table.

The most simple way to build the table (below) is to create a series of lists and combine them into a data frame. When creating a data frame, R combines the lists by columns (not rows) so some planning might be necessary when creating a table this way. Otherwise, you can create a data frame and then transform (transpose) the data to switch the columns to rows.

a <- c("","ALASKA","","","AM WEST","")
b <- c("","on time","delayed","","on time","delayed")
c <- c("Los Angeles",497,62,"",694,117)
d <- c("Phoenix",221,12,"",4840,415)
e <- c("San Diego",212,20,"",383,65)
f <- c("San Francisco",503,102,"",320,129)
g <- c("Seattle",1841,305,"",201,61)
flightinfo <- data.frame(a,b,c,d,e,f,g)
flightinfo
##         a       b           c       d         e             f       g
## 1                 Los Angeles Phoenix San Diego San Francisco Seattle
## 2  ALASKA on time         497     221       212           503    1841
## 3         delayed          62      12        20           102     305
## 4                                                                    
## 5 AM WEST on time         694    4840       383           320     201
## 6         delayed         117     415        65           129      61

Table created, now we’ll write the table to a .CSV file which will be saved to (just assuming here) your My Documents folder.

#write.csv(flightinfo, file = "flightinfo.csv", header = TRUE)

Reading the data, tidying and transforming it

Now that we have our table, we’ll tidy it up using tidyr and dplyr. First we can load the two packages:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

Since the data frame is already loaded, we can start to tidy it. First, I’ll transpose it since it will make it easier to work the table in columns rather than rows. Then I’ll create two tables: one for ALASKA and one for AM WEST. In addition, I’d rather not use that extra line for the carrier name, so I’m going to add an extra column at the end to specify which airline it is, remove that top line, then change the new topline to headers.

flightinfo <- t(flightinfo)
flightinfo
##   [,1]            [,2]      [,3]      [,4] [,5]      [,6]     
## a ""              "ALASKA"  ""        ""   "AM WEST" ""       
## b ""              "on time" "delayed" ""   "on time" "delayed"
## c "Los Angeles"   "497"     "62"      ""   "694"     "117"    
## d "Phoenix"       "221"     "12"      ""   "4840"    "415"    
## e "San Diego"     "212"     "20"      ""   "383"     "65"     
## f "San Francisco" "503"     "102"     ""   "320"     "129"    
## g "Seattle"       "1841"    "305"     ""   "201"     "61"
ALASKA <- flightinfo[,c(1:3)]
ALASKA <- data.frame(city = ALASKA[,1], carrier = ALASKA[1,2], on_time = ALASKA[,2], delayed = ALASKA[,3])
ALASKA <- ALASKA[c(3:length(ALASKA[,1])),]
kable(ALASKA)
city carrier on_time delayed
c Los Angeles ALASKA 497 62
d Phoenix ALASKA 221 12
e San Diego ALASKA 212 20
f San Francisco ALASKA 503 102
g Seattle ALASKA 1841 305
AMWEST <- flightinfo[,c(1,5:6)]
AMWEST <- data.frame(city = AMWEST[,1], carrier = AMWEST[1,2], on_time = AMWEST[,2], delayed = AMWEST[,3])
AMWEST <- AMWEST[c(3:length(AMWEST[,1])),]
kable(AMWEST)
city carrier on_time delayed
c Los Angeles AM WEST 694 117
d Phoenix AM WEST 4840 415
e San Diego AM WEST 383 65
f San Francisco AM WEST 320 129
g Seattle AM WEST 201 61

The data was seperated, now we want to re-join them so we can easier analyze them, and sort them so we can easily see them. Also, just to make sure we have the right type for all items, we’ll just reset everything to their appropriate types.

#bind/join the tables
flightinfo <- as.data.frame(bind_rows(AMWEST, ALASKA))

#re-establish the types
flightinfo$on_time <- as.integer(flightinfo$on_time)
flightinfo$delayed <- as.integer(flightinfo$delayed)
flightinfo$city <- as.character(flightinfo$city)
flightinfo$carrier <- as.character(flightinfo$carrier)
#sort and group the table
flightinfo <- flightinfo %>% 
  arrange(city, carrier)
kable(flightinfo)
city carrier on_time delayed
Los Angeles ALASKA 497 62
Los Angeles AM WEST 694 117
Phoenix ALASKA 221 12
Phoenix AM WEST 4840 415
San Diego ALASKA 212 20
San Diego AM WEST 383 65
San Francisco ALASKA 503 102
San Francisco AM WEST 320 129
Seattle ALASKA 1841 305
Seattle AM WEST 201 61

Comparing the arrival delays

Phew, all set to do some work now. We’ve got our table set up and organized so we can do some comparisons. The first thing we’ll do is find the total number of flights each carrier made to each city; this total will be added to a new column titled total. The second thing we’ll do is calculate the percentage of flights from each carrier and city that were delayed, we’ll add this to a columned titled percent_delayed. This second new column will utilize the first new column, so we have to set up the first one first. Also, the percent_delayed column has too many decimals so rounded is more visually appealing.

#add columns for percent of flights delayed and total flights
flightinfo$total <- flightinfo$on_time + flightinfo$delayed
flightinfo$percent_delayed <- round(flightinfo$delayed / flightinfo$total, 2)
kable(flightinfo)
city carrier on_time delayed total percent_delayed
Los Angeles ALASKA 497 62 559 0.11
Los Angeles AM WEST 694 117 811 0.14
Phoenix ALASKA 221 12 233 0.05
Phoenix AM WEST 4840 415 5255 0.08
San Diego ALASKA 212 20 232 0.09
San Diego AM WEST 383 65 448 0.15
San Francisco ALASKA 503 102 605 0.17
San Francisco AM WEST 320 129 449 0.29
Seattle ALASKA 1841 305 2146 0.14
Seattle AM WEST 201 61 262 0.23

Analyzing the data

I’ll pose a few questions for myself to answer:

  • What city did ALASKA have the least delays in?
  • What city did AM WEST have the most delays in?
  • Which carrier had the highest percentage of delays?
  • Which carrier was busier (had the most total flights)?
  • Which city was the busiest (had the most total flights)?
#city with least delays for ALASKA
kable(flightinfo %>% 
  filter(carrier == "ALASKA") %>% 
    arrange(delayed)  %>% 
      select(one_of(c("city", "delayed")))
)
city delayed
Phoenix 12
San Diego 20
Los Angeles 62
San Francisco 102
Seattle 305
flightinfo %>% 
  filter(carrier == "ALASKA") %>% 
    arrange(delayed)  %>% 
      select(one_of(c("city", "delayed"))) %>% 
        ggplot(., aes(x=city, y=delayed, fill=city)) + geom_bar(position="dodge", stat="identity", colour="black") + scale_fill_brewer(palette="Pastel1") + ggtitle("ALASKA Airlines' Delays")

#city with most delays for AM WEST
kable(flightinfo %>% 
  filter(carrier == "AM WEST") %>% 
    arrange(desc(delayed))  %>% 
      select(one_of(c("city", "delayed")))
)
city delayed
Phoenix 415
San Francisco 129
Los Angeles 117
San Diego 65
Seattle 61
flightinfo %>% 
  filter(carrier == "AM WEST") %>% 
    arrange(desc(delayed))  %>% 
      select(one_of(c("city", "delayed"))) %>% 
        ggplot(., aes(x=city, y=delayed, fill=city)) + geom_bar(position="dodge", stat="identity", colour="black") + scale_fill_brewer(palette="Pastel1") + ggtitle("AM WEST Airlines' Delays")

#carrier with highest percentage of delays
kable(flightinfo %>% 
  arrange(desc(percent_delayed)) %>% 
    group_by(carrier) %>% 
      summarise(round(sum(delayed) / sum(total), 2))
)
carrier round(sum(delayed)/sum(total), 2)
ALASKA 0.13
AM WEST 0.11
flightinfo %>% 
  arrange(desc(percent_delayed)) %>% 
    group_by(carrier) %>% 
      summarise(per_delay = round(sum(delayed) / sum(total), 2)) %>% 
        ggplot(., aes(x=carrier, y=per_delay, fill=carrier)) + geom_bar(position="dodge", stat="identity", colour="black") + scale_fill_brewer(palette="Pastel1") + ggtitle("Percent of Delays by Carrier")

#busiest carrier
kable(flightinfo %>% 
  group_by(carrier) %>% 
    summarise(sum(total))
)
carrier sum(total)
ALASKA 3775
AM WEST 7225
flightinfo %>% 
  group_by(carrier) %>% 
    summarise(sum1 = sum(total)) %>% 
      ggplot(., aes(x=carrier, y=sum1, fill=carrier)) + geom_bar(position="dodge", stat="identity", colour="black") + scale_fill_brewer(palette="Pastel1") + ggtitle("Number of Flights by Carrier")

#busiest city
kable(flightinfo %>% 
  group_by(city) %>% 
    summarise(sum(total))
)
city sum(total)
Los Angeles 1370
Phoenix 5488
San Diego 680
San Francisco 1054
Seattle 2408
flightinfo %>% 
  group_by(city) %>% 
    summarise(sum1 = sum(total)) %>% 
      ggplot(., aes(x=city, y=sum1, fill=city)) + geom_bar(position="dodge", stat="identity", colour="black") + scale_fill_brewer(palette="Pastel1") + ggtitle("Number of Flights by City")