Problem

The chart above describes arrival delays for two airlines across five destinations. Your task is to: (1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below. (2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. (3) Perform analysis to compare the arrival delays for the two airlines

Data

,,Los Angeles,Phoenix,Sand Diego,San Francisco,Seattle Alaska,on time,497,221,212,503,1841 ,delayed,62,12,20,102,305 ,,,,,, AM West,on time,694,4840,383,320,201 ,delayed,117,415,65,129,61

Load library

Install and load necessary library

Data manupulation

#Load the csv file to flights dataframe

flights <- read.csv("Data/flights.csv",header=TRUE,sep=",",na.strings = c("","NA")) 


#Below are the steps followed
# 1. Remove all the NA's in the dataset
# 2. Add a new column with updated flight names. Some has NA in the data
# 3. Remove the unecesary columns
# 4. Change the data from rows to columns using Gather
# 5. Change the data from columns to rows using spread


flights.alter <- flights[rowSums(is.na(flights))<4,] %>% mutate(flight.name=na.locf(X)) %>% select(-X) %>% gather("location","timings",2:6,factor_key=TRUE) %>% spread("X.1","timings")


kable(flights.alter, align = 'l')
flight.name location delayed on time
Alaska Los.Angeles 62 497
Alaska Phoenix 12 221
Alaska Sand.Diego 20 212
Alaska San.Francisco 102 503
Alaska Seattle 305 1841
AM West Los.Angeles 117 694
AM West Phoenix 415 4840
AM West Sand.Diego 65 383
AM West San.Francisco 129 320
AM West Seattle 61 201

Compare the delay and ontime data

summary(flights.alter)
##   flight.name          location    delayed          on time      
##  Alaska :5    Los.Angeles  :2   Min.   : 12.00   Min.   : 201.0  
##  AM West:5    Phoenix      :2   1st Qu.: 61.25   1st Qu.: 245.8  
##               Sand.Diego   :2   Median : 83.50   Median : 440.0  
##               San.Francisco:2   Mean   :128.80   Mean   : 971.2  
##               Seattle      :2   3rd Qu.:126.00   3rd Qu.: 646.2  
##                                 Max.   :415.00   Max.   :4840.0
summarise(flights.alter, avg_delay = mean(delayed), avg_on_time = mean(`on time`))
##   avg_delay avg_on_time
## 1     128.8       971.2
# Total mean time by airlines

group_by(flights.alter, flight.name) %>% summarise(avg_delay = mean(delayed), avg_on_time = mean(`on time`)) 
## # A tibble: 2 × 3
##   flight.name avg_delay avg_on_time
##        <fctr>     <dbl>       <dbl>
## 1      Alaska     100.2       654.8
## 2     AM West     157.4      1287.6

Charts comparision

# comparision with location and delay via ggplot

ggplot(flights.alter, aes(location,delayed,`on time`)) + geom_point(aes(color=flight.name),size=4,alpha=1/2)  + geom_smooth(method = "lm") +
  facet_grid(.~flight.name)

# comparision with delay and on time via ggplot

ggplot(flights.alter, aes(delayed,`on time`)) + geom_point(aes(color=flight.name),size=4,alpha=1/2)  + geom_smooth(method = "lm") +
  facet_grid(.~flight.name)

# comparision with delay and on time via ggvis


flights.alter %>% ggvis(x= ~ location,y =~delayed) %>% layer_points(fill = ~factor(flight.name))