#install.packages("RMySQL")
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 3.4.4
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.4.4

(1)Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

First we input the data into a MySQL database. My first attempt at this created a table that was too tidy. I reorganized the table, using the instructional video: https://www.rstudio.com/resources/webinars/data-wrangling-with-r-and-rstudio/ as a reference to create something more untidy to fit with the spirit of the exercise.

##movie_db = dbConnect(MySQL(), user='root', password='root', dbname='moviedb', host='localhost')
db <- dbConnect(MySQL(), user='root', password='root', dbname='flights', host='localhost')
dbSendQuery(db, "drop table if exists ontime, cities;")
## <MySQLResult:152768072,0,0>
#I'm am doing this as two tables.
dbSendQuery(db, "CREATE TABLE cities(
            city_id INT,
            city varchar(50),
            PRIMARY KEY (city_id)
            );")
## <MySQLResult:134126456,0,1>
dbSendQuery(db, "CREATE TABLE ontime(
            city_id INT,
            air_line varchar(50),
            arrived varchar(50),
            n INT,
            FOREIGN KEY (city_id) REFERENCES cities(city_id)
            );")
## <MySQLResult:73383376,0,2>
#populate the cities table
dbSendQuery(db, "INSERT INTO cities(
            city_id,city)
            VALUES (1, 'Los Angeles');")
## <MySQLResult:2,0,3>
dbSendQuery(db, "INSERT INTO cities(
            city_id,city)
            VALUES (2, 'Phoenix');")
## <MySQLResult:2,0,4>
dbSendQuery(db, "INSERT INTO cities(
            city_id,city)
            VALUES (3, 'San Diego');")
## <MySQLResult:1,0,5>
dbSendQuery(db, "INSERT INTO cities(
            city_id,city)
            VALUES (4, 'San Francisco');")
## <MySQLResult:4134424,0,6>
dbSendQuery(db, "INSERT INTO cities(
            city_id,city)
            VALUES (5, 'Seatle');")
## <MySQLResult:1,0,7>
#populate the ontime table for the Alaska Flights
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(1, 'Alaska', 'ontime', 497);")
## <MySQLResult:2,0,8>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(1, 'Alaska', 'delayed', 62);")
## <MySQLResult:1,0,9>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(2, 'Alaska', 'ontime', 221);")
## <MySQLResult:2,0,10>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(2, 'Alaska', 'delayed', 12);")
## <MySQLResult:4134424,0,11>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(3, 'Alaska', 'ontime', 212);")
## <MySQLResult:0,0,12>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(3, 'Alaska', 'delayed', 20);")
## <MySQLResult:4134424,0,13>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(4, 'Alaska', 'ontime', 503);")
## <MySQLResult:197933648,0,14>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(4, 'Alaska', 'delayed', 102);")
## <MySQLResult:119295584,0,15>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(5, 'Alaska', 'ontime', 1803);")
## <MySQLResult:73348664,0,16>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(5, 'Alaska', 'delayed', 305);")
## <MySQLResult:2,0,17>
#Populate the ontime table for AM West Flights
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(1, 'AM West', 'ontime', 694);")
## <MySQLResult:0,0,18>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(1, 'AM West', 'delayed', 117);")
## <MySQLResult:2,0,19>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(2, 'AM West', 'ontime', 4840);")
## <MySQLResult:148838368,0,20>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(2, 'AM West', 'delayed', 415);")
## <MySQLResult:4132496,0,21>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(3, 'AM West', 'ontime', 383);")
## <MySQLResult:194938800,0,22>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(3, 'AM West', 'delayed', 65);")
## <MySQLResult:73283168,0,23>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(4, 'AM West', 'ontime', 320);")
## <MySQLResult:158212000,0,24>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(4, 'AM West', 'delayed', 129);")
## <MySQLResult:4131776,0,25>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(5, 'AM West', 'ontime', 201);")
## <MySQLResult:1,0,26>
dbSendQuery(db, "INSERT INTO ontime
(city_id, air_line, arrived, n) 
VALUES(5, 'AM West', 'delayed', 61);")
## <MySQLResult:97327616,0,27>

(2)Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

First we will load tidyr and dplyr:

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Next I’ll read the information from the MySQL database into r.

arrival<- db %>% dbSendQuery("SELECT DISTINCT a.air_line, c.city, a.arrived, a.n FROM ontime a, cities c WHERE a.city_id = c.city_id GROUP BY a.air_line, c.city, a.arrived;") %>% fetch() %>% tbl_df()
arrival
## # A tibble: 20 x 4
##    air_line city          arrived     n
##    <chr>    <chr>         <chr>   <int>
##  1 Alaska   Los Angeles   delayed    62
##  2 Alaska   Los Angeles   ontime    497
##  3 Alaska   Phoenix       delayed    12
##  4 Alaska   Phoenix       ontime    221
##  5 Alaska   San Diego     delayed    20
##  6 Alaska   San Diego     ontime    212
##  7 Alaska   San Francisco delayed   102
##  8 Alaska   San Francisco ontime    503
##  9 Alaska   Seatle        delayed   305
## 10 Alaska   Seatle        ontime   1803
## 11 AM West  Los Angeles   delayed   117
## 12 AM West  Los Angeles   ontime    694
## 13 AM West  Phoenix       delayed   415
## 14 AM West  Phoenix       ontime   4840
## 15 AM West  San Diego     delayed    65
## 16 AM West  San Diego     ontime    383
## 17 AM West  San Francisco delayed   129
## 18 AM West  San Francisco ontime    320
## 19 AM West  Seatle        delayed    61
## 20 AM West  Seatle        ontime    201

It looks like the query I made put the data in a vertical table. I will use tidyr’s spread to organize it further:

arrival <- arrival %>% spread(arrived, n)
arrival
## # A tibble: 10 x 4
##    air_line city          delayed ontime
##    <chr>    <chr>           <int>  <int>
##  1 Alaska   Los Angeles        62    497
##  2 Alaska   Phoenix            12    221
##  3 Alaska   San Diego          20    212
##  4 Alaska   San Francisco     102    503
##  5 Alaska   Seatle            305   1803
##  6 AM West  Los Angeles       117    694
##  7 AM West  Phoenix           415   4840
##  8 AM West  San Diego          65    383
##  9 AM West  San Francisco     129    320
## 10 AM West  Seatle             61    201

I also want to use dplyr’s mutate to add a row of the percentage of on time flights, since the numbers range from 4840 to 12, it’s difficult to look at the data and tell which city or airline has the best performance.

arrival <- arrival %>% mutate(total_flights = ontime+delayed)
## Warning: package 'bindrcpp' was built under R version 3.4.4
arrival <- arrival %>% mutate(percent_ontime = ontime / total_flights*100)
arrival
## # A tibble: 10 x 6
##    air_line city          delayed ontime total_flights percent_ontime
##    <chr>    <chr>           <int>  <int>         <int>          <dbl>
##  1 Alaska   Los Angeles        62    497           559           88.9
##  2 Alaska   Phoenix            12    221           233           94.8
##  3 Alaska   San Diego          20    212           232           91.4
##  4 Alaska   San Francisco     102    503           605           83.1
##  5 Alaska   Seatle            305   1803          2108           85.5
##  6 AM West  Los Angeles       117    694           811           85.6
##  7 AM West  Phoenix           415   4840          5255           92.1
##  8 AM West  San Diego          65    383           448           85.5
##  9 AM West  San Francisco     129    320           449           71.3
## 10 AM West  Seatle             61    201           262           76.7

From this table it looks as if AM West handles more flights in this region than Alaska Airlines. AM West handles more flights out of Phoenix than Alaska handles in total, but more on that in the next section.

(3)Perform analysis to compare the arrival delays for the two airlines.

First we will do some simple summary statistics.

arrival %>% summarise(median_percent = median(percent_ontime), sd_percent = sd(percent_ontime))
## # A tibble: 1 x 2
##   median_percent sd_percent
##            <dbl>      <dbl>
## 1           85.6       7.17

The median ontime percentage is 85.6% with a standard deviation of 7.2% this will come in useful when we evaluate top performing cities and and airlines below.

arrival %>% arrange(desc(percent_ontime))
## # A tibble: 10 x 6
##    air_line city          delayed ontime total_flights percent_ontime
##    <chr>    <chr>           <int>  <int>         <int>          <dbl>
##  1 Alaska   Phoenix            12    221           233           94.8
##  2 AM West  Phoenix           415   4840          5255           92.1
##  3 Alaska   San Diego          20    212           232           91.4
##  4 Alaska   Los Angeles        62    497           559           88.9
##  5 AM West  Los Angeles       117    694           811           85.6
##  6 Alaska   Seatle            305   1803          2108           85.5
##  7 AM West  San Diego          65    383           448           85.5
##  8 Alaska   San Francisco     102    503           605           83.1
##  9 AM West  Seatle             61    201           262           76.7
## 10 AM West  San Francisco     129    320           449           71.3
arrival %>% filter(percent_ontime > median(percent_ontime)) %>% select(air_line,percent_ontime) %>% arrange()
## # A tibble: 5 x 2
##   air_line percent_ontime
##   <chr>             <dbl>
## 1 Alaska             88.9
## 2 Alaska             94.8
## 3 Alaska             91.4
## 4 AM West            85.6
## 5 AM West            92.1

Alaska Air holds 3 of the 5 top spots in on time percentage with 2 above 90%, whereas AM West only has 1 above 90%.

arrival %>% filter(percent_ontime > median(percent_ontime)) %>% select(city,percent_ontime) %>% arrange()
## # A tibble: 5 x 2
##   city        percent_ontime
##   <chr>                <dbl>
## 1 Los Angeles           88.9
## 2 Phoenix               94.8
## 3 San Diego             91.4
## 4 Los Angeles           85.6
## 5 Phoenix               92.1

Interestingly Phoenix is a top performing city for both airlines, however AM West handles 4840 flights out of Phoenix, whereas Alaska only handles 233.

arrival %>% filter(percent_ontime < median(percent_ontime)) %>% select(air_line,percent_ontime) %>% arrange()
## # A tibble: 5 x 2
##   air_line percent_ontime
##   <chr>             <dbl>
## 1 Alaska             83.1
## 2 Alaska             85.5
## 3 AM West            85.5
## 4 AM West            71.3
## 5 AM West            76.7

Conversely, AM West holds 3 of 5 bottom-half performing cities, with two below 80%. Alaska’s lowest performing city is 83.1%

arrival %>% filter(percent_ontime < median(percent_ontime)) %>% select(city,percent_ontime) %>% arrange()
## # A tibble: 5 x 2
##   city          percent_ontime
##   <chr>                  <dbl>
## 1 San Francisco           83.1
## 2 Seatle                  85.5
## 3 San Diego               85.5
## 4 San Francisco           71.3
## 5 Seatle                  76.7

Seattle is a bottom-half performing city for both airlines. However, that is a small market for AM West at 262 flights total, but that is Alaska’s top market with 2108 flights total.

par(las=2)
barplot(arrival$percent_ontime,names.arg =arrival$city,horiz = TRUE,cex.names = 0.5, col = c("darkgreen", "darkgreen","darkgreen","darkgreen","darkgreen", "Orange","Orange","Orange","Orange","Orange" ), legend = c("Orange = AM West", "Green = Alaska"),args.legend = list(x ='topright', bty='n', inset=c(-0.075,0), cex=0.8))

city_by_airline <- arrival %>%  select(city,air_line,percent_ontime) %>%spread(air_line,percent_ontime)
boxplot(city_by_airline$Alaska, city_by_airline$`AM West`, names = c("Alaska", "AM West") )

Again, from these plots it appears as if Alaska Airlines has more cities with a higher percent on time than AM West. However, we should not stop here as there are other factors to consider.

par(las=2)
barplot(arrival$total_flights,names.arg =arrival$city,horiz = TRUE,cex.names = 0.5, col = c("darkgreen", "darkgreen","darkgreen","darkgreen","darkgreen", "Orange","Orange","Orange","Orange","Orange" ), legend = c("Orange = AM West", "Green = Alaska"),args.legend = list(x ='topright', bty='n', inset=c(-0.075,0), cex=0.8))

As stated above, AM West handles more flights out of Phoenix (a top-half market) than Alaska does all five cities combined. Furthermore, Alaska’s top market is in the bottom-half of performance. Below we will take the ratio of AM West flights to Alaska flights.

alaska <- arrival %>% filter(air_line == "Alaska")
al_sum <-sum(alaska$total_flights)
am_west <- arrival %>% filter(air_line == "AM West")
am_sum <- sum(am_west$total_flights)
am_sum/al_sum
## [1] 1.933369

AM West handles nearly twice as much traffic. With this in mind we need to devise a better indicator of on time percentage. I will calculate the total ontime flights divided by the total flights for each airline to get a total on time percentage.

alaska <- arrival %>% filter(air_line == "Alaska")
al_tot_on <-sum(alaska$ontime)
al_tot_on/al_sum
## [1] 0.8659352
am_west <- arrival %>% filter(air_line == "AM West")
am_tot_on <- sum(am_west$ontime)
am_tot_on/am_sum
## [1] 0.8910727

So it appears that AM West slightly out-performs Alaska overall with 89.1% on time flights versus 86.6%. Weighting the on time percentages by the size of each city’s flights over the total for each airline may be a better indicator.

al_weighted <- arrival %>% filter(air_line == "Alaska")%>%mutate(weighted_ave = percent_ontime*total_flights/al_sum)
al_weighted %>% select(city,weighted_ave)
## # A tibble: 5 x 2
##   city          weighted_ave
##   <chr>                <dbl>
## 1 Los Angeles          13.3 
## 2 Phoenix               5.91
## 3 San Diego             5.67
## 4 San Francisco        13.5 
## 5 Seatle               48.2
am_weighted <- arrival %>% filter(air_line == "AM West")%>%mutate(weighted_ave = percent_ontime*total_flights/am_sum)
am_weighted %>% select(city,weighted_ave)
## # A tibble: 5 x 2
##   city          weighted_ave
##   <chr>                <dbl>
## 1 Los Angeles           9.61
## 2 Phoenix              67.0 
## 3 San Diego             5.30
## 4 San Francisco         4.43
## 5 Seatle                2.78

Finally we see that for Alaska, Seattle which is it’s largest market, but a bottom-half performing city makes up 48% of it’s on time flights, whereas for AM West Phoenix which is by large the largest market in the set and a top-half performing city makes up 67% of AM West’s on time flights. This further supports the claim the AM West does better overall even though Alaska has more top-half cities in it’s list.