Show the assignment: HW6Assignment

## Install and load tidyr and dplyr packages
install.packages("tidyr", dependencies = TRUE, repos = "http://lib.stat.cmu.edu/R/CRAN/")
## package 'tidyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Karen\AppData\Local\Temp\RtmpWgOmwx\downloaded_packages
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.2
install.packages("dplyr", dependencies = TRUE, repos = "http://lib.stat.cmu.edu/R/CRAN/")
## package 'dplyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Karen\AppData\Local\Temp\RtmpWgOmwx\downloaded_packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Read .csv file into variable called airline_data
airline_data <- read.csv(file = "https://raw.githubusercontent.com/karenweigandt/IS607/master/IS607Week6data.csv", header = TRUE, sep = ",")

## Convert data to tbl class
airline_data <- tbl_df(airline_data)
## Start to make the data tidy - make wide data long
tidy_airline_data <- gather(airline_data, "city", "Number of flights", 3:7, na.rm = TRUE)

## Check the column names
colnames(tidy_airline_data)
## [1] "X"                 "X.1"               "city"             
## [4] "Number of flights"
## Change the names of the unnamed columns to something that makes sense
names(tidy_airline_data)[names(tidy_airline_data) %in% c("X", "X.1")] <- c("airline", "status")

## Look at the data
tidy_airline_data  ## Note blank airline cells for delayed rows
## Source: local data frame [20 x 4]
## 
##    airline  status          city Number of flights
##     (fctr)  (fctr)        (fctr)             (int)
## 1   ALASKA on time   Los.Angeles               497
## 2          delayed   Los.Angeles                62
## 3   AMWEST on time   Los.Angeles               694
## 4          delayed   Los.Angeles               117
## 5   ALASKA on time       Phoenix               221
## 6          delayed       Phoenix                12
## 7   AMWEST on time       Phoenix              4840
## 8          delayed       Phoenix               415
## 9   ALASKA on time     San.Diego               212
## 10         delayed     San.Diego                20
## 11  AMWEST on time     San.Diego               383
## 12         delayed     San.Diego                65
## 13  ALASKA on time San.Francisco               503
## 14         delayed San.Francisco               102
## 15  AMWEST on time San.Francisco               320
## 16         delayed San.Francisco               129
## 17  ALASKA on time       Seattle              1841
## 18         delayed       Seattle               305
## 19  AMWEST on time       Seattle               201
## 20         delayed       Seattle                61
## Fill in blank airline names to maintain data integrity
tidy_airline_data$airline <- c("ALASKA", "ALASKA", "AMWEST", "AMWEST")
## Look at the data again to see blanks are filled in
tidy_airline_data
## Source: local data frame [20 x 4]
## 
##    airline  status          city Number of flights
##      (chr)  (fctr)        (fctr)             (int)
## 1   ALASKA on time   Los.Angeles               497
## 2   ALASKA delayed   Los.Angeles                62
## 3   AMWEST on time   Los.Angeles               694
## 4   AMWEST delayed   Los.Angeles               117
## 5   ALASKA on time       Phoenix               221
## 6   ALASKA delayed       Phoenix                12
## 7   AMWEST on time       Phoenix              4840
## 8   AMWEST delayed       Phoenix               415
## 9   ALASKA on time     San.Diego               212
## 10  ALASKA delayed     San.Diego                20
## 11  AMWEST on time     San.Diego               383
## 12  AMWEST delayed     San.Diego                65
## 13  ALASKA on time San.Francisco               503
## 14  ALASKA delayed San.Francisco               102
## 15  AMWEST on time San.Francisco               320
## 16  AMWEST delayed San.Francisco               129
## 17  ALASKA on time       Seattle              1841
## 18  ALASKA delayed       Seattle               305
## 19  AMWEST on time       Seattle               201
## 20  AMWEST delayed       Seattle                61
## Group by airline
## Calculate total number of flights per airline
tidy_al_data_by_al <-     ## Saved to new table to preserve integrity of previous table
tidy_airline_data %>%
  group_by(airline) %>%
  mutate(total_flights_by_airline = sum(`Number of flights`))

##  Look at the new column
tidy_al_data_by_al$total_flights_by_airline
##  [1] 3775 3775 7225 7225 3775 3775 7225 7225 3775 3775 7225 7225 3775 3775
## [15] 7225 7225 3775 3775 7225 7225
## Group by airline and then status
## Calculate percentage of flights for each status for each city
tidy_al_data_by_al_stat <-    ## Saved to new table to preserve integrity of previous table
tidy_al_data_by_al %>%
  group_by(airline, status) %>%
  mutate(pct_of_flights_by_al_n_stat = `Number of flights` *100 / total_flights_by_airline)

##  Look at the new table
tidy_al_data_by_al_stat
## Source: local data frame [20 x 6]
## Groups: airline, status [4]
## 
##    airline  status          city Number of flights
##      (chr)  (fctr)        (fctr)             (int)
## 1   ALASKA on time   Los.Angeles               497
## 2   ALASKA delayed   Los.Angeles                62
## 3   AMWEST on time   Los.Angeles               694
## 4   AMWEST delayed   Los.Angeles               117
## 5   ALASKA on time       Phoenix               221
## 6   ALASKA delayed       Phoenix                12
## 7   AMWEST on time       Phoenix              4840
## 8   AMWEST delayed       Phoenix               415
## 9   ALASKA on time     San.Diego               212
## 10  ALASKA delayed     San.Diego                20
## 11  AMWEST on time     San.Diego               383
## 12  AMWEST delayed     San.Diego                65
## 13  ALASKA on time San.Francisco               503
## 14  ALASKA delayed San.Francisco               102
## 15  AMWEST on time San.Francisco               320
## 16  AMWEST delayed San.Francisco               129
## 17  ALASKA on time       Seattle              1841
## 18  ALASKA delayed       Seattle               305
## 19  AMWEST on time       Seattle               201
## 20  AMWEST delayed       Seattle                61
## Variables not shown: total_flights_by_airline (int),
##   pct_of_flights_by_al_n_stat (dbl)
tidy_al_data_by_al_stat$pct_of_flights_by_al_n_stat
##  [1] 13.1655629  1.6423841  9.6055363  1.6193772  5.8543046  0.3178808
##  [7] 66.9896194  5.7439446  5.6158940  0.5298013  5.3010381  0.8996540
## [13] 13.3245033  2.7019868  4.4290657  1.7854671 48.7682119  8.0794702
## [19]  2.7820069  0.8442907
## Filter latest data set to include only delayed flights
## Group output by airline
## Calculate sum of percentages
tidy_al_data_by_al_stat %>%
filter(status == "delayed") %>% 
  group_by(airline) %>% 
  summarise('%_flights_delayed_by_airline' = sum(pct_of_flights_by_al_n_stat))
## Source: local data frame [2 x 2]
## 
##   airline %_flights_delayed_by_airline
##     (chr)                        (dbl)
## 1  ALASKA                     13.27152
## 2  AMWEST                     10.89273

Conclusion:

13.27% of ALASKA airlines flights experience delayed arrival vs.

10.89% of AMWEST flights