Show the assignment:
## Install and load tidyr and dplyr packages
install.packages("tidyr", dependencies = TRUE, repos = "http://lib.stat.cmu.edu/R/CRAN/")
## package 'tidyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Karen\AppData\Local\Temp\RtmpWgOmwx\downloaded_packages
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.2
install.packages("dplyr", dependencies = TRUE, repos = "http://lib.stat.cmu.edu/R/CRAN/")
## package 'dplyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Karen\AppData\Local\Temp\RtmpWgOmwx\downloaded_packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Read .csv file into variable called airline_data
airline_data <- read.csv(file = "https://raw.githubusercontent.com/karenweigandt/IS607/master/IS607Week6data.csv", header = TRUE, sep = ",")
## Convert data to tbl class
airline_data <- tbl_df(airline_data)
## Start to make the data tidy - make wide data long
tidy_airline_data <- gather(airline_data, "city", "Number of flights", 3:7, na.rm = TRUE)
## Check the column names
colnames(tidy_airline_data)
## [1] "X" "X.1" "city"
## [4] "Number of flights"
## Change the names of the unnamed columns to something that makes sense
names(tidy_airline_data)[names(tidy_airline_data) %in% c("X", "X.1")] <- c("airline", "status")
## Look at the data
tidy_airline_data ## Note blank airline cells for delayed rows
## Source: local data frame [20 x 4]
##
## airline status city Number of flights
## (fctr) (fctr) (fctr) (int)
## 1 ALASKA on time Los.Angeles 497
## 2 delayed Los.Angeles 62
## 3 AMWEST on time Los.Angeles 694
## 4 delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 delayed Phoenix 12
## 7 AMWEST on time Phoenix 4840
## 8 delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 delayed San.Diego 20
## 11 AMWEST on time San.Diego 383
## 12 delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 delayed San.Francisco 102
## 15 AMWEST on time San.Francisco 320
## 16 delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 delayed Seattle 61
## Fill in blank airline names to maintain data integrity
tidy_airline_data$airline <- c("ALASKA", "ALASKA", "AMWEST", "AMWEST")
## Look at the data again to see blanks are filled in
tidy_airline_data
## Source: local data frame [20 x 4]
##
## airline status city Number of flights
## (chr) (fctr) (fctr) (int)
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AMWEST on time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST on time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AMWEST on time San.Diego 383
## 12 AMWEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AMWEST on time San.Francisco 320
## 16 AMWEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
## Group by airline
## Calculate total number of flights per airline
tidy_al_data_by_al <- ## Saved to new table to preserve integrity of previous table
tidy_airline_data %>%
group_by(airline) %>%
mutate(total_flights_by_airline = sum(`Number of flights`))
## Look at the new column
tidy_al_data_by_al$total_flights_by_airline
## [1] 3775 3775 7225 7225 3775 3775 7225 7225 3775 3775 7225 7225 3775 3775
## [15] 7225 7225 3775 3775 7225 7225
## Group by airline and then status
## Calculate percentage of flights for each status for each city
tidy_al_data_by_al_stat <- ## Saved to new table to preserve integrity of previous table
tidy_al_data_by_al %>%
group_by(airline, status) %>%
mutate(pct_of_flights_by_al_n_stat = `Number of flights` *100 / total_flights_by_airline)
## Look at the new table
tidy_al_data_by_al_stat
## Source: local data frame [20 x 6]
## Groups: airline, status [4]
##
## airline status city Number of flights
## (chr) (fctr) (fctr) (int)
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AMWEST on time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST on time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AMWEST on time San.Diego 383
## 12 AMWEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AMWEST on time San.Francisco 320
## 16 AMWEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
## Variables not shown: total_flights_by_airline (int),
## pct_of_flights_by_al_n_stat (dbl)
tidy_al_data_by_al_stat$pct_of_flights_by_al_n_stat
## [1] 13.1655629 1.6423841 9.6055363 1.6193772 5.8543046 0.3178808
## [7] 66.9896194 5.7439446 5.6158940 0.5298013 5.3010381 0.8996540
## [13] 13.3245033 2.7019868 4.4290657 1.7854671 48.7682119 8.0794702
## [19] 2.7820069 0.8442907
## Filter latest data set to include only delayed flights
## Group output by airline
## Calculate sum of percentages
tidy_al_data_by_al_stat %>%
filter(status == "delayed") %>%
group_by(airline) %>%
summarise('%_flights_delayed_by_airline' = sum(pct_of_flights_by_al_n_stat))
## Source: local data frame [2 x 2]
##
## airline %_flights_delayed_by_airline
## (chr) (dbl)
## 1 ALASKA 13.27152
## 2 AMWEST 10.89273