The Task
The chart above describes arrival delays for two airlines across five destinations. Your task is to:
- Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
- Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
- Perform analysis to compare the arrival delays for the two airlines.
- Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission: The URL to the .Rmd file in your GitHub repository and??? URL for your rpubs.com web page.
Load Packages
knitr::opts_chunk$set(#echo=FALSE,
warning=FALSE,
message=FALSE,
tidy=F,
#comment = "",
dev="png",
dev.args=list(type="cairo"))
#https://cran.r-project.org/web/packages/prettydoc/vignettes/
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf
load.packages <- c("stringr","prettydoc", "tidyr", "dplyr", "knitr")
ipak <- function(pkg){
#FUNCTION SOURCE: https://gist.github.com/stevenworthington/3178163
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE, quietly = TRUE, warn.conflicts = FALSE)
}
ipak(load.packages)
## stringr prettydoc tidyr dplyr knitr
## TRUE TRUE TRUE TRUE TRUE
Load data, fill in missing values & take a look at it
my_data <- read.csv("https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/numbersense_flight_data.csv", stringsAsFactors = F)
#Fill in the airline values and insert underscore into "on time"
my_data[2,1] <- my_data[1,1]
my_data[5,1] <- my_data[4,1]
my_data[,2] <- sapply(my_data[,2], str_replace, " ", "_")
kable(my_data)
| ALASKA |
on_time |
497 |
221 |
212 |
503 |
1841 |
| ALASKA |
delayed |
62 |
12 |
20 |
102 |
305 |
|
|
NA |
NA |
NA |
NA |
NA |
| AM WEST |
on_time |
694 |
4840 |
383 |
320 |
201 |
| AM WEST |
delayed |
117 |
415 |
65 |
129 |
61 |
Time to tidy this data up!
tidy_data <- my_data %>%
na.omit() %>%
rename(airline = X, arrival_type = X.1) %>%
gather("arrival_city", "n", 3:7) %>%
spread(arrival_type, "n") %>%
mutate(total_arrivals = delayed + on_time, on_time_rate = on_time / total_arrivals) %>%
arrange(desc(total_arrivals))
tidy_data[,2] <- sapply(tidy_data[,2], str_replace, "\\.", " ")
Let’s take a look at the results
## Observations: 10
## Variables: 6
## $ airline <chr> "AM WEST", "ALASKA", "AM WEST", "ALASKA", "ALAS...
## $ arrival_city <chr> "Phoenix", "Seattle", "Los Angeles", "San Franc...
## $ delayed <int> 415, 305, 117, 102, 62, 129, 65, 61, 12, 20
## $ on_time <int> 4840, 1841, 694, 503, 497, 320, 383, 201, 221, 212
## $ total_arrivals <int> 5255, 2146, 811, 605, 559, 449, 448, 262, 233, 232
## $ on_time_rate <dbl> 0.9210276, 0.8578751, 0.8557337, 0.8314050, 0.8...
| AM WEST |
Phoenix |
415 |
4840 |
5255 |
0.9210276 |
| ALASKA |
Seattle |
305 |
1841 |
2146 |
0.8578751 |
| AM WEST |
Los Angeles |
117 |
694 |
811 |
0.8557337 |
| ALASKA |
San Francisco |
102 |
503 |
605 |
0.8314050 |
| ALASKA |
Los Angeles |
62 |
497 |
559 |
0.8890877 |
| AM WEST |
San Francisco |
129 |
320 |
449 |
0.7126949 |
| AM WEST |
San Diego |
65 |
383 |
448 |
0.8549107 |
| AM WEST |
Seattle |
61 |
201 |
262 |
0.7671756 |
| ALASKA |
Phoenix |
12 |
221 |
233 |
0.9484979 |
| ALASKA |
San Diego |
20 |
212 |
232 |
0.9137931 |
Now let’s do some basic analysis with dply
Which airline & city had the best on-time arrival rate?
tidy_data %>%
filter(on_time_rate == max(on_time_rate)) %>%
kable()
| ALASKA |
Phoenix |
12 |
221 |
233 |
0.9484979 |
Which airline had the better on-time arrival rate?
tidy_data %>%
group_by(airline) %>%
summarise(airline_on_time_rate = sum(on_time) / sum(total_arrivals)) %>%
filter(airline_on_time_rate == max(airline_on_time_rate)) %>%
kable()
Let’s rank the cities by their overall on-time arrival rates.
tidy_data %>%
group_by(arrival_city) %>%
summarise(city_on_time_rate = sum(on_time) / sum(total_arrivals)) %>%
mutate(on_time_ranking = min_rank(desc(city_on_time_rate))) %>%
arrange(on_time_ranking) %>%
kable()
| Phoenix |
0.9221939 |
1 |
| San Diego |
0.8750000 |
2 |
| Los Angeles |
0.8693431 |
3 |
| Seattle |
0.8480066 |
4 |
| San Francisco |
0.7808349 |
5 |