library(knitr); #For using kable functions for tables
library(tidyr); #For tidying data
library(dplyr); #For easy data scrubbing and manipulation
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The data is uploaded into my Github repository. Downloading the data to an R object.
flights_data_master <- read.table(file = "https://raw.githubusercontent.com/arunk13/MSDA-Assignments/master/IS607Fall2015/Assignment6/FlightStatus.csv", stringsAsFactors = FALSE, sep = ",", header = TRUE, check.names = FALSE);
## Warning in read.table(file = "https://raw.githubusercontent.com/arunk13/
## MSDA-Assignments/master/IS607Fall2015/Assignment6/FlightStatus.csv", :
## incomplete final line found by readTableHeader on 'https://
## raw.githubusercontent.com/arunk13/MSDA-Assignments/master/IS607Fall2015/
## Assignment6/FlightStatus.csv'
flights_data <- flights_data_master; #Working copy
Below is the raw data :
kable(flights_data, align = c('c'));
| Airlines | Arrival_Status | Los Angeles | Phoenix | San Diego | San Fransisco | Seattle |
|---|---|---|---|---|---|---|
| Alaska | On Time | 497 | 221 | 212 | 503 | 1841 |
| Alaska | Delayed | 62 | 12 | 20 | 102 | 305 |
| AM West | On Time | 694 | 4840 | 383 | 320 | 201 |
| AM West | Delayed | 117 | 415 | 65 | 129 | 61 |
I find the below variables in my dataset :
a. Airlines
b. Arrival_status
c. City
For easier analysis, I will like to convert my dataset from a wide format to a long format. :
flights_data_long <- flights_data %>%
gather("City", "Stats", 3:ncol(flights_data));
kable(head(flights_data_long), align = c('c'));
| Airlines | Arrival_Status | City | Stats |
|---|---|---|---|
| Alaska | On Time | Los Angeles | 497 |
| Alaska | Delayed | Los Angeles | 62 |
| AM West | On Time | Los Angeles | 694 |
| AM West | Delayed | Los Angeles | 117 |
| Alaska | On Time | Phoenix | 221 |
| Alaska | Delayed | Phoenix | 12 |
As we have to do analysis on arrivals, I think that it will be a good idea to spread the arrival and delay times separately.
flights_data_long <- flights_data_long %>%
spread(Arrival_Status, Stats) %>%
group_by(Airlines, City);
kable(flights_data_long, align = c('c'));
| Airlines | City | Delayed | On Time |
|---|---|---|---|
| Alaska | Los Angeles | 62 | 497 |
| Alaska | Phoenix | 12 | 221 |
| Alaska | San Diego | 20 | 212 |
| Alaska | San Fransisco | 102 | 503 |
| Alaska | Seattle | 305 | 1841 |
| AM West | Los Angeles | 117 | 694 |
| AM West | Phoenix | 415 | 4840 |
| AM West | San Diego | 65 | 383 |
| AM West | San Fransisco | 129 | 320 |
| AM West | Seattle | 61 | 201 |
flights_data_final <- flights_data_long %>%
group_by(Airlines) %>%
summarise(delayed_arrivals = sum(Delayed), on_time_arrivals = sum(`On Time`),`delayed_arrivals(%)` = round((delayed_arrivals /(delayed_arrivals+on_time_arrivals))*100, digits = 2)) %>% select(Airlines, `delayed_arrivals(%)`);
kable(flights_data_final, caption = "Arrival delay comparison by airports", align = c('c'));
| Airlines | delayed_arrivals(%) |
|---|---|
| Alaska | 13.27 |
| AM West | 10.89 |