Tyding and Transformation
Data acquisition and management
CUNY MSDS DATA 607
Rose Koh
2018/03/03
- Create a .csv file or SQL database to use a wide structure to practice tidying and transforming as below.
- Use tidyr, dplyr. (data.table optional)
- Perform analysis to compare the arrival delays for the two airlines.
- Create R Markdown, post to rpubs.com, include narrative descriptions of data cleanup work, analysis and conclusions.
- Include .Rmd file in github repository and URL for rpubs webpage.
Install | Load Packages
library(DBI)
library(RPostgres)
library(knitr)
library(data.table)
library(ggplot2)
Set working directory where rmd is located
set_wd <- function() {
library(rstudioapi) # make sure you have it installed
current_path <- getActiveDocumentContext()$path
setwd(dirname(current_path ))
print( getwd() )
}
Connect to AWS
source("./source_lib/aws_rep_connect.R")
con <- dbConnect(RPostgres::Postgres(),
host = host,
port = port,
dbname = dbname,
user = user,
password = password)
Create the DB using csv file to AWS
# check whether connection is on
is.null(con)
## [1] FALSE
# read csv file
flights <- fread("./source_lib/data/flight_delay.csv")
flights
## airlines status los_angeles phoenix san_diego san_francisco seattle
## 1: alaska on_time 497 221 212 503 1841
## 2: alaska delayed 62 12 20 102 305
## 3: amwest on_time 694 4840 383 320 201
## 4: amwest delayed 118 415 65 129 61
# add id
flights$id <- seq.int(nrow(flights))
# change column order
flights <- flights[, c(8,1,2,3,4,5,6,7)]
# Check DB list, Drop and Create DB
#dbListTables(con)
#dbExistsTable(con, "flight_delay")
#dbRemoveTable(con, "flight_delay")
dbWriteTable(con, name='flight_delay', value=flights)
Send query and Fetch the dataset
# Query the table
rs <- dbSendQuery(con, statement="SELECT * FROM flight_delay;")
# Fetch all result
data <- dbFetch(rs, n= -1)
# Change it as data table
data <- as.data.table(data)
str(data)
## Classes 'data.table' and 'data.frame': 4 obs. of 8 variables:
## $ id : int 1 2 3 4
## $ airlines : chr "alaska" "alaska" "amwest" "amwest"
## $ status : chr "on_time" "delayed" "on_time" "delayed"
## $ los_angeles : int 497 62 694 118
## $ phoenix : int 221 12 4840 415
## $ san_diego : int 212 20 383 65
## $ san_francisco: int 503 102 320 129
## $ seattle : int 1841 305 201 61
## - attr(*, ".internal.selfref")=<externalptr>
Analysis
## Airline delay rate by airlines
# use long data to make wide data : airlines, delayed, on_time
status <- dcast(melt.dt, airlines ~ status, sum)
# format as data table
status <- as.data.table(status)
# add total column
status[, total := delayed + on_time]
# rate
status[, rate:= round((delayed/total) * 100, 2), by = airlines]
status <- status[order(-rank(rate))]
kable(status, caption = "Airline delay rate by airlines")
Airline delay rate by airlines
alaska |
501 |
3274 |
3775 |
13.27 |
amwest |
788 |
6438 |
7226 |
10.91 |
## Delay rate by region
# use the long data to make wide data : destination, delayed, on_time
region <- dcast(melt.dt, destination~status, sum)
# format as data table
region <- as.data.table(region)
# add total column, calculate rate
region[, total := delayed + on_time][, rate:= round((delayed/total) * 100, 2), by = region]
# sort by rate in desc
region <- region[order(-rank(rate))]
kable(region, caption = "Airline delay rate by region")
Airline delay rate by region
san_francisco |
231 |
823 |
1054 |
21.92 |
seattle |
366 |
2042 |
2408 |
15.20 |
los_angeles |
180 |
1191 |
1371 |
13.13 |
san_diego |
85 |
595 |
680 |
12.50 |
phoenix |
427 |
5061 |
5488 |
7.78 |
## Delay rate by airline by region
region.status <- melt.dt[, .(count = sum(count)), by = .(airlines, destination, status)]
region.status <- dcast(region.status, airlines + destination ~ status)
region.status <- as.data.table(region.status)
region.status[, total := delayed + on_time][, rate:= round((delayed/total) * 100, 2), by = airlines]
region.status <- region.status[order(-rank(rate))]
kable(region.status, caption = "Delay rate by airline by region")
Delay rate by airline by region
amwest |
san_francisco |
129 |
320 |
449 |
28.73 |
amwest |
seattle |
61 |
201 |
262 |
23.28 |
alaska |
san_francisco |
102 |
503 |
605 |
16.86 |
amwest |
los_angeles |
118 |
694 |
812 |
14.53 |
amwest |
san_diego |
65 |
383 |
448 |
14.51 |
alaska |
seattle |
305 |
1841 |
2146 |
14.21 |
alaska |
los_angeles |
62 |
497 |
559 |
11.09 |
alaska |
san_diego |
20 |
212 |
232 |
8.62 |
amwest |
phoenix |
415 |
4840 |
5255 |
7.90 |
alaska |
phoenix |
12 |
221 |
233 |
5.15 |
## Visualization
ggplot(region.status, aes(airlines, rate,,
fill = airlines,
color = airlines)) +
geom_bar(stat = 'identity') +
facet_wrap(~destination, ncol = 5) +
geom_text(aes(x = airlines, y = rate,
label = paste(rate, "%"),
group = airlines,
vjust = -0.4)) +
labs(title = "Delay rate by airline by region",
x = "Airlines",
y = "Delay rate") +
theme_bw()

- Airlines Alaska airlines has 13.27% of delay rate, followed by Amwest airlines at 10.91%. Alaska airlines has 91% more flight volumes (7226) than that of Amwest airlines (3775).
- Region San francisco is the city with the highest delay rate at 21.92%, Phoenix at lowest at 7.78%.
- Airlines + Region Amwest airlines flight delay in San Francisco is the highest at 28.73%, followed by 23.28% in Seattle. Alaska airlines flight delay rate in San Francisco was the third at 16.86%. The lowest rate was shown by Alaska airline in Phoenix at 5.15%.
Clear result and Disconnect