Tyding and Transformation

Data acquisition and management

CUNY MSDS DATA 607

Rose Koh

2018/03/03
  1. Create a .csv file or SQL database to use a wide structure to practice tidying and transforming as below.
  2. Use tidyr, dplyr. (data.table optional)
  3. Perform analysis to compare the arrival delays for the two airlines.
  4. Create R Markdown, post to rpubs.com, include narrative descriptions of data cleanup work, analysis and conclusions.
  5. Include .Rmd file in github repository and URL for rpubs webpage.

Install | Load Packages

library(DBI)
library(RPostgres)
library(knitr)
library(data.table)
library(ggplot2)

Set working directory where rmd is located

set_wd <- function() {
  library(rstudioapi) # make sure you have it installed
  current_path <- getActiveDocumentContext()$path 
  setwd(dirname(current_path ))
  print( getwd() )
}

Connect to AWS

source("./source_lib/aws_rep_connect.R")
con <- dbConnect(RPostgres::Postgres(),
                 host = host,
                 port = port,
                 dbname = dbname,
                 user = user,
                 password = password)

Create the DB using csv file to AWS

In week2 assignment, I wrote the query to create table in sql client, and connected to the db in R to fetch the data.
In week5, I connected to DB to create table from R after reading the .csv file.

# check whether connection is on
is.null(con)
## [1] FALSE
# read csv file
flights <- fread("./source_lib/data/flight_delay.csv")
flights
##    airlines  status los_angeles phoenix san_diego san_francisco seattle
## 1:   alaska on_time         497     221       212           503    1841
## 2:   alaska delayed          62      12        20           102     305
## 3:   amwest on_time         694    4840       383           320     201
## 4:   amwest delayed         118     415        65           129      61
# add id
flights$id <- seq.int(nrow(flights))

# change column order
flights <- flights[, c(8,1,2,3,4,5,6,7)]

# Check DB list, Drop and Create DB
#dbListTables(con)
#dbExistsTable(con, "flight_delay")
#dbRemoveTable(con, "flight_delay")
dbWriteTable(con, name='flight_delay', value=flights)

Send query and Fetch the dataset

# Query the table
rs <- dbSendQuery(con, statement="SELECT * FROM flight_delay;")

# Fetch all result
data <- dbFetch(rs, n= -1)

# Change it as data table
data <- as.data.table(data)
str(data)
## Classes 'data.table' and 'data.frame':   4 obs. of  8 variables:
##  $ id           : int  1 2 3 4
##  $ airlines     : chr  "alaska" "alaska" "amwest" "amwest"
##  $ status       : chr  "on_time" "delayed" "on_time" "delayed"
##  $ los_angeles  : int  497 62 694 118
##  $ phoenix      : int  221 12 4840 415
##  $ san_diego    : int  212 20 383 65
##  $ san_francisco: int  503 102 320 129
##  $ seattle      : int  1841 305 201 61
##  - attr(*, ".internal.selfref")=<externalptr>

Let’s transform the data

## Perform analysis to compare the arrival delays for the two airlines
# data.table has reshape2 in its package. Using melt does not require to load reshape2.

## wide data to long data : airlines, status, destination, count
data$id <- NULL
melt.dt <- melt(data, id.vars = c("airlines", "status"),
                variable.name = "destination",
                value.name = "count")
# form it as data table
melt.dt <- as.data.table(melt.dt)
setkey(melt.dt)
## set key to set key on a DT: data sorts on the column by the reference : Very useful for `join`
## e.g.
# a <- melt.dt[, sum(count), by = airlines]
# b <- melt.dt[status == 'delayed'][, sum(count), by = airlines]
# setkey(a, airlines)
# setkey(b, airlines)
# c <- merge(a,b, by = "airlines")
# setnames(c, c("V1.x", "V1.y"), c("total", "delay"))

Analysis

## Airline delay rate by airlines
# use long data to make wide data : airlines, delayed, on_time
status <- dcast(melt.dt, airlines ~ status, sum)
# format as data table
status <- as.data.table(status)
# add total column
status[, total := delayed + on_time]
# rate
status[, rate:= round((delayed/total) * 100, 2), by = airlines]
status <- status[order(-rank(rate))]
kable(status, caption = "Airline delay rate by airlines")
Airline delay rate by airlines
airlines delayed on_time total rate
alaska 501 3274 3775 13.27
amwest 788 6438 7226 10.91
## Delay rate by region
# use the long data to make wide data : destination, delayed, on_time
region <- dcast(melt.dt, destination~status, sum)
# format as data table
region <- as.data.table(region)
# add total column, calculate rate
region[, total := delayed + on_time][, rate:= round((delayed/total) * 100, 2), by = region]
# sort by rate in desc
region <- region[order(-rank(rate))]
kable(region, caption = "Airline delay rate by region")
Airline delay rate by region
destination delayed on_time total rate
san_francisco 231 823 1054 21.92
seattle 366 2042 2408 15.20
los_angeles 180 1191 1371 13.13
san_diego 85 595 680 12.50
phoenix 427 5061 5488 7.78
## Delay rate by airline by region
region.status <- melt.dt[, .(count = sum(count)), by = .(airlines, destination, status)]
region.status <- dcast(region.status, airlines + destination ~ status)
region.status <- as.data.table(region.status)
region.status[, total := delayed + on_time][, rate:= round((delayed/total) * 100, 2), by = airlines]
region.status <- region.status[order(-rank(rate))]
kable(region.status, caption = "Delay rate by airline by region")
Delay rate by airline by region
airlines destination delayed on_time total rate
amwest san_francisco 129 320 449 28.73
amwest seattle 61 201 262 23.28
alaska san_francisco 102 503 605 16.86
amwest los_angeles 118 694 812 14.53
amwest san_diego 65 383 448 14.51
alaska seattle 305 1841 2146 14.21
alaska los_angeles 62 497 559 11.09
alaska san_diego 20 212 232 8.62
amwest phoenix 415 4840 5255 7.90
alaska phoenix 12 221 233 5.15
## Visualization
ggplot(region.status, aes(airlines, rate,,
                               fill = airlines, 
                               color = airlines)) + 
                      geom_bar(stat = 'identity') + 
                      facet_wrap(~destination, ncol = 5) +
                      geom_text(aes(x = airlines, y = rate, 
                                    label = paste(rate, "%"),
                                    group = airlines,
                                    vjust = -0.4)) +
                      labs(title = "Delay rate by airline by region", 
                         x = "Airlines", 
                         y = "Delay rate") +
                      theme_bw() 

  • Airlines Alaska airlines has 13.27% of delay rate, followed by Amwest airlines at 10.91%. Alaska airlines has 91% more flight volumes (7226) than that of Amwest airlines (3775).
  • Region San francisco is the city with the highest delay rate at 21.92%, Phoenix at lowest at 7.78%.
  • Airlines + Region Amwest airlines flight delay in San Francisco is the highest at 28.73%, followed by 23.28% in Seattle. Alaska airlines flight delay rate in San Francisco was the third at 16.86%. The lowest rate was shown by Alaska airline in Phoenix at 5.15%.

Clear result and Disconnect

References

datatable cheatsheet