Assignment 05 - Tidying and Transforming Data

STEP 1 : Always load your libraries

# Good Practise: Basic house keeping: cleanup the env before you start new work
rm(list=ls())

# Libraries 
library(tidyverse)
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(RMySQL)
## Loading required package: DBI
library(ggplot2)
library(knitr)

STEP 2 : MySQL DAtabase Connection

Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

Exihibit of the script used to create table using mysql

/*
tidydata tables

DROP TABLE IF EXISTS tidydata;

# Create new table
CREATE TABLE tidydata           
(
  airline           varchar(20) ,  
  status            varchar(20) NOT NULL,  
  LosAngeles        integer NOT NULL,
  Phoenix           integer NOT NULL,
  SanDiego          integer NOT NULL,
  SanFrancisco      integer NOT NULL,
  Seattle           integer NOT NULL
);

insert into tidydata values ("Alaska", "on time", 497, 221, 212, 503, 1841);
insert into tidydata values ("Alaska", "delayed", 62, 12, 20, 102, 305);
insert into tidydata values ("AM West", "on time", 694, 4840, 383, 320, 201);
insert into tidydata values ("AM West", "delayed", 117, 415, 65, 129, 61) ;
*/

(a) create dbConnect

##
# Create a database connection
##

mydb = dbConnect(MySQL(), user='root', password='mysql57', host='localhost', dbname="tb")

(b) select data fron tidydata

##
# Create a Database connection
##

untidywidedata <- fetch(dbSendQuery(mydb, "select * from tidydata;"))
head(untidywidedata)
##   airline  status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1  Alaska on time        497     221      212          503    1841
## 2  Alaska delayed         62      12       20          102     305
## 3 AM West on time        694    4840      383          320     201
## 4 AM West delayed        117     415       65          129      61

The above shows data in the wide format

STEP 3 : Transform Data

Use gather() and spread() to tidy and transform your data
##
# Use gather() to gather all the city names
##
tidylongdata <- gather(untidywidedata, city, flights, 3:7)
head(tidylongdata)
##   airline  status       city flights
## 1  Alaska on time LosAngeles     497
## 2  Alaska delayed LosAngeles      62
## 3 AM West on time LosAngeles     694
## 4 AM West delayed LosAngeles     117
## 5  Alaska on time    Phoenix     221
## 6  Alaska delayed    Phoenix      12
##
# Use spread() to spread the data based on flight status
##
tidydata <- spread(tidylongdata, status, flights)
head(tidydata)
##   airline         city delayed on time
## 1  Alaska   LosAngeles      62     497
## 2  Alaska      Phoenix      12     221
## 3  Alaska     SanDiego      20     212
## 4  Alaska SanFrancisco     102     503
## 5  Alaska      Seattle     305    1841
## 6 AM West   LosAngeles     117     694
tidydata
##    airline         city delayed on time
## 1   Alaska   LosAngeles      62     497
## 2   Alaska      Phoenix      12     221
## 3   Alaska     SanDiego      20     212
## 4   Alaska SanFrancisco     102     503
## 5   Alaska      Seattle     305    1841
## 6  AM West   LosAngeles     117     694
## 7  AM West      Phoenix     415    4840
## 8  AM West     SanDiego      65     383
## 9  AM West SanFrancisco     129     320
## 10 AM West      Seattle      61     201

STEP 4 : Perform Analysis

Perform analysis
Use group_by() to group by different criteria
Use summarise() to summarise
Use mutate() to mutate and percentage column
Use innerjoin() to join dataframe based on key
##
# Group by airline and summarize and then mutate 
##
airline_flights <- group_by(tidydata, airline) %>% 
    summarise(total_delay=sum(delayed), total_ontime=sum(`on time`), total=sum(delayed+`on time`))

head(airline_flights)
## # A tibble: 2 x 4
##   airline total_delay total_ontime total
##   <chr>         <int>        <int> <int>
## 1 Alaska          501         3274  3775
## 2 AM West         787         6438  7225
summary_by_airline <- group_by(tidydata, airline) %>% 
    summarise(avg_delay = mean(delayed), avg_on_time = mean(`on time`)) %>% 
    mutate(percent_delay=100*(avg_delay/(avg_delay+avg_on_time))) 

summary_by_airline <- inner_join(summary_by_airline, airline_flights, by="airline")
kable(summary_by_airline)
airline avg_delay avg_on_time percent_delay total_delay total_ontime total
Alaska 100.2 654.8 13.27152 501 3274 3775
AM West 157.4 1287.6 10.89273 787 6438 7225
##
# Group by city and summarize and then mutate 
##
summary_by_city <- group_by(tidydata, city) %>% 
    summarise(avg_delay = mean(delayed), avg_on_time = mean(`on time`)) %>% 
    mutate(percent_delay=100*(avg_delay/(avg_delay+avg_on_time)))


kable(summary_by_city)
city avg_delay avg_on_time percent_delay
LosAngeles 89.5 595.5 13.065693
Phoenix 213.5 2530.5 7.780612
SanDiego 42.5 297.5 12.500000
SanFrancisco 115.5 411.5 21.916508
Seattle 183.0 1021.0 15.199336

STEP 5 : Create Visualizations

Create visualization like box plot, scatterplot
5.1 Boxplot to see total delays by airline
##
# Total Delays by Airline
##

ggplot(data=tidydata, aes(airline, delayed))+
    geom_boxplot()+
    theme_bw()+
    labs(title="Total Delays by Airline", x="Airline Name", y="Total Delays")+
    theme(plot.title = element_text(hjust = 0.5))

##### This graph is misleading as it makes one believe that AM West had higher delays. While AM West did have higher number of delays but since they had greater number of flights, the percent of delays AM West was lower than Alaska airlines.

5.2 Scatterplot to see total delays by airline
ggplot(data=summary_by_city, aes(city, percent_delay, color=city))+
    geom_point()+
    theme_bw()+
    labs(title="Percent Delays by City", x="City", y="Percent Delay")+
    theme(plot.title = element_text(hjust = 0.5))

STEP 5: Analysis

Summary of Findings

1. As seen in summary_by_airline$percent_delay that Alaska airline had 13.27% delays while AM West had 10.89% delays. AM West seems to have slightly higher on time performance.
2. Total number of flights flown by Alaska were 3775 while AM West had 7225 flights
3. As presented by dataframe summary_by_city, City of Pheonix had only 7.78% delays (lowest) while San Francisco had 21.9% delays (highest)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.