Tyding and Transformation

library(readr)
library(stringr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Importing csv file from my github

airline <- read.csv("https://raw.githubusercontent.com/AlainKuiete/DATA607/master/arrival_delays.csv")

removing the blank row

arr.delay <- airline[-3,]
row.names(arr.delay)<-c(1:4)
airline
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503   1,841
## 2         delayed          62      12        20           102     305
## 3  \t\t\t\t\t\t                  NA                NA            NA        
## 4 AM WEST on time         694   4,840       383           320     201
## 5         delayed         117     415        65           129      61

Suppressing comma in numeric data

arr.delay$Seattle <- gsub(",", "",arr.delay$Seattle)
arr.delay$Phoenix <- gsub(",", "",arr.delay$Phoenix)

Filling the blank cells

arr.delay$X[2] <- arr.delay$X[1]
arr.delay$X[4] <- arr.delay$X[3]
arr.delay
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

Putting the appropriate variables. Los.Angeles, Phoenix,… at the header of columns are values of the variable airline.

First transformatiom switch column to row and add the intermediate count column

arr.delay <- gather(arr.delay,"Destination", "Count", 3:7)

Second Transformation split the (on time - delayed)column in two column.

arr.delay <- spread(arr.delay, X.1, Count)
colnames(arr.delay) <- c("Airline", "Destination", "Delayed", "OnTime")

Converting columns Delayed and OnTime to numeric

arr.delay$Delayed <- as.numeric(arr.delay$Delayed) 
arr.delay$OnTime <- as.numeric(arr.delay$OnTime)
arr.delay
##    Airline   Destination Delayed OnTime
## 1   ALASKA   Los.Angeles      62    497
## 2   ALASKA       Phoenix      12    221
## 3   ALASKA     San.Diego      20    212
## 4   ALASKA San.Francisco     102    503
## 5   ALASKA       Seattle     305   1841
## 6  AM WEST   Los.Angeles     117    694
## 7  AM WEST       Phoenix     415   4840
## 8  AM WEST     San.Diego      65    383
## 9  AM WEST San.Francisco     129    320
## 10 AM WEST       Seattle      61    201

Rate of Arrival delays per flight for Alaska

AD <- sum(arr.delay$Delayed[which(arr.delay$Airline == "ALASKA")])
AO <- sum(arr.delay$OnTime[which(arr.delay$Airline == "ALASKA")])
RA <- AD/(AO + AD)
RA
## [1] 0.1327152

Rate of Arrival delays per flight for AM WEST

WD <- sum(arr.delay$Delayed[which(arr.delay$Airline == "AM WEST")])
WO <- sum(arr.delay$OnTime[which(arr.delay$Airline == "AM WEST")])
RW <- WD/(WO + WD)
RW
## [1] 0.1089273

Alaska rate of arriving delayed (13.3%) is higher than AM West rate of arriving delayed (10.9%)