Tyding and Transformation
library(readr)
library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Importing csv file from my github
airline <- read.csv("https://raw.githubusercontent.com/AlainKuiete/DATA607/master/arrival_delays.csv")
removing the blank row
arr.delay <- airline[-3,]
row.names(arr.delay)<-c(1:4)
airline
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 \t\t\t\t\t\t NA NA NA
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
Suppressing comma in numeric data
arr.delay$Seattle <- gsub(",", "",arr.delay$Seattle)
arr.delay$Phoenix <- gsub(",", "",arr.delay$Phoenix)
Filling the blank cells
arr.delay$X[2] <- arr.delay$X[1]
arr.delay$X[4] <- arr.delay$X[3]
arr.delay
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Putting the appropriate variables. Los.Angeles, Phoenix,… at the header of columns are values of the variable airline.
Converting columns Delayed and OnTime to numeric
arr.delay$Delayed <- as.numeric(arr.delay$Delayed)
arr.delay$OnTime <- as.numeric(arr.delay$OnTime)
arr.delay
## Airline Destination Delayed OnTime
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
Rate of Arrival delays per flight for Alaska
AD <- sum(arr.delay$Delayed[which(arr.delay$Airline == "ALASKA")])
AO <- sum(arr.delay$OnTime[which(arr.delay$Airline == "ALASKA")])
RA <- AD/(AO + AD)
RA
## [1] 0.1327152
Rate of Arrival delays per flight for AM WEST
WD <- sum(arr.delay$Delayed[which(arr.delay$Airline == "AM WEST")])
WO <- sum(arr.delay$OnTime[which(arr.delay$Airline == "AM WEST")])
RW <- WD/(WO + WD)
RW
## [1] 0.1089273
Alaska rate of arriving delayed (13.3%) is higher than AM West rate of arriving delayed (10.9%)