Practicing data clean-up and transformations using tidyr and dplyr. CRAN documentation available for tidyr is available here and dplr documentation can be found here.
library(tidyr)
library(dplyr)
# Create a dataframe that contains the data pictured above:
hw5data <- data.frame(
airline = rep(c('Alaska', 'AMWest'), each = 2),
status = c('on-time', 'delayed', 'on-time', 'delayed'),
LosAngeles = c(497,62,694,117),
Phoenix = c(221,12,4840,415),
SanDiego = c(212,20,383,65),
SanFrancisco = c(503,102,320,129),
Seattle = c(1841,305,201,61)
)
# Write the data to a CSV and save it to current working directory
write.csv(hw5data, "IS607_HW5_data.csv")
data <- read.csv("IS607_HW5_data.csv") # Reads in the CSV file
str(data) # check out the structure
## 'data.frame': 4 obs. of 8 variables:
## $ X : int 1 2 3 4
## $ airline : Factor w/ 2 levels "Alaska","AMWest": 1 1 2 2
## $ status : Factor w/ 2 levels "delayed","on-time": 2 1 2 1
## $ LosAngeles : int 497 62 694 117
## $ Phoenix : int 221 12 4840 415
## $ SanDiego : int 212 20 383 65
## $ SanFrancisco: int 503 102 320 129
## $ Seattle : int 1841 305 201 61
data <- data[,2:7] # remove ID column (X)
data
## airline status LosAngeles Phoenix SanDiego SanFrancisco
## 1 Alaska on-time 497 221 212 503
## 2 Alaska delayed 62 12 20 102
## 3 AMWest on-time 694 4840 383 320
## 4 AMWest delayed 117 415 65 129
tidy_data <- data %>%
gather(city, count, -airline, -status) # Convert the data into a long format using gather() from tidyr package
tidy_data
## airline status city count
## 1 Alaska on-time LosAngeles 497
## 2 Alaska delayed LosAngeles 62
## 3 AMWest on-time LosAngeles 694
## 4 AMWest delayed LosAngeles 117
## 5 Alaska on-time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AMWest on-time Phoenix 4840
## 8 AMWest delayed Phoenix 415
## 9 Alaska on-time SanDiego 212
## 10 Alaska delayed SanDiego 20
## 11 AMWest on-time SanDiego 383
## 12 AMWest delayed SanDiego 65
## 13 Alaska on-time SanFrancisco 503
## 14 Alaska delayed SanFrancisco 102
## 15 AMWest on-time SanFrancisco 320
## 16 AMWest delayed SanFrancisco 129
# Calculate total number of flights for each airline using aggregate():
total_flights <- aggregate(tidy_data$count, by=list(airline=tidy_data$airline), FUN=sum)
colnames(total_flights) <- c("airline", "count") # name columns in resulting df
total_flights
## airline count
## 1 Alaska 1629
## 2 AMWest 6963
# Sum the total number of delayed flights for each airline using dplyr filter()
# Alaska delayed flights:
d_alaska <- filter(tidy_data, airline == "Alaska", status == "delayed")
d_alaska <- sum(d_alaska$count)
d_alaska
## [1] 196
# AMWest delayed flights:
d_AMWest <- filter(tidy_data, airline == "AMWest", status == "delayed")
d_AMWest <- sum(d_AMWest$count)
d_AMWest
## [1] 726
# Calculate the percentage of delayed flights for each airline:
p_d_alaska <- d_alaska/total_flights$count[1] # percentage of delays with Alaska
p_d_alaska
## [1] 0.1203192
p_d_AMWest <- d_AMWest/total_flights$count[2] # percentage of delays with AMWest
p_d_AMWest
## [1] 0.1042654
# Find difference between percentage of delays among airlines using absolute value:
abs(p_d_AMWest - p_d_alaska)
## [1] 0.01605381