Data 607 Homework Assignment 5

First, import the data set into R: read the CSV file from the URL below and store it as a data frame under a new name.

# Load the wide-format flight counts (one row per airline/status pair,
# one column per city) directly from the GitHub-hosted CSV.
samplesflights <- read.csv(
  file = "https://raw.githubusercontent.com/ntlrs/607homework5/master/sampleflights%20-%20final.csv"
)
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'https://
## raw.githubusercontent.com/ntlrs/607homework5/master/sampleflights%20-
## %20final.csv'
samplesflights
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA   delay          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST   delay         117     415        65           129      61

Load the tidyr and dplyr packages.

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

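Reshape the data from wide to long: gather the five city columns into a single "city" key column and a "number" value column, keeping the airline and status columns as identifiers.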

# Reshape wide -> long: stack the five city columns into a `city` key
# column (stored as a factor) and a `number` value column.
sf1 <- samplesflights %>%
  gather(key = city, value = number, Los.Angeles:Seattle, factor_key = TRUE)
sf1
##          X     X.1          city number
## 1   ALASKA on time   Los.Angeles    497
## 2   ALASKA   delay   Los.Angeles     62
## 3  AM WEST on time   Los.Angeles    694
## 4  AM WEST   delay   Los.Angeles    117
## 5   ALASKA on time       Phoenix    221
## 6   ALASKA   delay       Phoenix     12
## 7  AM WEST on time       Phoenix   4840
## 8  AM WEST   delay       Phoenix    415
## 9   ALASKA on time     San.Diego    212
## 10  ALASKA   delay     San.Diego     20
## 11 AM WEST on time     San.Diego    383
## 12 AM WEST   delay     San.Diego     65
## 13  ALASKA on time San.Francisco    503
## 14  ALASKA   delay San.Francisco    102
## 15 AM WEST on time San.Francisco    320
## 16 AM WEST   delay San.Francisco    129
## 17  ALASKA on time       Seattle   1841
## 18  ALASKA   delay       Seattle    305
## 19 AM WEST on time       Seattle    201
## 20 AM WEST   delay       Seattle     61

Rename the columns so they are easier to work with in the analysis.

# Give the first four columns descriptive names in a single assignment.
colnames(sf1)[1:4] <- c("Airline", "Status", "City", "Frequency")

Cast “Airline” and “Status” to factors.

# Convert both identifier columns to factors in one step.
sf1 <- sf1 %>%
  mutate(
    Airline = as.factor(Airline),
    Status = as.factor(Status)
  )

Spread the data so that “delay” and “on time” become separate columns.

# Turn each Status level into its own column of Frequency counts, giving
# one row per Airline/City combination.
cleandata <- sf1 %>%
  spread(key = Status, value = Frequency)
cleandata
##    Airline          City delay on time
## 1   ALASKA   Los.Angeles    62     497
## 2   ALASKA       Phoenix    12     221
## 3   ALASKA     San.Diego    20     212
## 4   ALASKA San.Francisco   102     503
## 5   ALASKA       Seattle   305    1841
## 6  AM WEST   Los.Angeles   117     694
## 7  AM WEST       Phoenix   415    4840
## 8  AM WEST     San.Diego    65     383
## 9  AM WEST San.Francisco   129     320
## 10 AM WEST       Seattle    61     201

Perform the analysis: compute the total number of flights and the proportions of on-time and delayed flights.

library(dplyr)
# Total flights for each Airline/City row.
cleandata1 <- mutate(cleandata, total.flights = delay + `on time`)
# Proportion of on-time and delayed flights, named OnTime / Delay directly
# (they land in columns 6 and 7), with rows ordered by City.
cleandata2 <- cleandata1 %>%
  mutate(
    OnTime = `on time` / total.flights,
    Delay = delay / total.flights
  ) %>%
  arrange(City)
# Keep only the identifiers and the two proportions for reporting.
cleandata3 <- select(cleandata2, Airline, City, OnTime, Delay)
cleandata3
##    Airline          City    OnTime      Delay
## 1   ALASKA   Los.Angeles 0.8890877 0.11091234
## 2  AM WEST   Los.Angeles 0.8557337 0.14426634
## 3   ALASKA       Phoenix 0.9484979 0.05150215
## 4  AM WEST       Phoenix 0.9210276 0.07897241
## 5   ALASKA     San.Diego 0.9137931 0.08620690
## 6  AM WEST     San.Diego 0.8549107 0.14508929
## 7   ALASKA San.Francisco 0.8314050 0.16859504
## 8  AM WEST San.Francisco 0.7126949 0.28730512
## 9   ALASKA       Seattle 0.8578751 0.14212488
## 10 AM WEST       Seattle 0.7671756 0.23282443
# Per-airline summary over the city rows: unweighted mean, minimum and
# maximum of the on-time and delay proportions.
cleandata4 <- cleandata3 %>%
  group_by(Airline) %>%
  summarise(
    avg_ontime = mean(OnTime),
    avg_delay = mean(Delay),
    min_ontime = min(OnTime),
    max_ontime = max(OnTime),
    min_delay = min(Delay),
    max_delay = max(Delay)
  )
cleandata4
## # A tibble: 2 x 7
##   Airline avg_ontime avg_delay min_ontime max_ontime  min_delay max_delay
##    <fctr>      <dbl>     <dbl>      <dbl>      <dbl>      <dbl>     <dbl>
## 1  ALASKA  0.8881317 0.1118683  0.8314050  0.9484979 0.05150215 0.1685950
## 2 AM WEST  0.8223085 0.1776915  0.7126949  0.9210276 0.07897241 0.2873051
library(ggplot2)
# On-time proportion for each city, one point shape per airline.
# Airline is mapped directly: it is already a factor, and wrapping it in
# factor() only made the legend title read "factor(Airline)".
plot <- ggplot(cleandata3, aes(City, OnTime)) +
  geom_point(aes(shape = Airline))
plot

# Delay proportion for each city, drawn the same way.
plot2 <- ggplot(cleandata3, aes(City, Delay)) +
  geom_point(aes(shape = Airline))
plot2

The two plots show the likelihood of having an on-time or delayed flight by location. Your best chance of having an on-time flight is if you are flying with Alaska Airlines from Phoenix. You're most likely to be delayed coming out of San Francisco on AM West.