Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
Read CSV file Rename Columns
airplanes <-read.csv("/Users/christinakasman/Desktop/airplanes data607.csv", header=TRUE, sep=",", stringsAsFactors=FALSE)
airplanes
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM West on time 694 4850 383 320 201
## 5 Delayed 117 415 65 129 61
## 6 NA NA NA NA NA
## 7 NA NA NA NA NA
## 8 NA NA NA NA NA
## 9 NA NA NA NA NA
## 10 NA NA NA NA NA
## 11 NA NA NA NA NA
## 12 NA NA NA NA NA
## 13 NA NA NA NA NA
## 14 NA NA NA NA NA
## 15 NA NA NA NA NA
## 16 NA NA NA NA NA
## 17 NA NA NA NA NA
## 18 NA NA NA NA NA
## 19 NA NA NA NA NA
## 20 NA NA NA NA NA
## 21 NA NA NA NA NA
names(airplanes) <- c("Airline", "Status", "LosAngeles", "Phoenix", "SanDiego", "SanFrancisco", "Seattle")
airplanes
## Airline Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM West on time 694 4850 383 320 201
## 5 Delayed 117 415 65 129 61
## 6 NA NA NA NA NA
## 7 NA NA NA NA NA
## 8 NA NA NA NA NA
## 9 NA NA NA NA NA
## 10 NA NA NA NA NA
## 11 NA NA NA NA NA
## 12 NA NA NA NA NA
## 13 NA NA NA NA NA
## 14 NA NA NA NA NA
## 15 NA NA NA NA NA
## 16 NA NA NA NA NA
## 17 NA NA NA NA NA
## 18 NA NA NA NA NA
## 19 NA NA NA NA NA
## 20 NA NA NA NA NA
## 21 NA NA NA NA NA
Remove NA values using drop_na from plyr package
airplanes5 <- airplanes %>% drop_na()
airplanes5
## Airline Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Delayed 62 12 20 102 305
## 4 AM West on time 694 4850 383 320 201
## 5 Delayed 117 415 65 129 61
for (i in 1:dim(airplanes5)[1]){
if (i %% 2 == 0){
airplanes5$Airline[i] <- airplanes5$Airline[i-1]
}
}
airplanes5
## Airline Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska Delayed 62 12 20 102 305
## 4 AM West on time 694 4850 383 320 201
## 5 AM West Delayed 117 415 65 129 61
Use gather function to transform data into Tidy dataset
airplanesnew <- airplanes5%>% gather("city", "Count", LosAngeles:Seattle)
airplanesnew
## Airline Status city Count
## 1 Alaska on time LosAngeles 497
## 2 Alaska Delayed LosAngeles 62
## 3 AM West on time LosAngeles 694
## 4 AM West Delayed LosAngeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska Delayed Phoenix 12
## 7 AM West on time Phoenix 4850
## 8 AM West Delayed Phoenix 415
## 9 Alaska on time SanDiego 212
## 10 Alaska Delayed SanDiego 20
## 11 AM West on time SanDiego 383
## 12 AM West Delayed SanDiego 65
## 13 Alaska on time SanFrancisco 503
## 14 Alaska Delayed SanFrancisco 102
## 15 AM West on time SanFrancisco 320
## 16 AM West Delayed SanFrancisco 129
## 17 Alaska on time Seattle 1841
## 18 Alaska Delayed Seattle 305
## 19 AM West on time Seattle 201
## 20 AM West Delayed Seattle 61
airplanesnew2 <- as.data.frame(airplanesnew)
airplanesnew4 <- airplanesnew %>%
spread("Status", "Count")
names(airplanesnew4) <- c("Airline", "city", "Delayed", "Ontime")
airplanesnew4
## Airline city Delayed Ontime
## 1 Alaska LosAngeles 62 497
## 2 Alaska Phoenix 12 221
## 3 Alaska SanDiego 20 212
## 4 Alaska SanFrancisco 102 503
## 5 Alaska Seattle 305 1841
## 6 AM West LosAngeles 117 694
## 7 AM West Phoenix 415 4850
## 8 AM West SanDiego 65 383
## 9 AM West SanFrancisco 129 320
## 10 AM West Seattle 61 201
counts <- airplanesnew4 %>% group_by(Airline) %>% summarise(Totaldelayed = sum(Delayed), Totalontime = sum(Ontime))
counts
## # A tibble: 2 x 3
## Airline Totaldelayed Totalontime
## <chr> <int> <int>
## 1 Alaska 501 3274
## 2 AM West 787 6448
graph1 <- mutate(counts, Airline, Totaldelayed, Totalontime, Totalflights = Totaldelayed + Totalontime, percentdelayed = ((Totaldelayed/(Totalontime + Totaldelayed))*100), percentontime = ((Totalontime/(Totalontime + Totaldelayed))*100))
graph1
## # A tibble: 2 x 6
## Airline Totaldelayed Totalontime Totalflights percentdelayed
## <chr> <int> <int> <int> <dbl>
## 1 Alaska 501 3274 3775 13.27152
## 2 AM West 787 6448 7235 10.87768
## # ... with 1 more variables: percentontime <dbl>
Overall, AM West has a nigher number of delayed flights, but when looking at ratios, Alaska has the highest probability of having a flight delayed. AM West has almost double the total number of flights
library(ggplot2)
g <- ggplot(graph1, aes(x = Airline)) + geom_bar(aes(weight = Totaldelayed), fill="#DD8888") + ylab("Delayed") + ggtitle("Total Delayed Flights per Airline")
g
graph2 <- airplanesnew4 %>% group_by(Airline, city) %>% summarise(Totaldelayed = sum(Delayed), Totalontime = sum(Ontime)) %>%
mutate(city, Totaldelayed, Totalontime, percentdelayed = ((Totaldelayed/(Totalontime + Totaldelayed))*100), percentontime = ((Totalontime/(Totalontime + Totaldelayed))*100))
graph2
## # A tibble: 10 x 6
## # Groups: Airline [2]
## Airline city Totaldelayed Totalontime percentdelayed
## <chr> <chr> <int> <int> <dbl>
## 1 Alaska LosAngeles 62 497 11.091234
## 2 Alaska Phoenix 12 221 5.150215
## 3 Alaska SanDiego 20 212 8.620690
## 4 Alaska SanFrancisco 102 503 16.859504
## 5 Alaska Seattle 305 1841 14.212488
## 6 AM West LosAngeles 117 694 14.426634
## 7 AM West Phoenix 415 4850 7.882241
## 8 AM West SanDiego 65 383 14.508929
## 9 AM West SanFrancisco 129 320 28.730512
## 10 AM West Seattle 61 201 23.282443
## # ... with 1 more variables: percentontime <dbl>
g3 <- ggplot(graph2, aes(x = city, y = Totaldelayed)) + geom_bar(aes(fill= Airline), stat = "Identity", position=position_dodge()) + ylab("Delayed") + ggtitle("Total Delayed Flights per Airline")
g3
The highest number of delayed flights occur at Phoenix aiport by AM West Airlines
g1 <- ggplot(graph2, aes(x = city, y = percentdelayed)) + geom_bar(aes(fill= Airline), stat = "Identity", position=position_dodge()) + ylab("On Time") + ggtitle("Percent Delayed Flights per Airline")
g1
The highest probability of flights delayed occur at San Francisco airport by AM West