Step 1 - Importing the libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(stringr)
Step 2 - Importing the dataset
flightsdata <- read.csv("https://raw.githubusercontent.com/DevMeh/Assignment5/master/assign5.csv", sep = ",", header = TRUE)
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'https://
## raw.githubusercontent.com/DevMeh/Assignment5/master/assign5.csv'
head(flightsdata)
## Airline Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Step 3 - Tidying up the data
# Seperate the data by creating a city column and splitting on status
tidyflights <- rename(flightsdata, Airline = Airline)
tidyflights <- tidyflights %>%
gather(city, number, 3:length(flightsdata)) %>%
spread(Status, number)
# Remove underscore from city name
tidyflights$city <- str_replace_all(tidyflights$city, "\\_", " ")
# Renaming on time as on_time
tidyflights <- rename(tidyflights, on_time = `on time`)
#View the dataframe
head(tidyflights)
## Airline city delayed on_time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
Step 4 - Creating calcuated columns
- Created a column to display percentage of flights that were delayed, percenage of flights that were on time and total flights by city
tidyflights <- tidyflights %>%
mutate(total_flights=on_time+ delayed, pct_delay = round(100*(delayed/total_flights),2),pct_ontime=round(100*(on_time/total_flights),2)) %>%
arrange(desc(total_flights))
tidyflights
## Airline city delayed on_time total_flights pct_delay
## 1 AM WEST Phoenix 415 4840 5255 7.90
## 2 ALASKA Seattle 305 1841 2146 14.21
## 3 AM WEST Los Angeles 117 694 811 14.43
## 4 ALASKA San Francisco 102 503 605 16.86
## 5 ALASKA Los Angeles 62 497 559 11.09
## 6 AM WEST San Francisco 129 320 449 28.73
## 7 AM WEST San Diego 65 383 448 14.51
## 8 AM WEST Seattle 61 201 262 23.28
## 9 ALASKA Phoenix 12 221 233 5.15
## 10 ALASKA San Diego 20 212 232 8.62
## pct_ontime
## 1 92.10
## 2 85.79
## 3 85.57
## 4 83.14
## 5 88.91
## 6 71.27
## 7 85.49
## 8 76.72
## 9 94.85
## 10 91.38
Step 5 - Analysis
- Summarizing delays by city
delaybycity <- tidyflights %>%
group_by(city) %>%
summarise(delay_pct = mean(pct_delay)) %>%
arrange(desc(delay_pct))
delaybycity
## # A tibble: 5 x 2
## city delay_pct
## <chr> <dbl>
## 1 San Francisco 22.8
## 2 Seattle 18.7
## 3 Los Angeles 12.8
## 4 San Diego 11.6
## 5 Phoenix 6.52
- Summarizing flights that were ontime by Airline
ontimebyairline <- tidyflights %>%
group_by(Airline) %>%
summarise(ontime_pct = mean(pct_ontime)) %>%
arrange(desc(ontime_pct))
ontimebyairline
## # A tibble: 2 x 2
## Airline ontime_pct
## <fct> <dbl>
## 1 ALASKA 88.8
## 2 AM WEST 82.2
- Summarizing flights that delayed by city by airline
delaybyairline <- tidyflights %>%
group_by(city,Airline) %>%
summarise(delay_pct = mean(pct_delay)) %>%
arrange(city,delay_pct)
delaybyairline
## # A tibble: 10 x 3
## # Groups: city [5]
## city Airline delay_pct
## <chr> <fct> <dbl>
## 1 Los Angeles ALASKA 11.1
## 2 Los Angeles AM WEST 14.4
## 3 Phoenix ALASKA 5.15
## 4 Phoenix AM WEST 7.9
## 5 San Diego ALASKA 8.62
## 6 San Diego AM WEST 14.5
## 7 San Francisco ALASKA 16.9
## 8 San Francisco AM WEST 28.7
## 9 Seattle ALASKA 14.2
## 10 Seattle AM WEST 23.3
Step 6 - Visualization
- Visualizing percentage of flights delayed by city by airline
ggplot(data=tidyflights, aes(x=city, y=pct_delay,fill=Airline))+
geom_bar(stat="identity", position=position_dodge(), colour="black") +
geom_text(aes(label=pct_delay), vjust=.5, hjust=1.25,position= position_dodge(width=0.9), color="black") +
ggtitle("Flights Delayed by Airline by City") +
xlab("City") + ylab("% of Flights Delayed") +coord_flip()
