R Markdown
1. Import libraries
library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('scales')
## Warning: package 'scales' was built under R version 3.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
2. Get .csv file
csvfile <- read.csv("https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-5/master/AssignmentWeek5.csv", header=FALSE, sep=",", stringsAsFactors=FALSE)
csvfile <- data.frame(csvfile)
csvfile
## V1 V2 V3 V4 V5 V6
## 1 Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2 ALASKA on time 497 221 212 503
## 3 delayed 62 12 20 102
## 4
## 5 AM WEST on time 694 4,840 383 320
## 6 delayed 117 415 65 129
## V7
## 1 Seattle
## 2 1,841
## 3 305
## 4
## 5 201
## 6 61
3. Assign headers to ‘V1’ and ‘V2’ then copy the first row as headers, etc.
#Assign missing headers
csvfile$V1[1] <- "Airline"
csvfile$V2[1] <- "Status"
csvfile
## V1 V2 V3 V4 V5 V6
## 1 Airline Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2 ALASKA on time 497 221 212 503
## 3 delayed 62 12 20 102
## 4
## 5 AM WEST on time 694 4,840 383 320
## 6 delayed 117 415 65 129
## V7
## 1 Seattle
## 2 1,841
## 3 305
## 4
## 5 201
## 6 61
#Copy the first row as headers
names(csvfile) <- csvfile[1,]
csvfile
## Airline Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 1 Airline Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2 ALASKA on time 497 221 212 503
## 3 delayed 62 12 20 102
## 4
## 5 AM WEST on time 694 4,840 383 320
## 6 delayed 117 415 65 129
## Seattle
## 1 Seattle
## 2 1,841
## 3 305
## 4
## 5 201
## 6 61
#Remove the first row
csvfile <- csvfile[-c(1), ]
csvfile
## Airline Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2 ALASKA on time 497 221 212 503
## 3 delayed 62 12 20 102
## 4
## 5 AM WEST on time 694 4,840 383 320
## 6 delayed 117 415 65 129
## Seattle
## 2 1,841
## 3 305
## 4
## 5 201
## 6 61
#Remove the NA row as well
csvfile <- csvfile[c(1,2,4,5), ]
csvfile
## Airline Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2 ALASKA on time 497 221 212 503
## 3 delayed 62 12 20 102
## 5 AM WEST on time 694 4,840 383 320
## 6 delayed 117 415 65 129
## Seattle
## 2 1,841
## 3 305
## 5 201
## 6 61
#Fill in the Airline missing column data
for (i in 1:dim(csvfile)[1]){
if (i %% 2 == 0){
csvfile$Airline[i] <- csvfile$Airline[i-1]
}
}
csvfile
## Airline Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2 ALASKA on time 497 221 212 503
## 3 ALASKA delayed 62 12 20 102
## 5 AM WEST on time 694 4,840 383 320
## 6 AM WEST delayed 117 415 65 129
## Seattle
## 2 1,841
## 3 305
## 5 201
## 6 61
4. Some Analysis
Now, take the wide table and make it long: tidyr (Pivot), and additional cleanup
flights <- gather(csvfile, key = 'City', value = 'Flights', 3:7, convert=TRUE)
flights
## Airline Status City Flights
## 1 ALASKA on time Los \nAngeles 497
## 2 ALASKA delayed Los \nAngeles 62
## 3 AM WEST on time Los \nAngeles 694
## 4 AM WEST delayed Los \nAngeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4,840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San \nDiego 212
## 10 ALASKA delayed San \nDiego 20
## 11 AM WEST on time San \nDiego 383
## 12 AM WEST delayed San \nDiego 65
## 13 ALASKA on time San \nFrancisco 503
## 14 ALASKA delayed San \nFrancisco 102
## 15 AM WEST on time San \nFrancisco 320
## 16 AM WEST delayed San \nFrancisco 129
## 17 ALASKA on time Seattle 1,841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
#Remove new line character from city names and remove commas from number of flights
flights$City <- str_replace_all(flights$City, "[\r\n]" , "")
flights
## Airline Status City Flights
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4,840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1,841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
flights$Flights <- str_replace_all(flights$Flights, "[,]" , "")
flights
## Airline Status City Flights
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
#Now, separate out the status column into On-time and Delayed
flights2 <- flights %>% spread(Status, `Flights`, convert=TRUE)
flights2
## Airline City delayed on time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
#Now, add some percentages for On-Time and Delayed
flights2 <- flights2 %>%
mutate(total = delayed + `on time`,
OnTime = percent(`on time`/total*100, accuracy = .01, scale=1),
Delayed = percent(delayed/total*100, accuracy = .01, scale=1))
#What is the percentage for on-time versus delayed
flights2 %>%
kable() %>%
kable_styling()
|
Airline
|
City
|
delayed
|
on time
|
total
|
OnTime
|
Delayed
|
|
ALASKA
|
Los Angeles
|
62
|
497
|
559
|
88.91%
|
11.09%
|
|
ALASKA
|
Phoenix
|
12
|
221
|
233
|
94.85%
|
5.15%
|
|
ALASKA
|
San Diego
|
20
|
212
|
232
|
91.38%
|
8.62%
|
|
ALASKA
|
San Francisco
|
102
|
503
|
605
|
83.14%
|
16.86%
|
|
ALASKA
|
Seattle
|
305
|
1841
|
2146
|
85.79%
|
14.21%
|
|
AM WEST
|
Los Angeles
|
117
|
694
|
811
|
85.57%
|
14.43%
|
|
AM WEST
|
Phoenix
|
415
|
4840
|
5255
|
92.10%
|
7.90%
|
|
AM WEST
|
San Diego
|
65
|
383
|
448
|
85.49%
|
14.51%
|
|
AM WEST
|
San Francisco
|
129
|
320
|
449
|
71.27%
|
28.73%
|
|
AM WEST
|
Seattle
|
61
|
201
|
262
|
76.72%
|
23.28%
|
#Reference: http://www.sthda.com/english/wiki/ggplot2-barplots-quick-start-guide-r-software-and-data-visualization
#What is the total flights for each airline?
ggplot(flights2,
aes(x=flights2$Airline,
y=flights2$total
))+
geom_bar(stat="identity")

#What is the total flights for each citry?
ggplot(flights2,
aes(x=flights2$City,
y=flights2$total, fill = flights2$Airline
))+
geom_bar(stat="identity", position="dodge")

#What is the percentage delay for all cities per airline?
ggplot(flights2,
aes(x=flights2$City,
y=flights2$Delayed, fill = flights2$Airline
))+
geom_bar(stat="identity", position="dodge")
#5. Final Analysis
#What is the best On-time percentage rate?
flights3 <- flights2 %>% filter(flights2$OnTime == max(flights2$OnTime))
flights3%>%
kable() %>%
kable_styling()
|
Airline
|
City
|
delayed
|
on time
|
total
|
OnTime
|
Delayed
|
|
ALASKA
|
Phoenix
|
12
|
221
|
233
|
94.85%
|
5.15%
|
#What is the best On-time arrival rate?
flights4 <- flights2 %>%
group_by(Airline) %>%
summarise(Delayed_Flights = sum(delayed), Total_Flights = sum(total)) %>%
mutate(Delayed = (Delayed_Flights / Total_Flights)*100)
flights4%>%
kable() %>%
kable_styling()
|
Airline
|
Delayed_Flights
|
Total_Flights
|
Delayed
|
|
ALASKA
|
501
|
3775
|
13.27152
|
|
AM WEST
|
787
|
7225
|
10.89273
|
#What are the ariline delays as percentage?
ggplot(flights4,
aes(x=flights4$Airline,
y=flights4$Delayed, fill = flights4$Airline
))+
geom_bar(stat="identity", position="dodge")

#What are the airline total flights?
ggplot(flights4,
aes(x=flights4$Airline,
y=flights4$Total_Flights, fill = flights4$Airline
))+
geom_bar(stat="identity", position="dodge")

#What are the airline total delayed flights?
ggplot(flights4,
aes(x=flights4$Airline,
y=flights4$Delayed_Flights, fill = flights4$Airline
))+
geom_bar(stat="identity", position="dodge")

6. Conclusions
As indicated by the above graphs, AM West had many more flights than Alaska. The best arrival time was by Alaska to Phoneix where their on-time is a 94.85%. Alaska’s overall delay is at 13.27% compared with 10.89% with AM West. Lastly, for each city, AM West has a larger percentage delay than Alaska.