Summary of Assignment This assignment involves Tidying and Transforming flight data
This Assignment requires the following:
1). R-Studio
The following R-packages are used: 1.tidyr 2.dplyr 3.ggplot2
Steps to reproduce: 1). run the R-Studio file: R_607_Week_5a_Hmk_Tidy_Transform_Data_Daniel_Thonn.Rmd
Setting up and Preparing the Environment
#install.packages("stringr")
#install.packages("tidyr")
#install.packages("dplyr")
#install.packages("plyr")
#library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#library(plyr)
#detach("package:plyr", unload=TRUE)
library(ggplot2)
Load the character names data into an R dataframe, cleanup, and convert to long format
#Load data from a text file
#data_1 <- readLines("C:/mysqldata/tournamentinfo.txt")
#head(data_1)
# Load the data csv file to a vector
a <- read.csv(
"C:/mysqldata/Arrivals_Airlines.csv",
sep=",",
na.strings = "",
blank.lines.skip = TRUE,
stringsAsFactors=FALSE)
list(a)
## [[1]]
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 12 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
# Convert the input vector to a dataframe
df_arrivals_1 = data.frame(a)
list(df_arrivals_1)
## [[1]]
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 12 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
# Remove the empty row [3]
df_arrivals_2 <- df_arrivals_1[-3,]
list(df_arrivals_2)
## [[1]]
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 12 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
names(df_arrivals_2)
## [1] "X" "X.1" "Los.Angeles" "Phoenix"
## [5] "San.Diego" "San.Francisco" "Seattle"
# Label columns for Airlines and Status
df_arrivals_2b <- df_arrivals_2
names(df_arrivals_2b)[names(df_arrivals_2b) == "X"] <- "Airlines"
names(df_arrivals_2b)[names(df_arrivals_2b) == "X.1"] <- "Status"
list(df_arrivals_2b)
## [[1]]
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 12 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
# Remove NA's
df_arrivals_3 <- replace(df_arrivals_2b, is.na(df_arrivals_2), "")
list(df_arrivals_3)
## [[1]]
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 12 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
df_arrivals_4 <- df_arrivals_3
names(df_arrivals_4)
## [1] "Airlines" "Status" "Los.Angeles" "Phoenix"
## [5] "San.Diego" "San.Francisco" "Seattle"
list(df_arrivals_4)
## [[1]]
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 12 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
length(df_arrivals_4$Airlines)
## [1] 4
df_arrivals_5 <- df_arrivals_4
list(df_arrivals_5)
## [[1]]
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 12 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Fill in missing column names
for (i in 1:length(df_arrivals_5$Airlines))
{
if (i %% 2){
next
}
#print(df_arrivals_4$Airlines[i])
df_arrivals_5$Airlines[i] <- paste(df_arrivals_4$Airlines[i],df_arrivals_4$Airlines[i-1])
}
list(df_arrivals_5)
## [[1]]
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 12 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
# Remove spaces from column Airlines values
df_arrivals_5$Airlines <- gsub(" ", "", df_arrivals_5$Airlines)
df_arrivals_5$Airlines
## [1] "ALASKA" "ALASKA" "AMWEST" "AMWEST"
list(df_arrivals_5)
## [[1]]
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 12 102 305
## 4 AMWEST on time 694 4840 383 320 201
## 5 AMWEST delayed 117 415 65 129 61
# Convert wide dataframe into long dataframe
df_arrivals_long <- gather(df_arrivals_5, City, NumFlights, Los.Angeles:Seattle) %>% mutate(City = gsub( "\\.", " ", City)) %>% arrange(Airlines)
#list(df_arrivals_long)
Use dplyer to analyze the flight numbers and plot the results
list(df_arrivals_long)
## [[1]]
## Airlines Status City NumFlights
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 ALASKA on time Phoenix 221
## 4 ALASKA delayed Phoenix 12
## 5 ALASKA on time San Diego 212
## 6 ALASKA delayed San Diego 12
## 7 ALASKA on time San Francisco 503
## 8 ALASKA delayed San Francisco 102
## 9 ALASKA on time Seattle 1841
## 10 ALASKA delayed Seattle 305
## 11 AMWEST on time Los Angeles 694
## 12 AMWEST delayed Los Angeles 117
## 13 AMWEST on time Phoenix 4840
## 14 AMWEST delayed Phoenix 415
## 15 AMWEST on time San Diego 383
## 16 AMWEST delayed San Diego 65
## 17 AMWEST on time San Francisco 320
## 18 AMWEST delayed San Francisco 129
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
# Add a column for percent of flights delayed or ont-time relative to total number of flights
By_Airlines_df_0a <- mutate(df_arrivals_long,percent1= round(NumFlights/sum(NumFlights) * 100, 1))
By_Airlines_df_0a
## Airlines Status City NumFlights percent1
## 1 ALASKA on time Los Angeles 497 4.5
## 2 ALASKA delayed Los Angeles 62 0.6
## 3 ALASKA on time Phoenix 221 2.0
## 4 ALASKA delayed Phoenix 12 0.1
## 5 ALASKA on time San Diego 212 1.9
## 6 ALASKA delayed San Diego 12 0.1
## 7 ALASKA on time San Francisco 503 4.6
## 8 ALASKA delayed San Francisco 102 0.9
## 9 ALASKA on time Seattle 1841 16.7
## 10 ALASKA delayed Seattle 305 2.8
## 11 AMWEST on time Los Angeles 694 6.3
## 12 AMWEST delayed Los Angeles 117 1.1
## 13 AMWEST on time Phoenix 4840 44.0
## 14 AMWEST delayed Phoenix 415 3.8
## 15 AMWEST on time San Diego 383 3.5
## 16 AMWEST delayed San Diego 65 0.6
## 17 AMWEST on time San Francisco 320 2.9
## 18 AMWEST delayed San Francisco 129 1.2
## 19 AMWEST on time Seattle 201 1.8
## 20 AMWEST delayed Seattle 61 0.6
# Setup group by criteria by Airlines
By_Airlines_df_1 <- group_by(By_Airlines_df_0a,Airlines,Status)
By_Airlines_df_1
## Source: local data frame [20 x 5]
## Groups: Airlines, Status [4]
##
## Airlines Status City NumFlights percent1
## <chr> <chr> <chr> <int> <dbl>
## 1 ALASKA on time Los Angeles 497 4.5
## 2 ALASKA delayed Los Angeles 62 0.6
## 3 ALASKA on time Phoenix 221 2.0
## 4 ALASKA delayed Phoenix 12 0.1
## 5 ALASKA on time San Diego 212 1.9
## 6 ALASKA delayed San Diego 12 0.1
## 7 ALASKA on time San Francisco 503 4.6
## 8 ALASKA delayed San Francisco 102 0.9
## 9 ALASKA on time Seattle 1841 16.7
## 10 ALASKA delayed Seattle 305 2.8
## 11 AMWEST on time Los Angeles 694 6.3
## 12 AMWEST delayed Los Angeles 117 1.1
## 13 AMWEST on time Phoenix 4840 44.0
## 14 AMWEST delayed Phoenix 415 3.8
## 15 AMWEST on time San Diego 383 3.5
## 16 AMWEST delayed San Diego 65 0.6
## 17 AMWEST on time San Francisco 320 2.9
## 18 AMWEST delayed San Francisco 129 1.2
## 19 AMWEST on time Seattle 201 1.8
## 20 AMWEST delayed Seattle 61 0.6
# Select columns for analysis dataframe by Airlines
By_Airlines_df_1a <- select(By_Airlines_df_1,Airlines,Status,NumFlights,percent1)
By_Airlines_df_1a
## Source: local data frame [20 x 4]
## Groups: Airlines, Status [4]
##
## Airlines Status NumFlights percent1
## <chr> <chr> <int> <dbl>
## 1 ALASKA on time 497 4.5
## 2 ALASKA delayed 62 0.6
## 3 ALASKA on time 221 2.0
## 4 ALASKA delayed 12 0.1
## 5 ALASKA on time 212 1.9
## 6 ALASKA delayed 12 0.1
## 7 ALASKA on time 503 4.6
## 8 ALASKA delayed 102 0.9
## 9 ALASKA on time 1841 16.7
## 10 ALASKA delayed 305 2.8
## 11 AMWEST on time 694 6.3
## 12 AMWEST delayed 117 1.1
## 13 AMWEST on time 4840 44.0
## 14 AMWEST delayed 415 3.8
## 15 AMWEST on time 383 3.5
## 16 AMWEST delayed 65 0.6
## 17 AMWEST on time 320 2.9
## 18 AMWEST delayed 129 1.2
## 19 AMWEST on time 201 1.8
## 20 AMWEST delayed 61 0.6
# Present columns based on select and group-by criteria by Airlines
By_Airlines_df_3 <- summarise(By_Airlines_df_1a,SumNumFlights = sum(NumFlights),SumPercent = paste0(sum(percent1),"%"))
By_Airlines_df_3
## Source: local data frame [4 x 4]
## Groups: Airlines [?]
##
## Airlines Status SumNumFlights SumPercent
## <chr> <chr> <int> <chr>
## 1 ALASKA delayed 493 4.5%
## 2 ALASKA on time 3274 29.7%
## 3 AMWEST delayed 787 7.3%
## 4 AMWEST on time 6438 58.5%
# Graph delayed flights per total flights by Airline
By_Airlines_df_3 %>% filter(Status == "delayed") %>%
ggplot( aes(x=Airlines, y=SumPercent, fill=Airlines)) +
geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
coord_flip() +
ggtitle("Percent of Delays per Total Flights by Airline") +
xlab("Airline") + ylab("Percentage of Flights Delayed") +
geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")
# Graph on-time flights per total flights by Airline
By_Airlines_df_3 %>% filter(Status == "on time") %>%
ggplot( aes(x=Airlines, y=SumPercent, fill=Airlines)) +
geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
coord_flip() +
ggtitle("Percent of On-Time Trips per Total Flights by Airline") +
xlab("Airline") + ylab("Percentage of Flights On-Time") +
geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")
# Setup group by criteria by Airlines and City
By_Airlines_df_1b <- group_by(By_Airlines_df_0a,Airlines,Status,City)
By_Airlines_df_1b
## Source: local data frame [20 x 5]
## Groups: Airlines, Status, City [20]
##
## Airlines Status City NumFlights percent1
## <chr> <chr> <chr> <int> <dbl>
## 1 ALASKA on time Los Angeles 497 4.5
## 2 ALASKA delayed Los Angeles 62 0.6
## 3 ALASKA on time Phoenix 221 2.0
## 4 ALASKA delayed Phoenix 12 0.1
## 5 ALASKA on time San Diego 212 1.9
## 6 ALASKA delayed San Diego 12 0.1
## 7 ALASKA on time San Francisco 503 4.6
## 8 ALASKA delayed San Francisco 102 0.9
## 9 ALASKA on time Seattle 1841 16.7
## 10 ALASKA delayed Seattle 305 2.8
## 11 AMWEST on time Los Angeles 694 6.3
## 12 AMWEST delayed Los Angeles 117 1.1
## 13 AMWEST on time Phoenix 4840 44.0
## 14 AMWEST delayed Phoenix 415 3.8
## 15 AMWEST on time San Diego 383 3.5
## 16 AMWEST delayed San Diego 65 0.6
## 17 AMWEST on time San Francisco 320 2.9
## 18 AMWEST delayed San Francisco 129 1.2
## 19 AMWEST on time Seattle 201 1.8
## 20 AMWEST delayed Seattle 61 0.6
# Select columns for analysis dataframe by Airlines and City
By_Airlines_df_1c <- select(By_Airlines_df_1b,Airlines,Status,City,NumFlights,percent1)
By_Airlines_df_1c
## Source: local data frame [20 x 5]
## Groups: Airlines, Status, City [20]
##
## Airlines Status City NumFlights percent1
## <chr> <chr> <chr> <int> <dbl>
## 1 ALASKA on time Los Angeles 497 4.5
## 2 ALASKA delayed Los Angeles 62 0.6
## 3 ALASKA on time Phoenix 221 2.0
## 4 ALASKA delayed Phoenix 12 0.1
## 5 ALASKA on time San Diego 212 1.9
## 6 ALASKA delayed San Diego 12 0.1
## 7 ALASKA on time San Francisco 503 4.6
## 8 ALASKA delayed San Francisco 102 0.9
## 9 ALASKA on time Seattle 1841 16.7
## 10 ALASKA delayed Seattle 305 2.8
## 11 AMWEST on time Los Angeles 694 6.3
## 12 AMWEST delayed Los Angeles 117 1.1
## 13 AMWEST on time Phoenix 4840 44.0
## 14 AMWEST delayed Phoenix 415 3.8
## 15 AMWEST on time San Diego 383 3.5
## 16 AMWEST delayed San Diego 65 0.6
## 17 AMWEST on time San Francisco 320 2.9
## 18 AMWEST delayed San Francisco 129 1.2
## 19 AMWEST on time Seattle 201 1.8
## 20 AMWEST delayed Seattle 61 0.6
# Present columns based on select and group-by criteria by Airlines
By_Airlines_df_3b <- summarise(By_Airlines_df_1c,SumNumFlights = sum(NumFlights),SumPercent = paste0(sum(percent1),"%"))
By_Airlines_df_3b
## Source: local data frame [20 x 5]
## Groups: Airlines, Status [?]
##
## Airlines Status City SumNumFlights SumPercent
## <chr> <chr> <chr> <int> <chr>
## 1 ALASKA delayed Los Angeles 62 0.6%
## 2 ALASKA delayed Phoenix 12 0.1%
## 3 ALASKA delayed San Diego 12 0.1%
## 4 ALASKA delayed San Francisco 102 0.9%
## 5 ALASKA delayed Seattle 305 2.8%
## 6 ALASKA on time Los Angeles 497 4.5%
## 7 ALASKA on time Phoenix 221 2%
## 8 ALASKA on time San Diego 212 1.9%
## 9 ALASKA on time San Francisco 503 4.6%
## 10 ALASKA on time Seattle 1841 16.7%
## 11 AMWEST delayed Los Angeles 117 1.1%
## 12 AMWEST delayed Phoenix 415 3.8%
## 13 AMWEST delayed San Diego 65 0.6%
## 14 AMWEST delayed San Francisco 129 1.2%
## 15 AMWEST delayed Seattle 61 0.6%
## 16 AMWEST on time Los Angeles 694 6.3%
## 17 AMWEST on time Phoenix 4840 44%
## 18 AMWEST on time San Diego 383 3.5%
## 19 AMWEST on time San Francisco 320 2.9%
## 20 AMWEST on time Seattle 201 1.8%
# Graph delayed flights per total flights by Airline and City
By_Airlines_df_3b %>% filter(Status == "delayed") %>%
ggplot( aes(x=City, y=SumPercent, fill=Airlines)) +
geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
coord_flip() +
ggtitle("Percent of Delays per Total Flight by Airline by City") +
xlab("City") + ylab("Percentage of Flights Delayed") +
geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")
# Graph on-time flights per total flights by Airline
By_Airlines_df_3b %>% filter(Status == "on time") %>%
ggplot( aes(x=City, y=SumPercent, fill=Airlines)) +
geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
coord_flip() +
ggtitle("Percent of On-Time Trips per Total Flights by Airline by City") +
xlab("City") + ylab("Percentage of Flights On-Time") +
geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")
Conclusion: Overall AMWEST has the higher total proportion of delayed flights at 7.3% versus 4.5% for ALASKA. But AMWEST also had the higher number of on-time flights at 58.5% versus 29.7% for ALASKA. The number of delays versus on-time was 7.3/58.5 = 12.4% for AMWEST and 4.5/29.7 = 15.1% for ALASKA, so ALASKA has a higher relative proportion of delayed flights.
In analysis by City Seattle has the highest total proportion of delayed flights for ALASKA airlines (2.8%), and Phoenix the highest number of delays by AMWEST (3.8%).
By City Pheonix also has the highest total proportion of on-time flights at a total of 46% for both Airlines combined.
If we compare delays to on-time by city the breakdown of relative proportional delays is as follows:
Los Angeles: 1.7/4.8 = 35.4% Seattle: 3.4/11.5 = 29.6% San Francisco: 2.1/10.9 = 19.2% San Diego: .7/5.4 = 12.9% Phoenix: 3.9/46 = 8.4%
This shows the breakdown of relative delays versus on-time flights by city and shows Los Angeles is the highest proportion and Phoenix has the least delays for both airlines.