Assignment 607_Homework-5: Tidy and Transform data

Summary of Assignment This assignment involves Tidying and Transforming flight data

This Assignment requires the following:

1). R-Studio

The following R-packages are used: 1.tidyr 2.dplyr 3.ggplot2

Steps to reproduce: 1). run the R-Studio file: R_607_Week_5a_Hmk_Tidy_Transform_Data_Daniel_Thonn.Rmd

Setting up and Preparing the Environment

#install.packages("stringr")
#install.packages("tidyr")
#install.packages("dplyr")
#install.packages("plyr")
#library(stringr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#library(plyr)
#detach("package:plyr", unload=TRUE)
library(ggplot2)

Load the character names data into an R dataframe, cleanup, and convert to long format

#Load data from a text file
#data_1 <- readLines("C:/mysqldata/tournamentinfo.txt") 
#head(data_1)

# Load the data csv file to a vector
a <- read.csv(
              "C:/mysqldata/Arrivals_Airlines.csv",
              sep=",",
              na.strings = "",
              blank.lines.skip = TRUE,
                stringsAsFactors=FALSE)
list(a)
## [[1]]
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        12           102     305
## 3    <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61
# Convert the input vector to a dataframe
df_arrivals_1 = data.frame(a)
list(df_arrivals_1)
## [[1]]
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        12           102     305
## 3    <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61
# Remove the empty row [3]
df_arrivals_2 <- df_arrivals_1[-3,]
list(df_arrivals_2)
## [[1]]
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        12           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61
names(df_arrivals_2)
## [1] "X"             "X.1"           "Los.Angeles"   "Phoenix"      
## [5] "San.Diego"     "San.Francisco" "Seattle"
# Label columns for Airlines and Status
df_arrivals_2b <- df_arrivals_2
names(df_arrivals_2b)[names(df_arrivals_2b) == "X"] <- "Airlines"
names(df_arrivals_2b)[names(df_arrivals_2b) == "X.1"] <- "Status"
list(df_arrivals_2b)
## [[1]]
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2     <NA> delayed          62      12        12           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5     <NA> delayed         117     415        65           129      61
# Remove NA's
df_arrivals_3 <- replace(df_arrivals_2b, is.na(df_arrivals_2), "") 
list(df_arrivals_3)
## [[1]]
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2          delayed          62      12        12           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5          delayed         117     415        65           129      61
df_arrivals_4 <- df_arrivals_3
names(df_arrivals_4)
## [1] "Airlines"      "Status"        "Los.Angeles"   "Phoenix"      
## [5] "San.Diego"     "San.Francisco" "Seattle"
list(df_arrivals_4)
## [[1]]
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2          delayed          62      12        12           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5          delayed         117     415        65           129      61
length(df_arrivals_4$Airlines)
## [1] 4
df_arrivals_5 <- df_arrivals_4
list(df_arrivals_5)
## [[1]]
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2          delayed          62      12        12           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5          delayed         117     415        65           129      61
# Fill in missing column names
for (i in 1:length(df_arrivals_5$Airlines))
{
  if (i %% 2){
    next
  }
  #print(df_arrivals_4$Airlines[i])
  df_arrivals_5$Airlines[i] <- paste(df_arrivals_4$Airlines[i],df_arrivals_4$Airlines[i-1])
}

list(df_arrivals_5)
## [[1]]
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        12           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5  AM WEST delayed         117     415        65           129      61
# Remove spaces from column Airlines values
df_arrivals_5$Airlines <- gsub(" ", "", df_arrivals_5$Airlines)
df_arrivals_5$Airlines
## [1] "ALASKA" "ALASKA" "AMWEST" "AMWEST"
list(df_arrivals_5)
## [[1]]
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        12           102     305
## 4   AMWEST on time         694    4840       383           320     201
## 5   AMWEST delayed         117     415        65           129      61
# Convert wide dataframe into long dataframe
df_arrivals_long <- gather(df_arrivals_5, City, NumFlights, Los.Angeles:Seattle) %>% mutate(City = gsub( "\\.", " ", City)) %>% arrange(Airlines)

#list(df_arrivals_long)

Use dplyer to analyze the flight numbers and plot the results

list(df_arrivals_long)
## [[1]]
##    Airlines  Status          City NumFlights
## 1    ALASKA on time   Los Angeles        497
## 2    ALASKA delayed   Los Angeles         62
## 3    ALASKA on time       Phoenix        221
## 4    ALASKA delayed       Phoenix         12
## 5    ALASKA on time     San Diego        212
## 6    ALASKA delayed     San Diego         12
## 7    ALASKA on time San Francisco        503
## 8    ALASKA delayed San Francisco        102
## 9    ALASKA on time       Seattle       1841
## 10   ALASKA delayed       Seattle        305
## 11   AMWEST on time   Los Angeles        694
## 12   AMWEST delayed   Los Angeles        117
## 13   AMWEST on time       Phoenix       4840
## 14   AMWEST delayed       Phoenix        415
## 15   AMWEST on time     San Diego        383
## 16   AMWEST delayed     San Diego         65
## 17   AMWEST on time San Francisco        320
## 18   AMWEST delayed San Francisco        129
## 19   AMWEST on time       Seattle        201
## 20   AMWEST delayed       Seattle         61
# Add a column for percent of flights delayed or ont-time relative to total number of flights
By_Airlines_df_0a <- mutate(df_arrivals_long,percent1= round(NumFlights/sum(NumFlights) * 100, 1))
By_Airlines_df_0a
##    Airlines  Status          City NumFlights percent1
## 1    ALASKA on time   Los Angeles        497      4.5
## 2    ALASKA delayed   Los Angeles         62      0.6
## 3    ALASKA on time       Phoenix        221      2.0
## 4    ALASKA delayed       Phoenix         12      0.1
## 5    ALASKA on time     San Diego        212      1.9
## 6    ALASKA delayed     San Diego         12      0.1
## 7    ALASKA on time San Francisco        503      4.6
## 8    ALASKA delayed San Francisco        102      0.9
## 9    ALASKA on time       Seattle       1841     16.7
## 10   ALASKA delayed       Seattle        305      2.8
## 11   AMWEST on time   Los Angeles        694      6.3
## 12   AMWEST delayed   Los Angeles        117      1.1
## 13   AMWEST on time       Phoenix       4840     44.0
## 14   AMWEST delayed       Phoenix        415      3.8
## 15   AMWEST on time     San Diego        383      3.5
## 16   AMWEST delayed     San Diego         65      0.6
## 17   AMWEST on time San Francisco        320      2.9
## 18   AMWEST delayed San Francisco        129      1.2
## 19   AMWEST on time       Seattle        201      1.8
## 20   AMWEST delayed       Seattle         61      0.6
# Setup group by criteria by Airlines
By_Airlines_df_1 <- group_by(By_Airlines_df_0a,Airlines,Status) 
By_Airlines_df_1
## Source: local data frame [20 x 5]
## Groups: Airlines, Status [4]
## 
##    Airlines  Status          City NumFlights percent1
##       <chr>   <chr>         <chr>      <int>    <dbl>
## 1    ALASKA on time   Los Angeles        497      4.5
## 2    ALASKA delayed   Los Angeles         62      0.6
## 3    ALASKA on time       Phoenix        221      2.0
## 4    ALASKA delayed       Phoenix         12      0.1
## 5    ALASKA on time     San Diego        212      1.9
## 6    ALASKA delayed     San Diego         12      0.1
## 7    ALASKA on time San Francisco        503      4.6
## 8    ALASKA delayed San Francisco        102      0.9
## 9    ALASKA on time       Seattle       1841     16.7
## 10   ALASKA delayed       Seattle        305      2.8
## 11   AMWEST on time   Los Angeles        694      6.3
## 12   AMWEST delayed   Los Angeles        117      1.1
## 13   AMWEST on time       Phoenix       4840     44.0
## 14   AMWEST delayed       Phoenix        415      3.8
## 15   AMWEST on time     San Diego        383      3.5
## 16   AMWEST delayed     San Diego         65      0.6
## 17   AMWEST on time San Francisco        320      2.9
## 18   AMWEST delayed San Francisco        129      1.2
## 19   AMWEST on time       Seattle        201      1.8
## 20   AMWEST delayed       Seattle         61      0.6
# Select columns for analysis dataframe by Airlines
By_Airlines_df_1a <- select(By_Airlines_df_1,Airlines,Status,NumFlights,percent1)
By_Airlines_df_1a
## Source: local data frame [20 x 4]
## Groups: Airlines, Status [4]
## 
##    Airlines  Status NumFlights percent1
##       <chr>   <chr>      <int>    <dbl>
## 1    ALASKA on time        497      4.5
## 2    ALASKA delayed         62      0.6
## 3    ALASKA on time        221      2.0
## 4    ALASKA delayed         12      0.1
## 5    ALASKA on time        212      1.9
## 6    ALASKA delayed         12      0.1
## 7    ALASKA on time        503      4.6
## 8    ALASKA delayed        102      0.9
## 9    ALASKA on time       1841     16.7
## 10   ALASKA delayed        305      2.8
## 11   AMWEST on time        694      6.3
## 12   AMWEST delayed        117      1.1
## 13   AMWEST on time       4840     44.0
## 14   AMWEST delayed        415      3.8
## 15   AMWEST on time        383      3.5
## 16   AMWEST delayed         65      0.6
## 17   AMWEST on time        320      2.9
## 18   AMWEST delayed        129      1.2
## 19   AMWEST on time        201      1.8
## 20   AMWEST delayed         61      0.6
# Present columns based on select and group-by criteria by Airlines
By_Airlines_df_3 <- summarise(By_Airlines_df_1a,SumNumFlights = sum(NumFlights),SumPercent = paste0(sum(percent1),"%"))
By_Airlines_df_3
## Source: local data frame [4 x 4]
## Groups: Airlines [?]
## 
##   Airlines  Status SumNumFlights SumPercent
##      <chr>   <chr>         <int>      <chr>
## 1   ALASKA delayed           493       4.5%
## 2   ALASKA on time          3274      29.7%
## 3   AMWEST delayed           787       7.3%
## 4   AMWEST on time          6438      58.5%
# Graph delayed flights per total flights by Airline
By_Airlines_df_3 %>% filter(Status == "delayed") %>%
    ggplot( aes(x=Airlines, y=SumPercent, fill=Airlines)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
    coord_flip() + 
    ggtitle("Percent of Delays per Total Flights by Airline") +
    xlab("Airline") + ylab("Percentage of Flights Delayed") +
    geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")

# Graph on-time flights per total flights by Airline
By_Airlines_df_3 %>% filter(Status == "on time") %>%
    ggplot( aes(x=Airlines, y=SumPercent, fill=Airlines)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
    coord_flip() + 
    ggtitle("Percent of On-Time Trips per Total Flights by Airline") +
    xlab("Airline") + ylab("Percentage of Flights On-Time") +
    geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")

# Setup group by criteria by Airlines and City
By_Airlines_df_1b <- group_by(By_Airlines_df_0a,Airlines,Status,City) 
By_Airlines_df_1b
## Source: local data frame [20 x 5]
## Groups: Airlines, Status, City [20]
## 
##    Airlines  Status          City NumFlights percent1
##       <chr>   <chr>         <chr>      <int>    <dbl>
## 1    ALASKA on time   Los Angeles        497      4.5
## 2    ALASKA delayed   Los Angeles         62      0.6
## 3    ALASKA on time       Phoenix        221      2.0
## 4    ALASKA delayed       Phoenix         12      0.1
## 5    ALASKA on time     San Diego        212      1.9
## 6    ALASKA delayed     San Diego         12      0.1
## 7    ALASKA on time San Francisco        503      4.6
## 8    ALASKA delayed San Francisco        102      0.9
## 9    ALASKA on time       Seattle       1841     16.7
## 10   ALASKA delayed       Seattle        305      2.8
## 11   AMWEST on time   Los Angeles        694      6.3
## 12   AMWEST delayed   Los Angeles        117      1.1
## 13   AMWEST on time       Phoenix       4840     44.0
## 14   AMWEST delayed       Phoenix        415      3.8
## 15   AMWEST on time     San Diego        383      3.5
## 16   AMWEST delayed     San Diego         65      0.6
## 17   AMWEST on time San Francisco        320      2.9
## 18   AMWEST delayed San Francisco        129      1.2
## 19   AMWEST on time       Seattle        201      1.8
## 20   AMWEST delayed       Seattle         61      0.6
# Select columns for analysis dataframe by Airlines and City
By_Airlines_df_1c <- select(By_Airlines_df_1b,Airlines,Status,City,NumFlights,percent1)
By_Airlines_df_1c
## Source: local data frame [20 x 5]
## Groups: Airlines, Status, City [20]
## 
##    Airlines  Status          City NumFlights percent1
##       <chr>   <chr>         <chr>      <int>    <dbl>
## 1    ALASKA on time   Los Angeles        497      4.5
## 2    ALASKA delayed   Los Angeles         62      0.6
## 3    ALASKA on time       Phoenix        221      2.0
## 4    ALASKA delayed       Phoenix         12      0.1
## 5    ALASKA on time     San Diego        212      1.9
## 6    ALASKA delayed     San Diego         12      0.1
## 7    ALASKA on time San Francisco        503      4.6
## 8    ALASKA delayed San Francisco        102      0.9
## 9    ALASKA on time       Seattle       1841     16.7
## 10   ALASKA delayed       Seattle        305      2.8
## 11   AMWEST on time   Los Angeles        694      6.3
## 12   AMWEST delayed   Los Angeles        117      1.1
## 13   AMWEST on time       Phoenix       4840     44.0
## 14   AMWEST delayed       Phoenix        415      3.8
## 15   AMWEST on time     San Diego        383      3.5
## 16   AMWEST delayed     San Diego         65      0.6
## 17   AMWEST on time San Francisco        320      2.9
## 18   AMWEST delayed San Francisco        129      1.2
## 19   AMWEST on time       Seattle        201      1.8
## 20   AMWEST delayed       Seattle         61      0.6
# Present columns based on select and group-by criteria by Airlines
By_Airlines_df_3b <- summarise(By_Airlines_df_1c,SumNumFlights = sum(NumFlights),SumPercent = paste0(sum(percent1),"%"))
By_Airlines_df_3b
## Source: local data frame [20 x 5]
## Groups: Airlines, Status [?]
## 
##    Airlines  Status          City SumNumFlights SumPercent
##       <chr>   <chr>         <chr>         <int>      <chr>
## 1    ALASKA delayed   Los Angeles            62       0.6%
## 2    ALASKA delayed       Phoenix            12       0.1%
## 3    ALASKA delayed     San Diego            12       0.1%
## 4    ALASKA delayed San Francisco           102       0.9%
## 5    ALASKA delayed       Seattle           305       2.8%
## 6    ALASKA on time   Los Angeles           497       4.5%
## 7    ALASKA on time       Phoenix           221         2%
## 8    ALASKA on time     San Diego           212       1.9%
## 9    ALASKA on time San Francisco           503       4.6%
## 10   ALASKA on time       Seattle          1841      16.7%
## 11   AMWEST delayed   Los Angeles           117       1.1%
## 12   AMWEST delayed       Phoenix           415       3.8%
## 13   AMWEST delayed     San Diego            65       0.6%
## 14   AMWEST delayed San Francisco           129       1.2%
## 15   AMWEST delayed       Seattle            61       0.6%
## 16   AMWEST on time   Los Angeles           694       6.3%
## 17   AMWEST on time       Phoenix          4840        44%
## 18   AMWEST on time     San Diego           383       3.5%
## 19   AMWEST on time San Francisco           320       2.9%
## 20   AMWEST on time       Seattle           201       1.8%
# Graph delayed flights per total flights by Airline and City
By_Airlines_df_3b %>% filter(Status == "delayed") %>%
    ggplot( aes(x=City, y=SumPercent, fill=Airlines)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
    coord_flip() + 
    ggtitle("Percent of Delays per Total Flight by Airline by City") +
    xlab("City") + ylab("Percentage of Flights Delayed") +
    geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")

# Graph on-time flights per total flights by Airline
By_Airlines_df_3b %>% filter(Status == "on time") %>%
    ggplot( aes(x=City, y=SumPercent, fill=Airlines)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
    coord_flip() + 
    ggtitle("Percent of On-Time Trips per Total Flights by Airline by City") +
    xlab("City") + ylab("Percentage of Flights On-Time") +
    geom_text(aes(label=paste(SumPercent)), vjust=0.5, hjust=1.1,color="black")

Conclusion: Overall AMWEST has the higher total proportion of delayed flights at 7.3% versus 4.5% for ALASKA. But AMWEST also had the higher number of on-time flights at 58.5% versus 29.7% for ALASKA. The number of delays versus on-time was 7.3/58.5 = 12.4% for AMWEST and 4.5/29.7 = 15.1% for ALASKA, so ALASKA has a higher relative proportion of delayed flights.

In analysis by City Seattle has the highest total proportion of delayed flights for ALASKA airlines (2.8%), and Phoenix the highest number of delays by AMWEST (3.8%).

By City Pheonix also has the highest total proportion of on-time flights at a total of 46% for both Airlines combined.

If we compare delays to on-time by city the breakdown of relative proportional delays is as follows:

Los Angeles: 1.7/4.8 = 35.4% Seattle: 3.4/11.5 = 29.6% San Francisco: 2.1/10.9 = 19.2% San Diego: .7/5.4 = 12.9% Phoenix: 3.9/46 = 8.4%

This shows the breakdown of relative delays versus on-time flights by city and shows Los Angeles is the highest proportion and Phoenix has the least delays for both airlines.

END