Libaries used: library(knitr) library(png) library(tidyr) library(dplyr)

Tidying and transforming Data

The chart below describes arrival delays for two airlines across five destinations.

Your task is to:

  1. Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above.

You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

  1. Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

  2. Perform analysis to compare the arrival delays for the two airlines.

  3. Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions.

imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/4.png"
include_graphics(imgage)

DataFrame Creation

flight_times <- rbind(c(NA, NA, "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
             c("ALASKA", "On Time", 497, 221, 212, 503, 1841),
             c(NA, "Delayed", 62, 12, 20, 102, 305),
             c("AM WEST", "On Time", 694, 4840, 383, 320, 201),
             c(NA, "Delayed", 117, 415, 65, 129, 61))

Local .cvs Export

write.csv(flight_times,'C:/Users/jpsim/Documents/DATA Acquisition and Management/flight_times.csv', row.names = FALSE)

GitHub Data Load

raw_data <- read.csv(paste0("https://raw.githubusercontent.com/josephsimone/DATA607/master/flight_times.csv"), 
                             stringsAsFactors = F); raw_data
##       NA.   NA..1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA On Time         497     221       212           503    1841
## 2    <NA> Delayed          62      12        20           102     305
## 3 AM WEST On Time         694    4840       383           320     201
## 4    <NA> Delayed         117     415        65           129      61

Tidying the dataset with the use of tidyr

 clean_data <- raw_data %>%
  mutate(Airlines1 = NA., Airlines2=lag(NA.)) %>% 
  mutate(Airline = coalesce(Airlines1,Airlines2), Status = NA..1)  %>%
  gather("Destination", "Flights", 3:7) %>% 
  select(Airline:Flights) %>% 
  arrange(Airline, desc(Status), Destination); clean_data
##    Airline  Status   Destination Flights
## 1   ALASKA On Time   Los.Angeles     497
## 2   ALASKA On Time       Phoenix     221
## 3   ALASKA On Time     San.Diego     212
## 4   ALASKA On Time San.Francisco     503
## 5   ALASKA On Time       Seattle    1841
## 6   ALASKA Delayed   Los.Angeles      62
## 7   ALASKA Delayed       Phoenix      12
## 8   ALASKA Delayed     San.Diego      20
## 9   ALASKA Delayed San.Francisco     102
## 10  ALASKA Delayed       Seattle     305
## 11 AM WEST On Time   Los.Angeles     694
## 12 AM WEST On Time       Phoenix    4840
## 13 AM WEST On Time     San.Diego     383
## 14 AM WEST On Time San.Francisco     320
## 15 AM WEST On Time       Seattle     201
## 16 AM WEST Delayed   Los.Angeles     117
## 17 AM WEST Delayed       Phoenix     415
## 18 AM WEST Delayed     San.Diego      65
## 19 AM WEST Delayed San.Francisco     129
## 20 AM WEST Delayed       Seattle      61

Use the dplyr package to transform the data for analysis

 flights_aggr_df <- clean_data %>% select(Airline, Status, Flights) %>% 
      group_by(Airline, Status) %>% 
      summarise(total = sum(Flights)) %>% 
      mutate(percent           = round(total/sum(total) * 100, 2),
             percent_formatted = paste0(round(total/sum(total) * 100, 1), "%"),
             total_flights     = sum(total)) 

Creation of new DataFram

tidy_df <- as.data.frame(flights_aggr_df)
tidy_df
##   Airline  Status total percent percent_formatted total_flights
## 1  ALASKA Delayed   501   13.27             13.3%          3775
## 2  ALASKA On Time  3274   86.73             86.7%          3775
## 3 AM WEST Delayed   787   10.89             10.9%          7225
## 4 AM WEST On Time  6438   89.11             89.1%          7225

Vizualization of Data using GGPlot2

tidy_df %>% filter(Status == "Delayed") %>%
    ggplot( aes(x=Airline, y=percent, fill=Airline)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black", width = 0.5) +
    coord_flip() + 
    ggtitle("Overall Percentage of Flights Delayed by Airline") +
    xlab("Airline") + ylab("Percentage of Flights Delayed") +
    geom_text(aes(label=paste(percent_formatted, "\n n =", total_flights)), vjust=0.5, hjust=1.1,color="black")

flights_aggr_df <- clean_data %>% select(Airline, Status, Flights, Destination) %>% 
      group_by(Airline, Status, Destination) %>% 
      summarise(total = sum(Flights)) %>% 
      mutate(percent           = round(total/sum(total) * 100, 2),
            percent_formatted = paste0(round(total/sum(total) * 100, 1), "%"),
            total_flights     = sum(total)) 
flights_by_city_df <- flights_aggr_df %>%
group_by(Airline, Destination) %>% 
    mutate(percent = round(total/sum(total)*100, 2),
           percent_formatted=paste0(round(total/sum(total)*100, 1), "%")) 
flights_by_city_df <- as.data.frame(flights_by_city_df)
flights_by_city_df
##    Airline  Status   Destination total percent percent_formatted
## 1   ALASKA Delayed   Los.Angeles    62   11.09             11.1%
## 2   ALASKA Delayed       Phoenix    12    5.15              5.2%
## 3   ALASKA Delayed     San.Diego    20    8.62              8.6%
## 4   ALASKA Delayed San.Francisco   102   16.86             16.9%
## 5   ALASKA Delayed       Seattle   305   14.21             14.2%
## 6   ALASKA On Time   Los.Angeles   497   88.91             88.9%
## 7   ALASKA On Time       Phoenix   221   94.85             94.8%
## 8   ALASKA On Time     San.Diego   212   91.38             91.4%
## 9   ALASKA On Time San.Francisco   503   83.14             83.1%
## 10  ALASKA On Time       Seattle  1841   85.79             85.8%
## 11 AM WEST Delayed   Los.Angeles   117   14.43             14.4%
## 12 AM WEST Delayed       Phoenix   415    7.90              7.9%
## 13 AM WEST Delayed     San.Diego    65   14.51             14.5%
## 14 AM WEST Delayed San.Francisco   129   28.73             28.7%
## 15 AM WEST Delayed       Seattle    61   23.28             23.3%
## 16 AM WEST On Time   Los.Angeles   694   85.57             85.6%
## 17 AM WEST On Time       Phoenix  4840   92.10             92.1%
## 18 AM WEST On Time     San.Diego   383   85.49             85.5%
## 19 AM WEST On Time San.Francisco   320   71.27             71.3%
## 20 AM WEST On Time       Seattle   201   76.72             76.7%
##    total_flights
## 1            501
## 2            501
## 3            501
## 4            501
## 5            501
## 6           3274
## 7           3274
## 8           3274
## 9           3274
## 10          3274
## 11           787
## 12           787
## 13           787
## 14           787
## 15           787
## 16          6438
## 17          6438
## 18          6438
## 19          6438
## 20          6438
flights_by_city_df %>% filter(Status == "Delayed") %>%
ggplot( aes(x=Destination, y=percent, fill=Airline)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black") +
    geom_text(aes(label=percent_formatted), vjust=.5, hjust=1,position= position_dodge(width=0.9),  color="black") +
        ggtitle("Percentage of Flights Delayed by Airline by City") +
    xlab("Destination") + ylab("Percentage of Flights Delayed") +
        coord_flip()