1 Pre-Requistes : Available Libraries

  • knitr
  • tidyr
  • DT
  • dplyr
  • sqldf
  • ggplot2

2 Assignment Tidying and Transforming Data

knitr::include_graphics("Assignment Tidying and Transforming Data.pdf")
Assignment Tidying and Transforming Data

Assignment Tidying and Transforming Data

3 Read File into R from local working directory

##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503  1,841 
## 2         delayed          62      12        20           102     305
## 3                          NA                NA            NA        
## 4 AM WEST on time         694  4,840        383           320     201
## 5         delayed         117     415        65           129      61

4 Data Scrubbing

4.1 Set the header

##   Airline FlightStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA      on time         497     221       212           503  1,841 
## 2              delayed          62      12        20           102     305
## 3                               NA                NA            NA        
## 4 AM WEST      on time         694  4,840        383           320     201
## 5              delayed         117     415        65           129      61

4.2 Remove the " " lines.

##   Airline FlightStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA      on time         497     221       212           503  1,841 
## 2              delayed          62      12        20           102     305
## 4 AM WEST      on time         694  4,840        383           320     201
## 5              delayed         117     415        65           129      61

4.3 Add missing data

##   Airline FlightStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA      on time         497     221       212           503  1,841 
## 2  ALASKA      delayed          62      12        20           102     305
## 4 AM WEST      on time         694  4,840        383           320     201
## 5 AM WEST      delayed         117     415        65           129      61

4.4 Pivot data

pivot.raw.data <- tidyr::gather(raw.data, "City", "Count", 3:7)
pivot.raw.data
##    Airline FlightStatus          City   Count
## 1   ALASKA      on time   Los.Angeles     497
## 2   ALASKA      delayed   Los.Angeles      62
## 3  AM WEST      on time   Los.Angeles     694
## 4  AM WEST      delayed   Los.Angeles     117
## 5   ALASKA      on time       Phoenix     221
## 6   ALASKA      delayed       Phoenix      12
## 7  AM WEST      on time       Phoenix  4,840 
## 8  AM WEST      delayed       Phoenix     415
## 9   ALASKA      on time     San.Diego     212
## 10  ALASKA      delayed     San.Diego      20
## 11 AM WEST      on time     San.Diego     383
## 12 AM WEST      delayed     San.Diego      65
## 13  ALASKA      on time San.Francisco     503
## 14  ALASKA      delayed San.Francisco     102
## 15 AM WEST      on time San.Francisco     320
## 16 AM WEST      delayed San.Francisco     129
## 17  ALASKA      on time       Seattle  1,841 
## 18  ALASKA      delayed       Seattle     305
## 19 AM WEST      on time       Seattle     201
## 20 AM WEST      delayed       Seattle      61
pivot.raw.data <- transform(pivot.raw.data, Count = as.numeric(gsub(",", "", Count)))
pivot.raw.data$City <- str_replace(pivot.raw.data$City, "[.]", " ")
pivot.raw.data
##    Airline FlightStatus          City Count
## 1   ALASKA      on time   Los Angeles   497
## 2   ALASKA      delayed   Los Angeles    62
## 3  AM WEST      on time   Los Angeles   694
## 4  AM WEST      delayed   Los Angeles   117
## 5   ALASKA      on time       Phoenix   221
## 6   ALASKA      delayed       Phoenix    12
## 7  AM WEST      on time       Phoenix  4840
## 8  AM WEST      delayed       Phoenix   415
## 9   ALASKA      on time     San Diego   212
## 10  ALASKA      delayed     San Diego    20
## 11 AM WEST      on time     San Diego   383
## 12 AM WEST      delayed     San Diego    65
## 13  ALASKA      on time San Francisco   503
## 14  ALASKA      delayed San Francisco   102
## 15 AM WEST      on time San Francisco   320
## 16 AM WEST      delayed San Francisco   129
## 17  ALASKA      on time       Seattle  1841
## 18  ALASKA      delayed       Seattle   305
## 19 AM WEST      on time       Seattle   201
## 20 AM WEST      delayed       Seattle    61

4.5 Restructure data

pivot_raw_data <- tidyr::spread(pivot.raw.data, FlightStatus, Count)
pivot_raw_data
##    Airline          City delayed on time
## 1   ALASKA   Los Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San Diego      20     212
## 4   ALASKA San Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San Diego      65     383
## 9  AM WEST San Francisco     129     320
## 10 AM WEST       Seattle      61     201
#colnames(pivot_raw_data)

5 Show Data

5.1 Data Table

DT::datatable(pivot_raw_data, options = list(pagelength=5))

5.2 Select

DT::datatable(select(pivot_raw_data, Airline:`on time`), options = list(pagelength=5))

5.3 Knitr

knitr::kable(pivot_raw_data)
Airline City delayed on time
ALASKA Los Angeles 62 497
ALASKA Phoenix 12 221
ALASKA San Diego 20 212
ALASKA San Francisco 102 503
ALASKA Seattle 305 1841
AM WEST Los Angeles 117 694
AM WEST Phoenix 415 4840
AM WEST San Diego 65 383
AM WEST San Francisco 129 320
AM WEST Seattle 61 201

5.4 SQL

##    Airline          City delayed on time
## 1   ALASKA   Los Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San Diego      20     212
## 4   ALASKA San Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San Diego      65     383
## 9  AM WEST San Francisco     129     320
## 10 AM WEST       Seattle      61     201

6 Analyze data

6.1 Identify OnTime/Delay rates

OnTimeByAirline <- pivot_raw_data %>% 
  group_by(Airline) %>%
  summarise(totalDelayed = sum(delayed), 
            totalOnTime = sum(`on time`), 
            percentDelayed = round(100 * totalDelayed/(totalDelayed+totalOnTime), 2),
            percentOnTime = round(100 * totalOnTime/(totalDelayed+totalOnTime), 2)
            ) %>% 
  arrange(desc(percentOnTime))
OnTimeByAirline
## # A tibble: 2 x 5
##   Airline totalDelayed totalOnTime percentDelayed percentOnTime
##   <chr>          <dbl>       <dbl>          <dbl>         <dbl>
## 1 AM WEST          787        6438           10.9          89.1
## 2 ALASKA           501        3274           13.3          86.7
DelaysByCity <- pivot_raw_data %>% 
  group_by(City) %>%
  summarise(totalDelayed=sum(delayed), 
            totalOnTime=sum(`on time`), 
            percentDelayed=round(100 * totalDelayed/(totalDelayed+totalOnTime), 2),
            percentOnTime = round(100 * totalOnTime/(totalDelayed+totalOnTime), 2)
            ) %>% 
  arrange(desc(percentDelayed))
DelaysByCity
## # A tibble: 5 x 5
##   City          totalDelayed totalOnTime percentDelayed percentOnTime
##   <chr>                <dbl>       <dbl>          <dbl>         <dbl>
## 1 San Francisco          231         823          21.9           78.1
## 2 Seattle                366        2042          15.2           84.8
## 3 Los Angeles            179        1191          13.1           86.9
## 4 San Diego               85         595          12.5           87.5
## 5 Phoenix                427        5061           7.78          92.2
AirlineCity <- pivot_raw_data %>% 
  mutate(totalFlights = delayed + `on time`, 
         percentDelayed = 100*delayed / totalFlights,
         percentOnTime = 100*`on time` / totalFlights
         ) %>% 
  arrange(percentDelayed)
AirlineCity
##    Airline          City delayed on time totalFlights percentDelayed
## 1   ALASKA       Phoenix      12     221          233       5.150215
## 2  AM WEST       Phoenix     415    4840         5255       7.897241
## 3   ALASKA     San Diego      20     212          232       8.620690
## 4   ALASKA   Los Angeles      62     497          559      11.091234
## 5   ALASKA       Seattle     305    1841         2146      14.212488
## 6  AM WEST   Los Angeles     117     694          811      14.426634
## 7  AM WEST     San Diego      65     383          448      14.508929
## 8   ALASKA San Francisco     102     503          605      16.859504
## 9  AM WEST       Seattle      61     201          262      23.282443
## 10 AM WEST San Francisco     129     320          449      28.730512
##    percentOnTime
## 1       94.84979
## 2       92.10276
## 3       91.37931
## 4       88.90877
## 5       85.78751
## 6       85.57337
## 7       85.49107
## 8       83.14050
## 9       76.71756
## 10      71.26949

7 Analysis Visualizations

ggplot(AirlineCity, aes(x=reorder(City,percentDelayed), y=percentDelayed)) + 
  geom_bar(aes(fill = Airline), position = "dodge", stat = "identity") + 
  xlab("City") +
  ylab("Delay Rate")

ggplot(AirlineCity, aes(x=City)) +
  geom_line(aes(y=percentDelayed, color = "darkred")) +
  geom_line(aes(y=percentOnTime, color="steelblue"), linetype="twodash") +
  scale_y_continuous(name ="Cumulative minutes delayed by cause of delay", labels = scales::comma ) +
  scale_color_discrete(name = "Flight Status", labels=c("Delay", "On Time")) +
  theme_minimal()

ggplot(pivot.raw.data, aes(x = Airline, y = Count, fill = FlightStatus)) + 
  geom_bar(position = "stack", stat = "identity") + facet_grid(~ City) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  scale_fill_discrete(labels = c("Delayed", "On Time")) +
  ylab("Flights")

ggplot(OnTimeByAirline, aes(Airline, percentOnTime)) + 
  geom_bar(aes(fill = Airline), position = "dodge", stat = "identity") + 
  ylab("Percent OnTIme")

8 Conclusion

  • AM WEST airline is more On Time (89%) with less Delays (11%) compared to ALASKA airlines which has 87% On Time with 13% delays
  • San Francisco has most delay rate (22%) whereas Phoenix has least delay rate (8%)
  • AM WEST airline at San Francisco has most delays (29%) whereas ALASKA airline at Phoenix has least delays (5%)
  • AM WEST airline at Phoenix has most flights (5255) with second best on-time arrival rate (92%) resulting on overall better On-Time arrival compared to ALASKA airlines