Read and transform data

airline_data <- read.csv('flightdata.txt')

#wide format data
airline_data
##   airline  status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA  ontime         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST  ontime         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61
#long format data
airline_data_long <- airline_data %>% 
gather( 3:7, key="airport", value = "num_of_flights")
airline_data_long 
##    airline  status       airport num_of_flights
## 1   ALASKA  ontime   Los.Angeles            497
## 2   ALASKA delayed   Los.Angeles             62
## 3  AM WEST  ontime   Los.Angeles            694
## 4  AM WEST delayed   Los.Angeles            117
## 5   ALASKA  ontime       Phoenix            221
## 6   ALASKA delayed       Phoenix             12
## 7  AM WEST  ontime       Phoenix           4840
## 8  AM WEST delayed       Phoenix            415
## 9   ALASKA  ontime     San.Diego            212
## 10  ALASKA delayed     San.Diego             20
## 11 AM WEST  ontime     San.Diego            383
## 12 AM WEST delayed     San.Diego             65
## 13  ALASKA  ontime San.Francisco            503
## 14  ALASKA delayed San.Francisco            102
## 15 AM WEST  ontime San.Francisco            320
## 16 AM WEST delayed San.Francisco            129
## 17  ALASKA  ontime       Seattle           1841
## 18  ALASKA delayed       Seattle            305
## 19 AM WEST  ontime       Seattle            201
## 20 AM WEST delayed       Seattle             61
# Number of flights
sum(airline_data_long$num_of_flights)
## [1] 11000
#Flights by airline
groupby_airline <- group_by(airline_data_long, airline)

summarise(groupby_airline, sum(num_of_flights))
## # A tibble: 2 x 2
##   airline `sum(num_of_flights)`
##   <fct>                   <int>
## 1 ALASKA                   3775
## 2 AM WEST                  7225
ggplot(data = groupby_airline, aes(x = airline, y= num_of_flights, fill = airline)) +
    geom_bar(stat="identity", position=position_dodge())

#flights by airport
group_by_airport <- airline_data_long %>%group_by(airport)

summarise(group_by_airport, sum(num_of_flights))
## # A tibble: 5 x 2
##   airport       `sum(num_of_flights)`
##   <chr>                         <int>
## 1 Los.Angeles                    1370
## 2 Phoenix                        5488
## 3 San.Diego                       680
## 4 San.Francisco                  1054
## 5 Seattle                        2408
ggplot(data = group_by_airport, aes(x = airport, y = num_of_flights, fill = airport)) +
    geom_bar(stat ="identity", position = position_dodge())

#flights by airline, airport
groupby_airline_airport <- group_by(airline_data_long, airline, airport)

summarise(groupby_airline_airport, sum(num_of_flights))
## # A tibble: 10 x 3
## # Groups:   airline [?]
##    airline airport       `sum(num_of_flights)`
##    <fct>   <chr>                         <int>
##  1 ALASKA  Los.Angeles                     559
##  2 ALASKA  Phoenix                         233
##  3 ALASKA  San.Diego                       232
##  4 ALASKA  San.Francisco                   605
##  5 ALASKA  Seattle                        2146
##  6 AM WEST Los.Angeles                     811
##  7 AM WEST Phoenix                        5255
##  8 AM WEST San.Diego                       448
##  9 AM WEST San.Francisco                   449
## 10 AM WEST Seattle                         262
ggplot(data = groupby_airline_airport, aes(x = airport, y = num_of_flights, fill = airline)) +
    geom_bar(stat ="identity", position = position_dodge())

#flights by status
groupby_status <- group_by(airline_data_long, status)

summarise(groupby_status, sum(num_of_flights))
## # A tibble: 2 x 2
##   status  `sum(num_of_flights)`
##   <fct>                   <int>
## 1 delayed                  1288
## 2 ontime                   9712
ggplot(data = groupby_status, aes(x = status, y = num_of_flights, fill = status)) +
    geom_bar(stat ="identity", position = position_dodge())

#flights by status and airport
groupby_status_airport <- group_by(airline_data_long,airport, status)

summarise(groupby_status_airport, sum(num_of_flights))
## # A tibble: 10 x 3
## # Groups:   airport [?]
##    airport       status  `sum(num_of_flights)`
##    <chr>         <fct>                   <int>
##  1 Los.Angeles   delayed                   179
##  2 Los.Angeles   ontime                   1191
##  3 Phoenix       delayed                   427
##  4 Phoenix       ontime                   5061
##  5 San.Diego     delayed                    85
##  6 San.Diego     ontime                    595
##  7 San.Francisco delayed                   231
##  8 San.Francisco ontime                    823
##  9 Seattle       delayed                   366
## 10 Seattle       ontime                   2042
ggplot(data = groupby_status_airport, aes(x = airport, y = num_of_flights, fill = status)) +
    geom_bar(stat ="identity", position = position_dodge())

#fligths per airport by airline
groupby_airport <- group_by(airline_data_long, airline, airport)
summarise(groupby_airport, sum(num_of_flights))
## # A tibble: 10 x 3
## # Groups:   airline [?]
##    airline airport       `sum(num_of_flights)`
##    <fct>   <chr>                         <int>
##  1 ALASKA  Los.Angeles                     559
##  2 ALASKA  Phoenix                         233
##  3 ALASKA  San.Diego                       232
##  4 ALASKA  San.Francisco                   605
##  5 ALASKA  Seattle                        2146
##  6 AM WEST Los.Angeles                     811
##  7 AM WEST Phoenix                        5255
##  8 AM WEST San.Diego                       448
##  9 AM WEST San.Francisco                   449
## 10 AM WEST Seattle                         262
ggplot(data=groupby_airport, aes(x=airport, y= num_of_flights, fill=airline)) +
    geom_bar(stat="identity", position=position_dodge())

#delayed vs on time %
spread_delay_ontime <- airline_data_long %>%
    spread(key = status , value = num_of_flights)

spread_delay_ontime_percentage <- spread_delay_ontime%>% 
  mutate(
         ontime_percentage = round((ontime / (delayed + ontime)) *100),
         delayed_percentage = round((delayed / (delayed + ontime)) * 100)
         )

spread_delay_ontime_percentage
##    airline       airport delayed ontime ontime_percentage
## 1   ALASKA   Los.Angeles      62    497                89
## 2   ALASKA       Phoenix      12    221                95
## 3   ALASKA     San.Diego      20    212                91
## 4   ALASKA San.Francisco     102    503                83
## 5   ALASKA       Seattle     305   1841                86
## 6  AM WEST   Los.Angeles     117    694                86
## 7  AM WEST       Phoenix     415   4840                92
## 8  AM WEST     San.Diego      65    383                85
## 9  AM WEST San.Francisco     129    320                71
## 10 AM WEST       Seattle      61    201                77
##    delayed_percentage
## 1                  11
## 2                   5
## 3                   9
## 4                  17
## 5                  14
## 6                  14
## 7                   8
## 8                  15
## 9                  29
## 10                 23
ggplot(airline_data_long, aes(x = status, y = (num_of_flights), fill = status)) +
  geom_bar(stat="identity")+
  facet_grid(airline ~ airport)