Read and transform data
airline_data <- read.csv('flightdata.txt')
#wide format data
airline_data
## airline status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA ontime 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST ontime 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
#long format data
airline_data_long <- airline_data %>%
gather( 3:7, key="airport", value = "num_of_flights")
airline_data_long
## airline status airport num_of_flights
## 1 ALASKA ontime Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST ontime Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA ontime Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST ontime Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA ontime San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST ontime San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA ontime San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST ontime San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA ontime Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST ontime Seattle 201
## 20 AM WEST delayed Seattle 61
# Number of flights
sum(airline_data_long$num_of_flights)
## [1] 11000
#Flights by airline
groupby_airline <- group_by(airline_data_long, airline)
summarise(groupby_airline, sum(num_of_flights))
## # A tibble: 2 x 2
## airline `sum(num_of_flights)`
## <fct> <int>
## 1 ALASKA 3775
## 2 AM WEST 7225
ggplot(data = groupby_airline, aes(x = airline, y= num_of_flights, fill = airline)) +
geom_bar(stat="identity", position=position_dodge())

#flights by airport
group_by_airport <- airline_data_long %>%group_by(airport)
summarise(group_by_airport, sum(num_of_flights))
## # A tibble: 5 x 2
## airport `sum(num_of_flights)`
## <chr> <int>
## 1 Los.Angeles 1370
## 2 Phoenix 5488
## 3 San.Diego 680
## 4 San.Francisco 1054
## 5 Seattle 2408
ggplot(data = group_by_airport, aes(x = airport, y = num_of_flights, fill = airport)) +
geom_bar(stat ="identity", position = position_dodge())

#flights by airline, airport
groupby_airline_airport <- group_by(airline_data_long, airline, airport)
summarise(groupby_airline_airport, sum(num_of_flights))
## # A tibble: 10 x 3
## # Groups: airline [?]
## airline airport `sum(num_of_flights)`
## <fct> <chr> <int>
## 1 ALASKA Los.Angeles 559
## 2 ALASKA Phoenix 233
## 3 ALASKA San.Diego 232
## 4 ALASKA San.Francisco 605
## 5 ALASKA Seattle 2146
## 6 AM WEST Los.Angeles 811
## 7 AM WEST Phoenix 5255
## 8 AM WEST San.Diego 448
## 9 AM WEST San.Francisco 449
## 10 AM WEST Seattle 262
ggplot(data = groupby_airline_airport, aes(x = airport, y = num_of_flights, fill = airline)) +
geom_bar(stat ="identity", position = position_dodge())

#flights by status
groupby_status <- group_by(airline_data_long, status)
summarise(groupby_status, sum(num_of_flights))
## # A tibble: 2 x 2
## status `sum(num_of_flights)`
## <fct> <int>
## 1 delayed 1288
## 2 ontime 9712
ggplot(data = groupby_status, aes(x = status, y = num_of_flights, fill = status)) +
geom_bar(stat ="identity", position = position_dodge())

#flights by status and airport
groupby_status_airport <- group_by(airline_data_long,airport, status)
summarise(groupby_status_airport, sum(num_of_flights))
## # A tibble: 10 x 3
## # Groups: airport [?]
## airport status `sum(num_of_flights)`
## <chr> <fct> <int>
## 1 Los.Angeles delayed 179
## 2 Los.Angeles ontime 1191
## 3 Phoenix delayed 427
## 4 Phoenix ontime 5061
## 5 San.Diego delayed 85
## 6 San.Diego ontime 595
## 7 San.Francisco delayed 231
## 8 San.Francisco ontime 823
## 9 Seattle delayed 366
## 10 Seattle ontime 2042
ggplot(data = groupby_status_airport, aes(x = airport, y = num_of_flights, fill = status)) +
geom_bar(stat ="identity", position = position_dodge())

#fligths per airport by airline
groupby_airport <- group_by(airline_data_long, airline, airport)
summarise(groupby_airport, sum(num_of_flights))
## # A tibble: 10 x 3
## # Groups: airline [?]
## airline airport `sum(num_of_flights)`
## <fct> <chr> <int>
## 1 ALASKA Los.Angeles 559
## 2 ALASKA Phoenix 233
## 3 ALASKA San.Diego 232
## 4 ALASKA San.Francisco 605
## 5 ALASKA Seattle 2146
## 6 AM WEST Los.Angeles 811
## 7 AM WEST Phoenix 5255
## 8 AM WEST San.Diego 448
## 9 AM WEST San.Francisco 449
## 10 AM WEST Seattle 262
ggplot(data=groupby_airport, aes(x=airport, y= num_of_flights, fill=airline)) +
geom_bar(stat="identity", position=position_dodge())

#delayed vs on time %
spread_delay_ontime <- airline_data_long %>%
spread(key = status , value = num_of_flights)
spread_delay_ontime_percentage <- spread_delay_ontime%>%
mutate(
ontime_percentage = round((ontime / (delayed + ontime)) *100),
delayed_percentage = round((delayed / (delayed + ontime)) * 100)
)
spread_delay_ontime_percentage
## airline airport delayed ontime ontime_percentage
## 1 ALASKA Los.Angeles 62 497 89
## 2 ALASKA Phoenix 12 221 95
## 3 ALASKA San.Diego 20 212 91
## 4 ALASKA San.Francisco 102 503 83
## 5 ALASKA Seattle 305 1841 86
## 6 AM WEST Los.Angeles 117 694 86
## 7 AM WEST Phoenix 415 4840 92
## 8 AM WEST San.Diego 65 383 85
## 9 AM WEST San.Francisco 129 320 71
## 10 AM WEST Seattle 61 201 77
## delayed_percentage
## 1 11
## 2 5
## 3 9
## 4 17
## 5 14
## 6 14
## 7 8
## 8 15
## 9 29
## 10 23
ggplot(airline_data_long, aes(x = status, y = (num_of_flights), fill = status)) +
geom_bar(stat="identity")+
facet_grid(airline ~ airport)
