The data used in this analysis comes from the R dataset hflights (loaded below from the hflights package). This dataset compiles the basic details of flights for fifteen carriers in the USA in 2011.
Imagine yourself as an analyst who works in a consultancy specializing in market research. Your manager provided you with the mentioned dataset and asked you to extract some useful insights. He also asked you to visualize your insights in a nice and precise way so he can draw a few conclusions about the main trends in the market.
library(hflights)
library(ggplot2)
library(dplyr)
library(ggthemes)
library(scales)
library(gridExtra)
# Preview the raw data: one line per column with its type and first values.
glimpse(hflights)
## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
Let’s add a new column, Carrier, to show the full name of each carrier instead of the codes stored in UniqueCarrier.
# Lookup table: carrier code (as stored in UniqueCarrier) -> readable name.
lut <- c(
  "AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
  "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
  "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
  "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa"
)

# Translate every flight's carrier code into its full name. Indexing a named
# vector by name returns a named vector, so unname() is applied to store a
# plain character column instead of one carrying the code as element names.
# A code that is absent from `lut` yields NA in Carrier.
hflights$Carrier <- unname(lut[hflights$UniqueCarrier])

# Confirm the new Carrier column was appended.
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier <chr> "American", "American", "American", "America...
# Market overview: number of flights that were not cancelled, per carrier,
# ordered from the busiest carrier to the least busy one.
market_snapshot <- hflights %>%
  filter(Cancelled == 0L) %>%
  group_by(Carrier) %>%
  summarise(flights = n()) %>%
  arrange(-flights)

market_snapshot
## # A tibble: 15 x 2
## Carrier flights
## <chr> <int>
## 1 ExpressJet 71921
## 2 Continental 69557
## 3 Southwest 44640
## 4 SkyWest 15837
## 5 American_Eagle 4513
## 6 US_Airways 4036
## 7 American 3184
## 8 Delta 2599
## 9 Atlantic_Southeast 2128
## 10 AirTran 2118
## 11 United 2038
## 12 Frontier 832
## 13 JetBlue 677
## 14 Alaska 365
## 15 Mesa 78
It seems that ExpressJet and Continental have the lion’s share of the flights. Let’s dig deeper and examine their performance over different time ranges.
# Weekly profile of the two busiest carriers: non-cancelled flights counted
# per carrier and day of week. All conditions are combined in one filter()
# call (comma-separated conditions are AND-ed). The result stays grouped by
# Carrier after summarise().
two_largest_daily <- hflights %>%
  filter(
    DayOfWeek %in% 1:7,
    Cancelled == 0,
    Carrier %in% c("Continental", "ExpressJet")
  ) %>%
  group_by(Carrier, DayOfWeek) %>%
  summarise(flights = n())

two_largest_daily
## # A tibble: 14 x 3
## # Groups: Carrier [?]
## Carrier DayOfWeek flights
## <chr> <int> <int>
## 1 Continental 1 10453
## 2 Continental 2 9418
## 3 Continental 3 9497
## 4 Continental 4 10522
## 5 Continental 5 10509
## 6 Continental 6 9061
## 7 Continental 7 10097
## 8 ExpressJet 1 11154
## 9 ExpressJet 2 9670
## 10 ExpressJet 3 9785
## 11 ExpressJet 4 11203
## 12 ExpressJet 5 11231
## 13 ExpressJet 6 8502
## 14 ExpressJet 7 10376
# Line chart of flights per weekday, one coloured line per carrier, with
# hollow point markers drawn on top of the lines.
ggplot(two_largest_daily, aes(x = DayOfWeek, y = flights, colour = Carrier)) +
  geom_line(size = 1.2) +
  geom_point(size = 3.2, shape = 21, fill = "white", stroke = 2.5) +
  scale_colour_manual(values = c("firebrick1", "goldenrod1")) +
  # DayOfWeek is numeric (1-7); label the breaks with weekday names.
  scale_x_continuous(
    breaks = 1:7,
    labels = c(
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday"
    )
  ) +
  labs(
    title = "The Two Largest Players in the Market",
    subtitle = "Daily Flights in a Week",
    y = "Number of Flights"
  ) +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme,
  # so that theme_minimal() does not overwrite them.
  theme(
    axis.text.x = element_text(
      colour = "black", size = 12, vjust = 0.6, angle = 60
    ),
    axis.text.y = element_text(colour = "black", size = 12),
    axis.title.x = element_blank(),
    axis.title.y = element_text(
      size = 14, face = "bold", margin = margin(r = 20)
    ),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 12, colour = "black", face = "bold"),
    plot.title = element_text(size = 15, face = "bold")
  )
# Day-of-month profile of the two busiest carriers: non-cancelled flights
# counted per carrier and day of month. Only days 1 to 30 are kept, so the
# 31st is left out of the counts. The grouped tibble is converted to a plain
# data frame at the end of the pipeline.
two_largest_daily_month <- hflights %>%
  filter(
    DayofMonth %in% 1:30,
    Cancelled == 0,
    Carrier %in% c("ExpressJet", "Continental")
  ) %>%
  group_by(Carrier, DayofMonth) %>%
  summarise(flights = n()) %>%
  as.data.frame()

two_largest_daily_month
## Carrier DayofMonth flights
## 1 Continental 1 2249
## 2 Continental 2 2289
## 3 Continental 3 2254
## 4 Continental 4 2130
## 5 Continental 5 2268
## 6 Continental 6 2334
## 7 Continental 7 2329
## 8 Continental 8 2284
## 9 Continental 9 2287
## 10 Continental 10 2347
## 11 Continental 11 2299
## 12 Continental 12 2257
## 13 Continental 13 2313
## 14 Continental 14 2346
## 15 Continental 15 2299
## 16 Continental 16 2280
## 17 Continental 17 2332
## 18 Continental 18 2312
## 19 Continental 19 2222
## 20 Continental 20 2309
## 21 Continental 21 2339
## 22 Continental 22 2324
## 23 Continental 23 2303
## 24 Continental 24 2249
## 25 Continental 25 2276
## 26 Continental 26 2248
## 27 Continental 27 2307
## 28 Continental 28 2322
## 29 Continental 29 2063
## 30 Continental 30 2071
## 31 ExpressJet 1 2332
## 32 ExpressJet 2 2422
## 33 ExpressJet 3 2273
## 34 ExpressJet 4 2154
## 35 ExpressJet 5 2263
## 36 ExpressJet 6 2447
## 37 ExpressJet 7 2462
## 38 ExpressJet 8 2305
## 39 ExpressJet 9 2305
## 40 ExpressJet 10 2424
## 41 ExpressJet 11 2440
## 42 ExpressJet 12 2248
## 43 ExpressJet 13 2422
## 44 ExpressJet 14 2491
## 45 ExpressJet 15 2345
## 46 ExpressJet 16 2336
## 47 ExpressJet 17 2393
## 48 ExpressJet 18 2429
## 49 ExpressJet 19 2304
## 50 ExpressJet 20 2465
## 51 ExpressJet 21 2466
## 52 ExpressJet 22 2339
## 53 ExpressJet 23 2357
## 54 ExpressJet 24 2304
## 55 ExpressJet 25 2331
## 56 ExpressJet 26 2352
## 57 ExpressJet 27 2499
## 58 ExpressJet 28 2530
## 59 ExpressJet 29 2033
## 60 ExpressJet 30 2094
# Line chart of flights per day of month, one coloured line per carrier,
# with hollow point markers drawn on top of the lines.
ggplot(
  two_largest_daily_month,
  aes(x = DayofMonth, y = flights, colour = Carrier)
) +
  geom_line(size = 1.2) +
  geom_point(size = 3.2, shape = 21, fill = "white", stroke = 2.5) +
  scale_colour_manual(values = c("firebrick1", "goldenrod1")) +
  scale_x_continuous(breaks = 1:30, limits = c(0, 30)) +
  # Fixed y range with a tick every 100 flights.
  scale_y_continuous(
    limits = c(2000, 2600),
    breaks = seq(2000, 2600, by = 100),
    labels = scales::comma
  ) +
  labs(
    title = "The Two Largest Players in the Market",
    subtitle = "Daily Flights in a Month",
    x = "Day of Month",
    y = "Number of Flights"
  ) +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.text.x = element_text(size = 10, colour = "black"),
    axis.text.y = element_text(size = 10, colour = "black"),
    axis.title.x = element_text(
      size = 14, face = "bold", margin = margin(t = 20)
    ),
    axis.title.y = element_text(
      size = 14, face = "bold", margin = margin(r = 20)
    ),
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 12, face = "bold", colour = "black"),
    plot.title = element_text(size = 15, face = "bold")
  )
# Monthly profile of the two busiest carriers: non-cancelled flights counted
# per carrier and calendar month, returned as a plain data frame.
two_largest_monthly <- hflights %>%
  filter(
    Month %in% 1:12,
    Cancelled == 0,
    Carrier %in% c("ExpressJet", "Continental")
  ) %>%
  group_by(Carrier, Month) %>%
  summarise(flights = n()) %>%
  as.data.frame()

two_largest_monthly
## Carrier Month flights
## 1 Continental 1 6010
## 2 Continental 2 5181
## 3 Continental 3 6024
## 4 Continental 4 5513
## 5 Continental 5 5824
## 6 Continental 6 6025
## 7 Continental 7 6174
## 8 Continental 8 5962
## 9 Continental 9 5451
## 10 Continental 10 5901
## 11 Continental 11 5691
## 12 Continental 12 5801
## 13 ExpressJet 1 6727
## 14 ExpressJet 2 5348
## 15 ExpressJet 3 6321
## 16 ExpressJet 4 5976
## 17 ExpressJet 5 5985
## 18 ExpressJet 6 6045
## 19 ExpressJet 7 6728
## 20 ExpressJet 8 6606
## 21 ExpressJet 9 5590
## 22 ExpressJet 10 5514
## 23 ExpressJet 11 5309
## 24 ExpressJet 12 5772
# Line chart of flights per month, one coloured line per carrier, with
# hollow point markers drawn on top of the lines.
ggplot(two_largest_monthly, aes(x = Month, y = flights, colour = Carrier)) +
  geom_line(size = 1.2) +
  geom_point(size = 3.2, shape = 21, fill = "white", stroke = 2.5) +
  scale_colour_manual(values = c("firebrick1", "goldenrod1")) +
  scale_y_continuous(breaks = seq(5000, 7000, 300)) +
  # Month is numeric (1-12); label the breaks with month abbreviations.
  scale_x_continuous(
    breaks = 1:12,
    labels = c(
      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    )
  ) +
  # Title aligned with the weekly and day-of-month charts; subtitle typo
  # ("Monhtly") corrected.
  labs(
    title = "The Two Largest Players in the Market",
    subtitle = "Monthly Flights",
    y = "Number of Flights"
  ) +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.text.x = element_text(size = 12, colour = "black"),
    axis.text.y = element_text(size = 12, colour = "black"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(
      size = 14, face = "bold", margin = margin(r = 20)
    ),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom",
    legend.text = element_text(size = 12, face = "bold", colour = "black"),
    legend.title = element_blank(),
    plot.title = element_text(size = 15, colour = "black", face = "bold")
  )
Let’s try to extract some useful statistics from our data.
# Busiest origin airport of each carrier, counting only flights with
# DayOfWeek 6 or 7 (Saturday and Sunday under the weekday labels used in the
# weekly chart above).
#
# After summarise() the data is still grouped by Carrier, so rank() orders the
# origins within each carrier. rank() gives tied values their average rank
# (e.g. 1.5 and 1.5), so testing `rank == 1` would silently drop a carrier
# whose two top origins tie. Keeping the rows with the smallest rank in each
# group returns every top origin in that case.
Origin_carr <- hflights %>%
  filter(DayOfWeek %in% c(6, 7)) %>%
  group_by(Carrier, Origin) %>%
  summarise(flights = n()) %>%
  mutate(rank = rank(desc(flights))) %>%
  filter(rank == min(rank)) %>%
  arrange(desc(flights))

Origin_carr
## # A tibble: 15 x 4
## # Groups: Carrier [15]
## Carrier Origin flights rank
## <chr> <chr> <int> <dbl>
## 1 Continental IAH 19250 1
## 2 ExpressJet IAH 19136 1
## 3 Southwest HOU 10792 1
## 4 SkyWest IAH 4380 1
## 5 US_Airways IAH 1119 1
## 6 American IAH 896 1
## 7 American_Eagle IAH 605 1
## 8 Delta IAH 596 1
## 9 United IAH 589 1
## 10 AirTran HOU 527 1
## 11 Atlantic_Southeast IAH 437 1
## 12 Frontier HOU 203 1
## 13 JetBlue HOU 188 1
## 14 Alaska IAH 105 1
## 15 Mesa IAH 39 1
# Most frequent destination of each carrier, counting only flights with
# DayOfWeek 6 or 7 (Saturday and Sunday under the weekday labels used in the
# weekly chart above).
#
# After summarise() the data is still grouped by Carrier, so rank() orders the
# destinations within each carrier. rank() gives tied values their average
# rank, so testing `rank == 1` would silently drop a carrier whose top
# destinations tie. Keeping the rows with the smallest rank in each group
# returns every top destination in that case.
Dest_carr <- hflights %>%
  filter(DayOfWeek %in% c(6, 7)) %>%
  group_by(Carrier, Dest) %>%
  summarise(flights = n()) %>%
  mutate(rank = rank(desc(flights))) %>%
  filter(rank == min(rank)) %>%
  arrange(desc(flights))

Dest_carr
## # A tibble: 15 x 4
## # Groups: Carrier [15]
## Carrier Dest flights rank
## <chr> <chr> <int> <dbl>
## 1 Southwest DAL 1786 1
## 2 Continental EWR 1117 1
## 3 ExpressJet CRP 822 1
## 4 Delta ATL 633 1
## 5 US_Airways CLT 598 1
## 6 American_Eagle DFW 593 1
## 7 American DFW 569 1
## 8 AirTran ATL 472 1
## 9 SkyWest COS 393 1
## 10 Frontier DEN 203 1
## 11 Atlantic_Southeast DTW 201 1
## 12 United SFO 197 1
## 13 JetBlue JFK 188 1
## 14 Alaska SEA 105 1
## 15 Mesa CLT 31 1
# Ten busiest routes per origin airport, counting only flights with
# DayOfWeek 6 or 7 (Saturday and Sunday under the weekday labels used in the
# weekly chart above).
#
# After summarise() the data is still grouped by Origin, so the rank is
# computed separately for each origin airport. rank() gives tied routes an
# average (possibly fractional) rank, which `%in% 1:10` would never match;
# `rank <= 10` keeps tied routes that fall inside the top ten.
Airline_route <- hflights %>%
  filter(DayOfWeek %in% c(6, 7)) %>%
  group_by(Origin, Dest) %>%
  summarise(flights = n()) %>%
  arrange(desc(flights)) %>%
  mutate(rank = rank(desc(flights))) %>%
  filter(rank <= 10)

Airline_route
## # A tibble: 20 x 4
## # Groups: Origin [2]
## Origin Dest flights rank
## <chr> <chr> <int> <dbl>
## 1 HOU DAL 1786 1
## 2 IAH ORD 1608 1
## 3 IAH LAX 1311 2
## 4 IAH ATL 1292 3
## 5 IAH CLT 1254 4
## 6 IAH DFW 1155 5
## 7 IAH EWR 1117 6
## 8 IAH PHX 1041 7
## 9 IAH DEN 1026 8
## 10 IAH MSY 912 9
## 11 IAH AUS 865 10
## 12 HOU MSY 778 2
## 13 HOU ATL 704 3
## 14 HOU DEN 606 4
## 15 HOU DFW 593 5
## 16 HOU HRL 529 6
## 17 HOU MDW 505 7
## 18 HOU MCO 426 8
## 19 HOU LAS 396 9
## 20 HOU SAT 393 10
Let’s try to extract some seasonal statistics from our data.
# Top ten destinations per season, counting only flights with DayOfWeek 6 or
# 7. Months are mapped to seasons (Dec-Feb winter, Mar-May spring, Jun-Aug
# summer, Sep-Nov autumn), flights are counted per season and destination,
# and the first ten rows of each season are kept after sorting by descending
# count. The result is converted to a plain data frame.
allseasons <- hflights %>%
  filter(DayOfWeek %in% c(6, 7)) %>%
  mutate(
    Season = case_when(
      Month %in% c(12, 1, 2) ~ "Winter",
      Month %in% 3:5 ~ "Spring",
      Month %in% 6:8 ~ "Summer",
      Month %in% 9:11 ~ "Autumn"
    )
  ) %>%
  # Months matching no branch get NA from case_when(); drop them.
  filter(!is.na(Season)) %>%
  group_by(Season, Dest) %>%
  summarise(flights = n()) %>%
  filter(!is.na(flights)) %>%
  arrange(Season, desc(flights)) %>%
  # Still grouped by Season here, so slice() takes rows 1-10 of each season.
  slice(1:10) %>%
  as.data.frame()

allseasons
## Season Dest flights
## 1 Autumn DAL 508
## 2 Autumn ATL 470
## 3 Autumn DFW 429
## 4 Autumn MSY 418
## 5 Autumn LAX 416
## 6 Autumn DEN 412
## 7 Autumn ORD 386
## 8 Autumn PHX 321
## 9 Autumn AUS 317
## 10 Autumn CLT 311
## 11 Spring ATL 567
## 12 Spring DAL 544
## 13 Spring MSY 460
## 14 Spring DFW 426
## 15 Spring ORD 403
## 16 Spring LAX 398
## 17 Spring DEN 388
## 18 Spring PHX 355
## 19 Spring AUS 321
## 20 Spring CLT 318
## 21 Summer DAL 519
## 22 Summer LAX 494
## 23 Summer ATL 477
## 24 Summer DFW 444
## 25 Summer ORD 425
## 26 Summer DEN 423
## 27 Summer MSY 377
## 28 Summer PHX 356
## 29 Summer CLT 322
## 30 Summer EWR 312
## 31 Winter DAL 559
## 32 Winter ATL 482
## 33 Winter DFW 449
## 34 Winter MSY 435
## 35 Winter DEN 409
## 36 Winter ORD 394
## 37 Winter LAX 362
## 38 Winter PHX 344
## 39 Winter AUS 311
## 40 Winter SAT 309
# Quick overview: one bar panel per season, each with its own free axes.
# geom_col() draws bars whose heights are the values in the data.
ggplot(allseasons, aes(x = Dest, y = flights)) +
  geom_col() +
  facet_wrap(~ Season, scales = "free")
Let’s plot a graph for each season.
# Store Season as a factor with an explicit level order. factor() handles the
# conversion itself, so no separate as.factor() step is needed, and the
# argument is spelled out as `levels` instead of relying on partial matching
# of `level`.
allseasons$Season <- factor(
  allseasons$Season,
  levels = c("Autumn", "Spring", "Summer", "Winter")
)
# Horizontal bar chart of the spring rows of `allseasons`, with destinations
# ordered by their flight count.
allseasons %>%
  subset(Season == "Spring") %>%
  ggplot(aes(x = reorder(Dest, flights), y = flights)) +
  geom_col(width = 0.3, fill = "chartreuse3") +
  scale_y_continuous(
    name = "Number of Flights",
    limits = c(0, 600),
    breaks = seq(0, 600, by = 100),
    labels = scales::comma
  ) +
  scale_x_discrete(name = "Destinations") +
  labs(title = "The Ten Most Visited Destinations in Spring") +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.line.x = element_line(colour = "black", size = 1.5),
    axis.text.x = element_text(colour = "black", size = 13),
    axis.text.y = element_text(colour = "black", size = 13),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.x = element_text(
      size = 14, face = "bold", margin = margin(t = 20)
    ),
    axis.title.y = element_text(
      size = 14, face = "bold", margin = margin(r = 20)
    ),
    title = element_text(size = 14, face = "bold", colour = "black")
  ) +
  # Flip the axes so the destination bars run horizontally.
  coord_flip()
# Horizontal bar chart of the summer rows of `allseasons`, with destinations
# ordered by their flight count.
allseasons %>%
  subset(Season == "Summer") %>%
  ggplot(aes(x = reorder(Dest, flights), y = flights)) +
  geom_col(width = 0.3, fill = "darkgoldenrod1") +
  scale_y_continuous(
    name = "Number of Flights",
    limits = c(0, 600),
    breaks = seq(0, 600, by = 100),
    labels = scales::comma
  ) +
  scale_x_discrete(name = "Destinations") +
  labs(title = "The Ten Most Visited Destinations in Summer") +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.line.x = element_line(colour = "black", size = 1.5),
    axis.text.x = element_text(colour = "black", size = 13),
    axis.text.y = element_text(colour = "black", size = 13),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.x = element_text(
      size = 14, face = "bold", margin = margin(t = 20)
    ),
    axis.title.y = element_text(
      size = 14, face = "bold", margin = margin(r = 20)
    ),
    title = element_text(size = 14, face = "bold", colour = "black")
  ) +
  # Flip the axes so the destination bars run horizontally.
  coord_flip()
# Horizontal bar chart of the autumn rows of `allseasons`, with destinations
# ordered by their flight count.
allseasons %>%
  subset(Season == "Autumn") %>%
  ggplot(aes(x = reorder(Dest, flights), y = flights)) +
  geom_col(width = 0.3, fill = "gray74") +
  scale_y_continuous(
    name = "Number of Flights",
    limits = c(0, 600),
    breaks = seq(0, 600, by = 100),
    labels = scales::comma
  ) +
  scale_x_discrete(name = "Destinations") +
  labs(title = "The Ten Most Visited Destinations in Autumn") +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.line.x = element_line(colour = "black", size = 1.5),
    axis.text.x = element_text(colour = "black", size = 13),
    axis.text.y = element_text(colour = "black", size = 13),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.x = element_text(
      size = 14, face = "bold", margin = margin(t = 20)
    ),
    axis.title.y = element_text(
      size = 14, face = "bold", margin = margin(r = 20)
    ),
    title = element_text(size = 14, face = "bold", colour = "black")
  ) +
  # Flip the axes so the destination bars run horizontally.
  coord_flip()
# Horizontal bar chart of the winter rows of `allseasons`, with destinations
# ordered by their flight count.
# The data is subset on "Winter" so it matches the chart title; the previous
# version subset on "Autumn" and so redrew the autumn data under a winter
# title.
ggplot(
  subset(allseasons, Season == "Winter"),
  aes(x = reorder(Dest, flights), y = flights)
) +
  geom_bar(stat = "identity", width = 0.3, fill = "deepskyblue3") +
  scale_y_continuous(
    name = "Number of Flights",
    limits = c(0, 600),
    breaks = seq(0, 600, by = 100),
    labels = scales::comma
  ) +
  scale_x_discrete(name = "Destinations") +
  labs(title = "The Ten Most Visited Destinations in Winter") +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.line.x = element_line(colour = "black", size = 1.5),
    axis.text.x = element_text(colour = "black", size = 13),
    axis.text.y = element_text(colour = "black", size = 13),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.x = element_text(
      size = 14, face = "bold", margin = margin(t = 20)
    ),
    axis.title.y = element_text(
      size = 14, face = "bold", margin = margin(r = 20)
    ),
    title = element_text(size = 14, face = "bold", colour = "black")
  ) +
  # Flip the axes so the destination bars run horizontally.
  coord_flip()
Let’s try to extract some useful insights regarding the delayed and cancelled flights.
# Five carriers with the most cancelled flights: count the rows flagged
# Cancelled == 1 per carrier, sort by descending count, keep the first five.
Cancelled_Carr <- hflights %>%
  filter(Cancelled == 1L) %>%
  group_by(Carrier) %>%
  summarise(cancelledflights = n()) %>%
  arrange(-cancelledflights) %>%
  head(5)

Cancelled_Carr
## # A tibble: 5 x 2
## Carrier cancelledflights
## <chr> <int>
## 1 ExpressJet 1132
## 2 Southwest 703
## 3 Continental 475
## 4 SkyWest 224
## 5 American_Eagle 135
# Bar chart of cancelled flights for the selected carriers, with bars ordered
# by increasing number of cancellations.
ggplot(
  Cancelled_Carr,
  aes(x = reorder(Carrier, cancelledflights), y = cancelledflights)
) +
  geom_col(fill = "coral1") +
  scale_y_continuous(
    limits = c(0, 1200),
    breaks = seq(0, 1200, by = 200),
    labels = scales::comma
  ) +
  labs(
    title = "The Highest Level of Cancelled Flights (Top Five)",
    y = "Number of Cancelled Flights"
  ) +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.text.x = element_text(
      size = 12, face = "bold", colour = "black", angle = 65, vjust = 0.6
    ),
    axis.text.y = element_text(size = 10, face = "bold", colour = "black"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 12, face = "bold", colour = "black"),
    panel.grid.major.x = element_blank(),
    title = element_text(size = 14, face = "bold", colour = "black")
  )
# Mean arrival delay per carrier, computed over late arrivals only
# (ArrDelay > 0), listed from the largest average delay to the smallest.
# filter() keeps only rows where the condition is TRUE, so flights with a
# missing ArrDelay are dropped by the same comparison.
Carrier_arrdelay <- hflights %>%
  filter(ArrDelay > 0) %>%
  group_by(Carrier) %>%
  summarise(avg = mean(ArrDelay)) %>%
  # rank 1 = smallest average delay.
  mutate(rank = rank(avg)) %>%
  arrange(desc(rank))

Carrier_arrdelay
## # A tibble: 15 x 3
## Carrier avg rank
## <chr> <dbl> <dbl>
## 1 JetBlue 45.5 15
## 2 Atlantic_Southeast 40.2 14
## 3 American_Eagle 38.8 13
## 4 United 32.5 12
## 5 Delta 32.1 11
## 6 American 28.5 10
## 7 AirTran 27.9 9
## 8 Southwest 25.3 8
## 9 ExpressJet 24.2 7
## 10 SkyWest 24.1 6
## 11 Alaska 22.9 5
## 12 Continental 22.1 4
## 13 US_Airways 20.7 3
## 14 Frontier 18.7 2
## 15 Mesa 18.7 1
# Bar chart of the average arrival delay per carrier, with bars ordered by
# increasing average delay.
ggplot(Carrier_arrdelay, aes(x = reorder(Carrier, avg), y = avg)) +
  geom_col(fill = "darkred") +
  scale_y_continuous(
    limits = c(0, 50),
    breaks = seq(0, 50, by = 5),
    labels = scales::comma
  ) +
  labs(
    title = "The Average of Arrival Delay for each Carrier",
    subtitle = "Minutes",
    y = "Average of Delay"
  ) +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(
      size = 10, face = "bold", colour = "black", angle = 65, vjust = 0.6
    ),
    axis.text.y = element_text(size = 10, face = "bold", colour = "black"),
    axis.title.y = element_text(
      size = 12, face = "bold", colour = "black", margin = margin(r = 20)
    ),
    panel.grid.major.x = element_blank(),
    title = element_text(size = 14, face = "bold", colour = "black"),
    plot.subtitle = element_text(size = 12, colour = "black")
  )
# Mean departure delay per carrier, computed over late departures only
# (DepDelay > 0), listed from the smallest average delay to the largest.
# filter() keeps only rows where the condition is TRUE, so flights with a
# missing DepDelay are dropped by the same comparison.
Carrier_depdelay <- hflights %>%
  filter(DepDelay > 0) %>%
  group_by(Carrier) %>%
  summarise(avg = mean(DepDelay)) %>%
  # rank 1 = smallest average delay.
  mutate(rank = rank(avg)) %>%
  arrange(rank)

Carrier_depdelay
## # A tibble: 15 x 3
## Carrier avg rank
## <chr> <dbl> <dbl>
## 1 Continental 17.9 1
## 2 Alaska 20.8 2
## 3 Southwest 21.9 3
## 4 Frontier 22.7 4
## 5 Mesa 24.5 5
## 6 SkyWest 24.6 6
## 7 American 24.7 7
## 8 US_Airways 26.5 8
## 9 ExpressJet 26.9 9
## 10 United 28.8 10
## 11 Delta 32.4 11
## 12 AirTran 33.4 12
## 13 American_Eagle 37.9 13
## 14 JetBlue 43.5 14
## 15 Atlantic_Southeast 49.3 15
# Bar chart of the average departure delay per carrier, with bars ordered by
# increasing average delay.
# The chart reads from Carrier_depdelay so the data matches the title; the
# previous version plotted Carrier_arrdelay and so repeated the arrival-delay
# figures under a departure-delay title.
ggplot(Carrier_depdelay, aes(x = reorder(Carrier, avg), y = avg)) +
  geom_bar(stat = "identity", fill = "darkslateblue") +
  scale_y_continuous(
    limits = c(0, 50),
    breaks = seq(0, 50, by = 5),
    labels = scales::comma
  ) +
  labs(
    title = "The Average of Departure Delay for each Carrier",
    subtitle = "Minutes",
    y = "Average of Delay"
  ) +
  theme_minimal() +
  # All tweaks go in one theme() call placed after the complete theme.
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(
      size = 10, face = "bold", colour = "black", angle = 65, vjust = 0.6
    ),
    axis.text.y = element_text(size = 10, face = "bold", colour = "black"),
    axis.title.y = element_text(
      size = 12, face = "bold", colour = "black", margin = margin(r = 20)
    ),
    panel.grid.major.x = element_blank(),
    title = element_text(size = 14, face = "bold", colour = "black"),
    plot.subtitle = element_text(size = 12, colour = "black")
  )