What happened on the United airline a few days ago is just tragic. The google trend of United airline has reached an all time high.
After reading this blog, I am inspired to do the visualization myself.
The data is from United States Department of Transportation.
There are two methods of getting the data: scrape it from the website or download the data.
Let us download and import the data
#load the packages and data
library(tidyverse)
library(ggplot2)
library(dplyr)
library(stringr)
airline=read.csv("table_01_64_2.csv",header=TRUE)
Then, some simple data cleaning
airline=airline%>%select(starts_with("X."))
airline=airline[1:6,]
row.names(airline)=c("Year","Boarded","Denied_boarding_tot","Voluntary_denied","Involuntary_denied","Denied_boarding_percent")
airline=t(airline)
airline=as.data.frame(airline)
airline%>%head
Notice the problem with commas in the numeric value bigger than 1000. That will need to be fixed.
airline=airline%>%
mutate_each(funs(as.character(.)), Boarded:Voluntary_denied) %>%
mutate_each(funs(gsub(",", "", .)),Boarded:Voluntary_denied) %>%
mutate_each(funs(as.numeric(.)), Boarded:Voluntary_denied)
airline=airline%>%select(Year:Involuntary_denied)
airline$Involuntary_denied=airline$Involuntary_denied%>%as.numeric()
glimpse(airline)
Observations: 25
Variables: 5
$ Year <fctr> 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, ...
$ Boarded <dbl> 429190, 445271, 449184, 457286, 460277, 480555, 502960, 514170, 523081, 543344, 477970, 467205, 485797, 52230...
$ Denied_boarding_tot <dbl> 646, 764, 683, 824, 842, 957, 1071, 1136, 1070, 1120, 900, 837, 769, 747, 597, 674, 685, 684, 719, 746, 626, ...
$ Voluntary_denied <dbl> 599, 718, 632, 771, 794, 899, 1018, 1091, 1024, 1062, 861, 803, 727, 702, 552, 619, 621, 620, 651, 681, 578, ...
$ Involuntary_denied <dbl> 6, 5, 9, 10, 8, 14, 11, 4, 5, 13, 2, 1, 3, 4, 4, 12, 16, 16, 18, 17, 7, 15, 11, 8, 5
summary(airline)
Year Boarded Denied_boarding_tot Voluntary_denied Involuntary_denied
(R) 2014: 1 Min. :429190 Min. : 467.0 Min. : 418 Min. : 1.00
1991 : 1 1st Qu.:477970 1st Qu.: 646.0 1st Qu.: 599 1st Qu.: 5.00
1992 : 1 Median :522308 Median : 746.0 Median : 681 Median : 8.00
1993 : 1 Mean :522232 Mean : 768.3 Mean : 717 Mean : 8.96
1994 : 1 3rd Qu.:567740 3rd Qu.: 842.0 3rd Qu.: 803 3rd Qu.:13.00
1995 : 1 Max. :613141 Max. :1136.0 Max. :1091 Max. :18.00
(Other) :19
Next,we need to create some more variables.
airline=airline%>%mutate(Voluntary_denied_percent=Voluntary_denied/Boarded,
Involuntary_denied_percent=Involuntary_denied/Boarded,
Denied_boarding_percent=Denied_boarding_tot/Boarded,
Involutary_in_denied=Involuntary_denied/Denied_boarding_tot)
airline$Year=airline$Year%>%as.character()
airline[airline$Year=="(R) 2014", 1]="2014"
airline$Year=airline$Year%>%as.numeric()
ggplot(data=airline)+geom_point(aes(x=Year,y=Involuntary_denied_percent*100),size=2.5,color="lightblue")+
geom_segment(aes(x=Year,y=0,xend=Year,yend=Involuntary_denied_percent*100),size=1.2,color="lightblue")+ggtitle("Involuntary denied boarding percentage from 1990-2015")+ylab("Involuntary denied boarding percentage (%) ")+xlab("Year")
ggplot(data=airline)+geom_point(aes(x=Year,y=Involutary_in_denied*100),color="#9ecae1",size=2.5)+
geom_segment(aes(x=Year,y=0,xend=Year,yend=Involutary_in_denied*100),color="#9ecae1",size=1.3)+ggtitle("Ratio of involuntary denied boarding rate to total denied boarding")+ylab("Involuntary to total denied boarding (%) ")+xlab("Year")
ggplot(data=airline)+geom_point(aes(x=Year,y=Denied_boarding_percent*100),color="#9ecae1",size=2.5)+
geom_segment(aes(x=Year,y=0,xend=Year,yend=Denied_boarding_percent*100),color="#9ecae1",size=1.3)+ggtitle("Total denied boarding percentage")+ylab("Total denied boarding percentage (%) ")+xlab("Year")
It dictates that the total denied boarding percentage is decreasing over the last 10 to 15 years. While at the same time, there is an rise in involuntary denied boarding percentage.
airline
airline%>%mutate(in_to_vo=Involuntary_denied/Voluntary_denied)%>%ggplot()+
geom_point(aes(x=Year,y=in_to_vo*100),color="#9ecae1",size=2.5)+
geom_segment(aes(x=Year,y=0,xend=Year,yend=in_to_vo*100),color="#9ecae1",size=1.3)+ggtitle("Ratio of involuntary to voluntary denied boarding over the years")+ylab("Total denied boarding percentage (%) ")+xlab("Year")
This graph confirms the ratio of involuntary to voluntary denied boarding has increased for the past 10-15 years.
When you are taking a flight in united states, your should know your right from this government website. Youc oculd also get compensated for delayed flights with details on this website.
library(rvest)
url="https://www.rita.dot.gov/bts/sites/rita.dot.gov.bts/files/publications/national_transportation_statistics/html/table_01_64.html"
airline2=read_html(url)%>%
html_nodes(css=" .cellright , .cellright b" )%>%
html_text()
Then, we would need some further cleaning on the data
airline2=airline2%>%str_replace_all("\n\t\t\t\t","")
airline2
[1] "420,696" "420,696" "429,190" "429,190" "445,271" "445,271" "449,184" "449,184" "457,286" "457,286" "460,277" "460,277"
[13] "480,555" "480,555" "502,960" "502,960" "514,170" "514,170" "523,081" "523,081" "543,344" "543,344" "477,970" "477,970"
[25] "467,205" "467,205" "485,797" "485,797" "522,308" "522,308" "516,553" "516,553" "552,445" "552,445" "567,740" "567,740"
[37] "576,476" "576,476" "548,041" "548,041" "595,253" "595,253" "591,825 " "591,825 " "600,774" "600,774" "599,405" "599,405"
[49] "535,551" "535,551" "613,141" "613,141" "628" "628" "646" "646" "764" "764" "683" "683"
[61] "824" "824" "842" "842" "957" "957" "1,071" "1,071" "1,136" "1,136" "1,070" "1,070"
[73] "1,120" "1,120" "900" "900" "837" "837" "769" "769" "747" "747" "597" "597"
[85] "674" "674" "685" "685" "684" "684" "719" "719" "746" "746" "626" "626"
[97] "598 " "598 " "494" "494" "467" "467" "552" "552" "561" "599" "718" "632"
[109] "771" "794" "899" "1,018" "1,091" "1,024" "1,062" "861" "803" "727" "702" "552"
[121] "619" "621" "620" "651" "681" "578" "539" "440" "418" "505" "67" "47"
[133] "46" "51" "53" "49" "58" "54" "45" "46" "57" "39" "34" "42"
[145] "45" "45" "55" "64" "64" "67" "65" "48" "59" "54" "49" "46"
[157] "0.15" "0.15" "0.17" "0.15" "0.18" "0.18" "0.20" "0.21" "0.22" "0.20" "0.21" "0.19"
[169] "0.18" "0.16" "0.14" "0.12" "0.12" "0.12" "0.12" "0.13" "0.13" "0.11" "0.10" "0.08"
[181] "0.09" "0.09"
index=seq(from=2,to=52,by=2)
Boarded=airline2[index]
index2=index+index[length(index)]
Denied_boarding_tot=airline2[index2]
index3=seq(from=1,to=26)+index2[length(index2)]
Voluntary_denied_boarding=airline2[index3]
index4=seq(from=1,to=26)+index3[length(index3)]
Involuntary_denied_boarding=airline2[index4]
Involuntary_denied_boarding=as.numeric(Involuntary_denied_boarding)
airline2=data.frame(Year=1990:2015,Boarded,Denied_boarding_tot,Voluntary_denied_boarding,Involuntary_denied_boarding)
airline2=airline2%>%
mutate_each(funs(as.character(.)), Boarded:Voluntary_denied_boarding) %>%
mutate_each(funs(gsub(",", "", .)),Boarded:Voluntary_denied_boarding) %>%
mutate_each(funs(as.numeric(.)), Boarded:Voluntary_denied_boarding)
summary(airline2)
Year Boarded Denied_boarding_tot Voluntary_denied_boarding Involuntary_denied_boarding
Min. :1990 Min. :420696 Min. : 467.0 Min. : 418.0 Min. :34.00
1st Qu.:1996 1st Qu.:469896 1st Qu.: 632.5 1st Qu.: 583.2 1st Qu.:46.00
Median :2002 Median :519431 Median : 732.5 Median : 666.0 Median :50.00
Mean :2002 Mean :518327 Mean : 762.9 Mean : 711.0 Mean :51.88
3rd Qu.:2009 3rd Qu.:563916 3rd Qu.: 840.8 3rd Qu.: 800.8 3rd Qu.:57.75
Max. :2015 Max. :613141 Max. :1136.0 Max. :1091.0 Max. :67.00
Now the data is cleaned, we could do what we do as before. But we could see that it is definitely easier to download the data than to scrape it from website.
This visualization is not so helpful in choosing an airline to fly with. Next, I will try to get data for different airlines.