The intention of this project is to explore United States Domestic Airlines through the lenses of market share, on-time performance, and twitter sentiment.

For this project the ten mainline domestic airlines in the United States were chosen for analysis. According the U.S. Department of Transportation these ten mainline domestic airlines all fall under the Group III rating meaning that they have over $1 billion in yearly operating revenue. While there are 18 total airlines in this Group III rating, eight were excluded from this analysis because they are freight airlines or are regional airlines that fly partnerships with the mainline domestic airlines.

Data for the market share and on-time performance sections of this project come from publicly available U.S. Department of Transportation airline data which can be found here. Every single domestic flight that is taken in the United States is recorded and made available to the public. While available to the public this data is very large and it is not easy to download.

The first step in downloading the data was to individually download it based on the individual year and month. With this manually intensive process and large datafiles a specific time frame was set from January 2018 to July 2022 (the most recent data available at the time of publication) which is close to five years. This timeframe was chosen because it includes data before the Covid-19 pandemic, during the Covid-19 pandemic, and the vaccine era of the Covid-19 pandemic.

After each of the individual files were downloaded it required manually filtering the top ten selected airlines using Excel. Once this process was completed the files were combined into one large dataset using R. This completed dataset can be found on Kaggle here.

2018 Cleaned Files
x2018_1 <- read_csv("on_time/2018-1.csv")
x2018_2 <- read_csv("on_time/2018-2.csv")
x2018_3 <- read_csv("on_time/2018-3.csv")
x2018_4 <- read_csv("on_time/2018-4.csv")
x2018_5 <- read_csv("on_time/2018-5.csv")
x2018_6 <- read_csv("on_time/2018-6.csv")
x2018_7 <- read_csv("on_time/2018-7.csv")
x2018_8 <- read_csv("on_time/2018-8.csv")
x2018_9 <- read_csv("on_time/2018-9.csv")
x2018_10 <- read_csv("on_time/2018-10.csv")
x2018_11 <- read_csv("on_time/2018-11.csv")
x2018_12 <- read_csv("on_time/2018-12.csv")

# 2019 Cleaned Files
x2019_1 <- read.csv("on_time/2019-1.csv")
x2019_2 <- read.csv("on_time/2019-2.csv")
x2019_3 <- read.csv("on_time/2019-3.csv")
x2019_4 <- read.csv("on_time/2019-4.csv")
x2019_5 <- read.csv("on_time/2019-5.csv")
x2019_6 <- read.csv("on_time/2019-6.csv")
x2019_7 <- read.csv("on_time/2019-7.csv")
x2019_8 <- read.csv("on_time/2019-8.csv")
x2019_9 <- read.csv("on_time/2019-9.csv")
x2019_10 <- read.csv("on_time/2019-10.csv")
x2019_11 <- read.csv("on_time/2019-11.csv")
x2019_12 <- read.csv("on_time/2019-12.csv")

# 2020 Cleaned Files
x2020_1 <- read.csv("on_time/2020-1.csv")
x2020_2 <- read.csv("on_time/2020-2.csv")
x2020_3 <- read.csv("on_time/2020-3.csv")
x2020_4 <- read.csv("on_time/2020-4.csv")
x2020_5 <- read.csv("on_time/2020-5.csv")
x2020_6 <- read.csv("on_time/2020-6.csv")
x2020_7 <- read.csv("on_time/2020-7.csv")
x2020_8 <- read.csv("on_time/2020-8.csv")
x2020_9 <- read.csv("on_time/2020-9.csv")
x2020_10 <- read.csv("on_time/2020-10.csv")
x2020_11 <- read.csv("on_time/2020-11.csv")
x2020_12 <- read.csv("on_time/2020-12.csv")

# 2021 Cleaned Files
x2021_1 <- read.csv("on_time/2021-1.csv")
x2021_2 <- read.csv("on_time/2021-2.csv")
x2021_3 <- read.csv("on_time/2021-3.csv")
x2021_4 <- read.csv("on_time/2021-4.csv")
x2021_5 <- read.csv("on_time/2021-5.csv")
x2021_6 <- read.csv("on_time/2021-6.csv")
x2021_7 <- read.csv("on_time/2021-7.csv")
x2021_8 <- read.csv("on_time/2021-8.csv")
x2021_9 <- read.csv("on_time/2021-9.csv")
x2021_10 <- read.csv("on_time/2021-10.csv")
x2021_11 <- read.csv("on_time/2021-11.csv")
x2021_12 <- read.csv("on_time/2021-12.csv")

# 2021 Cleaned Files
x2022_1 <- read.csv("on_time/2022-1.csv")
x2022_2 <- read.csv("on_time/2022-2.csv")
x2022_3 <- read.csv("on_time/2022-3.csv")
x2022_4 <- read.csv("on_time/2022-4.csv")
x2022_5 <- read.csv("on_time/2022-5.csv")
x2022_6 <- read.csv("on_time/2022-6.csv")
x2022_7 <- read.csv("on_time/2022-7.csv")


X2018_1 %>% 
  full_join(X2018_2) -> joined

# test:
joined %>% 
  count(MONTH, sort = TRUE)

# so you should be able to:
x2018_1 %>% 
  full_join(x2018_2) %>% 
  full_join(x2018_3) %>% 
  full_join(x2018_4) %>%
  full_join(x2018_5) %>% 
  full_join(x2018_6) %>% 
  full_join(x2018_7) %>% 
  full_join(x2018_8) %>% 
  full_join(x2018_9) %>% 
  full_join(x2018_10) %>% 
  full_join(x2018_11) %>% 
  full_join(x2018_12) -> x2018_joined 

x2019_1 %>% 
  full_join(x2019_2) %>% 
  full_join(x2019_3) %>% 
  full_join(x2019_4) %>%
  full_join(x2019_5) %>% 
  full_join(x2019_6) %>% 
  full_join(x2019_7) %>% 
  full_join(x2019_8) %>% 
  full_join(x2019_9) %>% 
  full_join(x2019_10) %>% 
  full_join(x2019_11) %>% 
  full_join(x2019_12) -> x2019_joined 

x2020_1 %>% 
  full_join(x2020_2) %>% 
  full_join(x2020_3) %>% 
  full_join(x2020_4) %>%
  full_join(x2020_5) %>% 
  full_join(x2020_6) %>% 
  full_join(x2020_7) %>% 
  full_join(x2020_8) %>% 
  full_join(x2020_9) %>% 
  full_join(x2020_10) %>% 
  full_join(x2020_11) %>% 
  full_join(x2020_12) -> x2020_joined 
  
x2021_1 %>% 
  full_join(x2021_2) %>% 
  full_join(x2021_3) %>% 
  full_join(x2021_4) %>%
  full_join(x2021_5) %>% 
  full_join(x2021_6) %>% 
  full_join(x2021_7) %>% 
  full_join(x2021_8) %>% 
  full_join(x2021_9) %>% 
  full_join(x2021_10) %>% 
  full_join(x2021_11) %>% 
  full_join(x2021_12) -> x2021_joined

x2022_1 %>% 
  full_join(x2022_2) %>% 
  full_join(x2022_3) %>% 
  full_join(x2022_4) %>%
  full_join(x2022_5) %>% 
  full_join(x2022_6) %>% 
  full_join(x2022_7) -> x2022_joined

x2018_joined %>%
  full_join(x2019_joined) %>%
  full_join(x2020_joined) %>%
  full_join(x2021_joined) %>%
  full_join(x2022_joined) -> airlines_joined_18to22

write.csv(airlines_joined_18to22,"/Users/ianmyers/Documents/R/AIRLINES/alljoined_airlines.csv")