library(RODBC)
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
I decided to create a table in mySQL in order to practice obtaining data from a local SQL server.
db_conn <- odbcConnect("LocalDSN", rows_at_time = 1)
sql <- "
SELECT TOP (1000) [Names]
,[Flight_Status]
,[Los_Angeles]
,[Pheonix]
,[San_Diego]
,[San_Francisco]
,[Seattle]
FROM [SQL_Tutorial].[dbo].[Homework5_607]"
Airplane <- sqlQuery(db_conn, sql, stringsAsFactors = FALSE)
#Closing Database
odbcClose(db_conn)
I have decided to turn my table obtained from SQL into a longer table in order to have the Cities as a column instead of each individual one being its own separate column. Then I decided to filter the data into each individual Airline Carrier to allow for more agency in data transformation and analysis.
head(Airplane)
## Names Flight_Status Los_Angeles Pheonix San_Diego San_Francisco Seattle
## 1 Alaska on_time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AMWEST on_time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
Long_Airplane <- Airplane %>% gather(Cities, Flights, Los_Angeles:Seattle)
#Subset of Alaska Airlines Data
Alaska <- Long_Airplane %>% filter(Names == 'Alaska')
#Subset of AmWest Airline Data
AMWEST <- Long_Airplane %>% filter(Names == 'AMWEST')
The function below is created in order to find the Delay rate of each of the cities for either filtered dataframe. The Idea is to create a function that will give me the Arrival Delay Rate for any amount of entries as long as they are in the order of Alternating On Time and Delayed for each city.
OT_rate <- function(x) {
# Create an empty vector to store the results
results <- c()
# Loop over the input vector, taking every 2nd value and dividing it by the previous one
for (i in seq(2, length(x), by=2)) {
result <- x[i] / x[i-1]
results <- c(results, result)
}
# Return the results vector
return(results)
}
Below is shown that from the function prior we are given a vector with 5 values, one for each city. However, since you can not mutate values back into a function with less values then the dataframe it self has, I have to increase the amount of values in the vector that was created. So a trick that can be done is make every value after our Arrival Delayed Rate to be a 0, which will allow us to filter them out for the final comparison when adding the separated dataframes back together.
#Alaska
On_time_rate_Alaska <- OT_rate(Alaska$Flights)
Sums <- rep(0, length(On_time_rate_Alaska)*2)
Sums[c(TRUE, FALSE)] <- On_time_rate_Alaska
A_Alaska<- Alaska %>% mutate(Alaska, Arrival_Delays_Rate = Sums)
Final_Alaska <- A_Alaska %>% filter(A_Alaska$Flight_Status =='on_time')
#AMWEST
On_time_rate_AMWEST <- OT_rate(AMWEST$Flights)
Sums1 <- rep(0, length(On_time_rate_AMWEST)*2)
Sums1[c(TRUE, FALSE)] <- On_time_rate_AMWEST
A_AMWEST<- AMWEST %>% mutate(AMWEST, Arrival_Delays_Rate = Sums1)
Final_AMWEST <- A_AMWEST %>% filter(A_AMWEST$Flight_Status =='on_time')
Final_1 <- bind_rows(Final_Alaska,Final_AMWEST)
Final <- Final_1[,-2]
print(Final)
## Names Cities Flights Arrival_Delays_Rate
## 1 Alaska Los_Angeles 497 0.12474849
## 2 Alaska Pheonix 221 0.05429864
## 3 Alaska San_Diego 212 0.09433962
## 4 Alaska San_Francisco 503 0.20278330
## 5 Alaska Seattle 1841 0.16567083
## 6 AMWEST Los_Angeles 694 0.16858790
## 7 AMWEST Pheonix 4840 0.08574380
## 8 AMWEST San_Diego 383 0.16971279
## 9 AMWEST San_Francisco 320 0.40312500
## 10 AMWEST Seattle 201 0.30348259
As shown below the continue trend of each city that both airline travel to, American West Airlines have a higher rate of a delayed arrival. With the highest rate being San Francisco and Seattle. Interesting correlation is that according to a simple Google search of the 2021 US Census Bureau, the population of Seattle and San Francisco are both lower than 1 million people, while the other Cities have 2-3 Million population.
ggplot(Final, aes(x = factor(Cities), y = Arrival_Delays_Rate, fill = Names, colour = Names)) +
geom_bar(stat = "identity", position = "dodge",alpha = 0.5, colour = "gray25") + labs(x = 'Cities', y = "Rate of Delayed Arrival", title = "Arrival Delayed Rate by Cities" )
According to this data, if you are going to any of these cities and your only option are Alaskan Airlines and American West Airlines. Take Alaskan Airline if you would like a lower chance at a Arrival Delayed Rate.