# Load the two raw FAA landing data sets.
FAA1 <- read.csv("FAA1.csv", header = TRUE)
FAA2 <- read.csv("FAA2.csv", header = TRUE)
# Discard rows 151-200 of FAA2.
# NOTE(review): presumably empty trailing rows in the file -- confirm.
FAA2 <- FAA2[-c(151:200), ]
# Full outer join on the columns listed below; rows that agree on every key
# collapse into one, rows found in only one file are kept (all = TRUE).
merge_keys <- c(
  "aircraft", "no_pasg", "speed_ground", "speed_air",
  "height", "pitch", "distance"
)
FAA <- merge(FAA1, FAA2, by = merge_keys, all = TRUE)
# Numeric aircraft indicator: 1 when aircraft == "airbus", 0 otherwise.
FAA$aircraft_num <- ifelse(FAA$aircraft == "airbus", 1, 0)
# NOTE: subset() treats an NA condition as FALSE, so every filter below also
# drops the rows in which the tested column is missing.
# keep only landings whose duration is above 40
FAA <- subset(FAA, duration > 40)
# remove speed_ground that's less than 30 or greater than 140
FAA <- subset(FAA, speed_ground >= 30 & speed_ground <= 140)
# remove speed_air that's less than 30 or greater than 140
# (the upper bound previously tested speed_ground by mistake, so it was
# never applied to speed_air)
FAA <- subset(FAA, speed_air >= 30 & speed_air <= 140)
# remove height that's less than 6 meters
FAA <- subset(FAA, height >= 6)
# remove distance that's over 6000 feet
FAA <- subset(FAA, distance < 6000)
# export the wrangled data to csv
write.csv(FAA, "FAA_wrangled.csv", row.names = FALSE)
# Create the binary response variables from the landing distance:
#   long.landing  = 1 when distance > 2500, else 0
#   risky.landing = 1 when distance > 3000, else 0
FAA$long.landing <- ifelse(FAA$distance > 2500, 1, 0)
FAA$risky.landing <- ifelse(FAA$distance > 3000, 1, 0)
# Discard the distance variable. It is dropped by name rather than by
# position so the step keeps working if the column order ever changes.
FAA <- FAA[, names(FAA) != "distance", drop = FALSE]
# Later statements in this script refer to columns such as long.landing
# without the FAA$ prefix, so the data frame is put on the search path.
# attach() exposes a copy: changes made to FAA afterwards are not visible
# through the attached names.
attach(FAA)
# Percentage of observations at each long.landing value, to one decimal
landing_share <- table(long.landing) / length(long.landing)
long_landing_counts <- round(landing_share * 100, 1)
# Slice labels of the form "<value>: <percent>%"
labels <- paste0(names(long_landing_counts), ": ", long_landing_counts, "%")
# Draw the pie chart; the colours follow the order of the table (0, then 1)
pie(
  x = long_landing_counts,
  labels = labels,
  col = c("lightblue", "salmon"),
  main = "Distribution of Long Landing"
)
library(dplyr)
# Predictors to screen; each one is fitted on its own against long.landing
predictors <- c(
  "aircraft_num", "no_pasg", "speed_ground", "speed_air",
  "height", "pitch", "duration"
)
# One single-row data frame per predictor. The rows are collected in a
# preallocated list and bound together once, instead of growing a data
# frame with rbind() on every iteration.
single_reg_rows <- vector("list", length(predictors))
for (i in seq_along(predictors)) {
  var <- predictors[i]
  # Build the formula long.landing ~ <var>
  formula <- reformulate(var, response = "long.landing")
  # Fit the single-predictor logistic regression
  model <- glm(formula, data = FAA, family = binomial)
  # Coefficient table: row 2 is the predictor, column 1 the estimate,
  # column 4 the p-value
  coef_table <- coef(summary(model))
  coef_value <- coef_table[2, 1]
  p_value <- coef_table[2, 4]
  odds_ratio <- round(exp(coef_value), 3)
  direction <- if (coef_value > 0) "Positive" else "Negative"
  single_reg_rows[[i]] <- data.frame(
    Variable = var,
    Coefficient = abs(coef_value), # absolute value, used for ranking
    Odds_Ratio = odds_ratio,
    Direction = direction,
    P_Value = p_value,
    stringsAsFactors = FALSE
  )
}
single_reg <- do.call(rbind, single_reg_rows)
# Rank the predictors by the absolute size of their coefficient
single_reg <- single_reg %>% arrange(desc(Coefficient))
# Print the results
print(single_reg)
By definition, every risky landing (distance > 3000) is also a long landing (distance > 2500), so I decided not to include the risky.landing variable in this regression analysis. Based on the regression summary statistics, we can see that speed_air and speed_ground have a significant impact on increasing the risk of a long landing. The p-value for aircraft is very close to the 0.05 threshold, but it still indicates that there may not be a significant impact.
# Base-graphics plots of the 0/1 long.landing indicator against each of the
# two speed measures, with the columns taken from FAA explicitly
plot(long.landing ~ speed_ground, data = FAA)
plot(long.landing ~ speed_air, data = FAA)
library(ggplot2)
# Jitter plot for speed_ground vs. long.landing.
# long.landing is converted to a factor so it is drawn as two discrete rows;
# width = 0.2 sets the horizontal jitter, the vertical jitter is left at the
# geom's default.
ggplot(FAA, aes(x = speed_ground, y = as.factor(long.landing))) +
  geom_jitter(width = 0.2, aes(color = as.factor(long.landing))) +
  labs(
    title = "Jitter Plot: Speed_Ground vs Long Landing",
    x = "Speed Ground", # stray ")" removed from the axis label
    y = "Long Landing (0 = No, 1 = Yes)"
  ) +
  scale_color_manual(values = c("blue", "red"), name = "Long Landing") +
  theme_minimal()
# Jittered scatter of speed_air (x) against the long.landing class (y),
# with points coloured by class: blue for the first level, red for the second
ggplot(
  data = FAA,
  mapping = aes(
    x = speed_air,
    y = as.factor(long.landing),
    color = as.factor(long.landing)
  )
) +
  geom_jitter(width = 0.2) +
  ggtitle("Jitter Plot: Speed_Air vs Long Landing") +
  xlab("Speed Air") +
  ylab("Long Landing (0 = No, 1 = Yes)") +
  scale_color_manual(name = "Long Landing", values = c("blue", "red")) +
  theme_minimal()
# Histogram with density line for speed_ground by long.landing.
# The histogram is drawn on the density scale so that it shares a y-axis
# with geom_density(); position = "identity" overlays the two groups instead
# of stacking them. after_stat(density) replaces the deprecated
# ..density.. notation.
ggplot(
  FAA,
  aes(
    x = speed_ground,
    fill = as.factor(long.landing),
    color = as.factor(long.landing)
  )
) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 5, position = "identity", alpha = 0.5
  ) +
  geom_density(alpha = 0.7) +
  labs(
    title = "Histogram and Density: Speed_Ground by Long Landing",
    x = "Speed Ground",
    y = "Density",
    fill = "Long Landing",
    color = "Long Landing"
  ) +
  scale_fill_manual(values = c("blue", "red")) +
  scale_color_manual(values = c("blue", "red")) +
  theme_minimal()
# Histogram with density line for speed_air by long.landing.
# The histogram is drawn on the density scale so that it shares a y-axis
# with geom_density(); position = "identity" overlays the two groups instead
# of stacking them. after_stat(density) replaces the deprecated
# ..density.. notation.
ggplot(
  FAA,
  aes(
    x = speed_air,
    fill = as.factor(long.landing),
    color = as.factor(long.landing)
  )
) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 5, position = "identity", alpha = 0.5
  ) +
  geom_density(alpha = 0.7) +
  labs(
    title = "Histogram and Density: Speed_Air by Long Landing",
    x = "Speed Air",
    y = "Density",
    fill = "Long Landing",
    color = "Long Landing"
  ) +
  scale_fill_manual(values = c("blue", "red")) +
  scale_color_manual(values = c("blue", "red")) +
  theme_minimal()
Based on the three types of plots above, we can see that as speed_ground and speed_air increase, the risk of a long landing increases as well, especially once the speed passes 100-105 MPH.
library(faraway)
# Logistic regression of long.landing on speed_air as the only predictor
full_model <- glm(long.landing ~ speed_air, family = binomial, data = FAA)
# Show the coefficient table, deviances and AIC of the fit
summary(full_model)
Call:
glm(formula = long.landing ~ speed_air, family = binomial, data = FAA)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -52.89273 8.21194 -6.441 1.19e-10 ***
speed_air 0.52234 0.08157 6.404 1.51e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 270.199 on 194 degrees of freedom
Residual deviance: 94.155 on 193 degrees of freedom
AIC: 98.155
Number of Fisher Scoring iterations: 7
# Exponentiate both fitted coefficients (intercept first, then speed_air)
odds <- exp(full_model$coefficients)
# One message per coefficient
paste("Oddes is ", odds, sep = " ")
[1] "Oddes is 1.06900649354914e-23" "Oddes is 1.68596207885757"
# Visualize the fitted model: observations plus the fitted probability curve
beta.full.model <- coef(full_model)
# Both variables are jittered so that overlapping points can be told apart
plot(
  jitter(long.landing, 0.1) ~ jitter(speed_air),
  data = FAA,
  pch = ".",
  xlab = "Speed Air",
  ylab = "Long Landing"
)
# ilogit() (from faraway) turns the linear predictor into a probability;
# element 1 of the coefficient vector is the intercept, element 2 the slope
curve(ilogit(beta.full.model[1] + beta.full.model[2] * x), add = TRUE)
The full model can be written as P(long.landing = 1) = exp(-52.89 + 0.52 * speed_air) / (1 + exp(-52.89 + 0.52 * speed_air)). For every 1-unit increase in speed_air, the log-odds of a long landing increase by 0.52234, so a higher speed_air is associated with a greater likelihood of a long landing. The odds ratio is 1.686: for every 1-unit increase in speed_air, the odds of a long landing are multiplied by approximately 1.686, a 68.6% increase in odds.