# Load necessary libraries
library(dplyr)
# ---- Data import and cleaning ----
# Read the raw flight records from the project CSV.
# NOTE(review): absolute, machine-specific path; confirm it exists where this runs.
data <- read.csv("D:/1_BU/Fundamental of ML/Term Project/flight_data.csv")
# Rename the `duration` column (when present) to `flight_duration`.
names(data)[names(data) == "duration"] <- "flight_duration"

# Keep only the rows that have no missing value in any column.
data <- na.omit(data)

# Price outliers: keep rows whose price lies within 1.5 * IQR of the quartiles.
Q1 <- quantile(data$price, probs = 0.25)
Q3 <- quantile(data$price, probs = 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
data <- data[
  with(data, price >= lower_bound & price <= upper_bound), ,
  drop = FALSE
]

# Discard the columns that are not used in the analysis.
data <- select(data, -c(flight, source_city, destination_city))

# Show which columns remain.
names(data)
## [1] "filghtId" "airline" "departure_time" "stops"
## [5] "arrival_time" "flight_duration" "days_left" "price"
This section explores the relationship between flight duration and price using a scatter plot and correlation analysis.
# Price against flight duration, one solid blue point per flight.
plot(
  x = data$flight_duration,
  y = data$price,
  pch = 16,
  col = "blue",
  xlab = "Flight Duration",
  ylab = "Price",
  main = "Scatter Plot of Price vs. Flight Duration"
)

# Pearson correlation between duration and price (point estimate only).
cor_coefficient <- with(data, cor(flight_duration, price))

# Significance test and confidence interval for the same correlation. The
# `data$` spelling is kept because cor.test() echoes it in its "data:" line.
cor_test_result <- cor.test(data$flight_duration, data$price)
print(cor_test_result)
##
## Pearson's product-moment correlation
##
## data: data$flight_duration and data$price
## t = 11.49, df = 3250, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1643263 0.2303892
## sample estimates:
## cor
## 0.1975821
# Linear regression model: price as a linear function of flight duration.
linear_model <- lm(price ~ flight_duration, data = data)

# coef() returns the full coefficient vector (intercept first, then the
# flight_duration slope). Each term is selected by name so that `slope` is a
# single number; assigning the whole vector made paste() recycle and print the
# equation twice, once with the intercept in the slope position.
intercept <- coef(linear_model)["(Intercept)"]
slope <- coef(linear_model)["flight_duration"]
print(paste(
  "Equation of the line: Price =", round(intercept, 2),
  "+", round(slope, 2), " * Duration"
))
## [1] "Equation of the line: Price = 47778.16 + 347.59 * Duration"
# Checking assumptions
# Draw both diagnostics side by side. par() returns the previous settings, which
# are restored at the end so the 1 x 2 layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
# Residuals against fitted values of the linear model.
plot(fitted(linear_model), resid(linear_model), main = "Residuals vs Fitted")
# Distribution of the residuals.
hist(resid(linear_model), main = "Histogram of Residuals")
par(old_par)
The p-value for duration is less than 0.05, so we can reject the null hypothesis and infer that the duration of the flight has a significant linear relationship with the price. For each additional unit of flight duration, the price tends to increase by an average of 347.59 INR.
# Boxplot of Stops vs Price
boxplot(price ~ stops, data = data,
        main = "Boxplot of Stops vs. Price",
        xlab = "Stops",
        ylab = "Price",
        col = c("blue", "green", "coral"))
# Treat the number of stops as a categorical grouping variable.
data$stops <- as.factor(data$stops)
# One-way ANOVA of price across stop categories. Columns are named directly in
# the formula and resolved through `data = data`; writing `data$price` inside
# the formula bypasses the `data` argument and labels the term "data$stops".
m <- aov(price ~ stops, data = data)
summary(m)
## Df Sum Sq Mean Sq F value Pr(>F)
## stops 2 1.413e+11 7.064e+10 713.1 <2e-16 ***
## Residuals 3249 3.219e+11 9.907e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise t-test for Stops vs. Price
# Compares mean price between every pair of stop categories. The argument is
# spelled out in full as `p.adjust.method` rather than relying on partial
# matching of `p.adjust`.
# NOTE(review): "none" applies no multiple-comparison correction; confirm that
# this is intended rather than e.g. "holm" or "bonferroni".
pairwise.t.test(data$price, data$stops, p.adjust.method = "none")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: data$price and data$stops
##
## one two_or_more
## two_or_more <2e-16 -
## zero <2e-16 <2e-16
##
## P value adjustment method: none
At the 5% significance level, the mean prices differ significantly among flights with different numbers of stops, and every pairwise comparison is significant. So, the number of stops does influence the price of the flight.
## # A tibble: 3 × 2
## stops mean_price
## <fct> <dbl>
## 1 one 54232.
## 2 two_or_more 68757.
## 3 zero 29658.
# Boxplot of Airline vs Price
boxplot(price ~ airline, data = data,
        main = "Boxplot of Airline vs. Price",
        xlab = "Airline",
        ylab = "Price",
        col = c("blue", "green"))
# Treat airline as a categorical grouping variable.
data$airline <- as.factor(data$airline)
# ANOVA test for Airline vs Price
# Columns are named directly in the formula and resolved through `data = data`;
# writing `data$price` inside the formula bypasses the `data` argument and
# labels the term "data$airline".
m <- aov(price ~ airline, data = data)
summary(m)
## Df Sum Sq Mean Sq F value Pr(>F)
## airline 1 4.769e+10 4.769e+10 373 <2e-16 ***
## Residuals 3250 4.155e+11 1.278e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANCOVA (Airline and Days Left) test
library(car)
# Type III tests for price modelled on airline (factor) plus days_left
# (covariate). Columns are referenced by name with `data = data` instead of
# `data$...` inside the formula, so the terms carry plain column names.
Anova(lm(price ~ airline + days_left, data = data), type = 3)
## Anova Table (Type III tests)
##
## Response: price
## Sum Sq Df F value Pr(>F)
## (Intercept) 1.2587e+12 1 9918.083 < 2.2e-16 ***
## airline 4.8450e+10 1 381.771 < 2.2e-16 ***
## days_left 3.1476e+09 1 24.802 6.684e-07 ***
## Residuals 4.1233e+11 3249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Airline has a significant impact on flight prices, and this effect persists even when considering the influence of ‘days_left’. Both the choice of airline and the number of days left until departure contribute to the variability in flight prices.
# Boxplot of Departure Time vs Price
# Six colours are supplied, one per departure-time category (the ANOVA below
# reports 5 degrees of freedom, i.e. 6 levels); with only five colours the
# sixth box silently reused the first colour.
boxplot(price ~ departure_time, data = data,
        main = "Boxplot of Departure Time vs. Price",
        xlab = "Departure Time",
        ylab = "Price",
        col = c("blue", "green", "red", "lightblue", "lightgreen", "orange"))
library(car)
# Treat departure and arrival time as categorical grouping variables.
data$departure_time <- as.factor(data$departure_time)
data$arrival_time <- as.factor(data$arrival_time)
# Two-way ANOVA for Departure Time and Arrival Time vs Price
# Both terms are bare column names resolved through `data = data`; mixing a
# bare column with `data$arrival_time` labelled the terms inconsistently.
two_way_model <- lm(price ~ departure_time + arrival_time, data = data)
Anova(two_way_model, type = 3)
## Anova Table (Type III tests)
##
## Response: price
## Sum Sq Df F value Pr(>F)
## (Intercept) 4.8653e+11 1 3494.1006 < 2.2e-16 ***
## departure_time 1.5688e+09 5 2.2534 0.0466 *
## arrival_time 1.0634e+10 5 15.2746 7.239e-15 ***
## Residuals 4.5129e+11 3241
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mean price by arrival time (x axis), with one trace per departure time.
interaction.plot(
  x.factor = data$arrival_time,
  trace.factor = data$departure_time,
  response = data$price
)
# Arrival-time categories used to stratify the per-level ANOVAs.
arrival_levels <- levels(data[["arrival_time"]])
#' Print a one-way ANOVA of price on departure time for one arrival-time level
#'
#' Keeps the rows of `flights` whose `arrival_time` equals `level`, fits
#' `aov(price ~ departure_time)` on them, and prints a header line, the ANOVA
#' summary table and a "----" separator.
#'
#' @param level A single arrival-time value to filter on.
#' @param flights Data frame with `price`, `departure_time` and `arrival_time`
#'   columns. Defaults to the script-level `data`, so existing one-argument
#'   calls behave as before.
#' @return The separator string "----", invisibly (the value of the final
#'   `print()` call).
perform_anova_summary <- function(level, flights = data) {
  # Explicit indexing instead of subset(): `level` always refers to the
  # argument, never to a column of the same name. which() drops NA comparisons,
  # matching how subset() treats them.
  keep_rows <- which(flights$arrival_time == level)
  subset_data <- flights[keep_rows, , drop = FALSE]
  print(paste("Summary for ANOVA -", level))
  print(summary(aov(price ~ departure_time, data = subset_data)))
  print("----")
}
# Run the per-level ANOVA for every arrival-time category. The function is
# called for its printed output only, so the list returned by lapply() is
# wrapped in invisible() to keep it from being auto-printed after the summaries.
invisible(lapply(arrival_levels, perform_anova_summary))
## [1] "Summary for ANOVA - Afternoon"
## Df Sum Sq Mean Sq F value Pr(>F)
## departure_time 4 1.947e+10 4.867e+09 47.94 <2e-16 ***
## Residuals 425 4.314e+10 1.015e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Early_Morning"
## Df Sum Sq Mean Sq F value Pr(>F)
## departure_time 4 1.991e+09 497788126 4.623 0.00164 **
## Residuals 123 1.325e+10 107686188
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Evening"
## Df Sum Sq Mean Sq F value Pr(>F)
## departure_time 4 1.129e+10 2.821e+09 24.08 <2e-16 ***
## Residuals 873 1.023e+11 1.172e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Late_Night"
## Df Sum Sq Mean Sq F value Pr(>F)
## departure_time 4 4.190e+09 1.048e+09 24.09 3.48e-12 ***
## Residuals 64 2.783e+09 4.349e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Morning"
## Df Sum Sq Mean Sq F value Pr(>F)
## departure_time 5 2.582e+10 5.165e+09 46.59 <2e-16 ***
## Residuals 731 8.103e+10 1.108e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Night"
## Df Sum Sq Mean Sq F value Pr(>F)
## departure_time 4 2.295e+10 5.738e+09 46.27 <2e-16 ***
## Residuals 1005 1.246e+11 1.240e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
In the two-way ANOVA, departure time alone has only a marginal effect on the price of the flight (p = 0.047), whereas arrival time is highly significant. If the departure time is “Early Morning”, it does not matter what the arrival time is. In all other cases, the departure and arrival times combined affect the price of the flight.