Data Preparation

Loading and Cleaning Data

# Load necessary libraries
library(dplyr)

# Read the raw flight records from disk
data <- read.csv("D:/1_BU/Fundamental of ML/Term Project/flight_data.csv")

# Give the duration column a more descriptive name (no-op if it is absent)
names(data)[names(data) == "duration"] <- "flight_duration"

# Keep complete rows only
data <- na.omit(data)

# Tukey fences on price: anything further than 1.5 * IQR beyond the
# quartiles is treated as an outlier and dropped
Q1 <- quantile(data$price, probs = 0.25)
Q3 <- quantile(data$price, probs = 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
data <- subset(data, lower_bound <= price & price <= upper_bound)

# Discard the columns that are not used in the analysis below
data <- select(data, -c(flight, source_city, destination_city))

# Show which columns remain
colnames(data)
## [1] "filghtId"        "airline"         "departure_time"  "stops"          
## [5] "arrival_time"    "flight_duration" "days_left"       "price"

1. Duration and Price Relation

Scatter Plot and Correlation

This section explores the relationship between flight duration and price using a scatter plot and correlation analysis.

# Visualise how ticket price varies with flight duration
plot(
  x = data$flight_duration,
  y = data$price,
  pch = 16,
  col = "blue",
  xlab = "Flight Duration",
  ylab = "Price",
  main = "Scatter Plot of Price vs. Flight Duration"
)

# Correlation between duration and price (cor() default method)
cor_coefficient <- cor(data$flight_duration, data$price)

# Significance test for that correlation; the arguments stay in `data$...`
# form so the "data:" line of the printed result is unchanged
cor_test_result <- cor.test(data$flight_duration, data$price)
print(cor_test_result)
## 
##  Pearson's product-moment correlation
## 
## data:  data$flight_duration and data$price
## t = 11.49, df = 3250, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1643263 0.2303892
## sample estimates:
##       cor 
## 0.1975821

Linear Regression

# Linear regression model: price as a linear function of flight duration
linear_model <- lm(price ~ flight_duration, data = data)

# coef() returns both coefficients (intercept first, then the slope), so each
# one is picked out individually. Passing the whole vector to paste() would
# yield one string per coefficient instead of a single equation.
intercept <- coef(linear_model)[1]
slope <- coef(linear_model)["flight_duration"]
print(paste("Equation of the line: Price =", round(intercept, 2), "+", round(slope, 2), " * Duration"))
## [1] "Equation of the line: Price = 47778.16 + 347.59  * Duration"
# Checking assumptions
# Draw the two residual diagnostics side by side, then restore the previous
# graphics settings so later plots are not squeezed into the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))
plot(fitted(linear_model), resid(linear_model), main = "Residuals vs Fitted")
hist(resid(linear_model), main = "Histogram of Residuals")
par(old_par)

Because the p-value for duration is below 0.05, we can reject the null hypothesis and infer that the duration of the flight has a significant linear relationship with the price, although the correlation is weak (r ≈ 0.20). On average, the price increases by 347.59 INR for each additional unit of flight duration.

2. Stops vs. Price

# Price distribution for each stop category, one coloured box per category
boxplot(
  price ~ stops,
  data = data,
  col = c("blue", "green", "coral"),
  xlab = "Stops",
  ylab = "Price",
  main = "Boxplot of Stops vs. Price"
)

ANOVA TEST

# Treat stops as a categorical predictor
data$stops <- as.factor(data$stops)

# One-way ANOVA of price on stops. The formula uses bare column names so the
# `data` argument is actually what supplies the variables.
m <- aov(price ~ stops, data = data)
summary(m)
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## stops          2 1.413e+11 7.064e+10   713.1 <2e-16 ***
## Residuals   3249 3.219e+11 9.907e+07                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise t-tests for Stops vs. Price. The argument name is spelled out in
# full rather than relying on partial matching.
# NOTE(review): no multiplicity correction is applied to the three
# comparisons; consider p.adjust.method = "holm".
pairwise.t.test(data$price, data$stops, p.adjust.method = "none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  data$price and data$stops 
## 
##             one    two_or_more
## two_or_more <2e-16 -          
## zero        <2e-16 <2e-16     
## 
## P value adjustment method: none

At the 5% significance level, the ANOVA and the pairwise t-tests show significant differences in mean price among flights with different numbers of stops. So, the number of stops does influence the price of the flight.

Mean Prices by Stops

## # A tibble: 3 × 2
##   stops       mean_price
##   <fct>            <dbl>
## 1 one             54232.
## 2 two_or_more     68757.
## 3 zero            29658.

3. Effect Of Airline and Days Left on Price

# Price distribution for each airline, one coloured box per airline
boxplot(
  price ~ airline,
  data = data,
  col = c("blue", "green"),
  xlab = "Airline",
  ylab = "Price",
  main = "Boxplot of Airline vs. Price"
)

# Treat airline as a categorical predictor
data$airline <- as.factor(data$airline)

# One-way ANOVA of price on airline. The formula uses bare column names so the
# `data` argument is actually what supplies the variables.
m <- aov(price ~ airline, data = data)
summary(m)
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## airline        1 4.769e+10 4.769e+10     373 <2e-16 ***
## Residuals   3250 4.155e+11 1.278e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANCOVA: airline effect on price, adjusting for days_left as a covariate.
# The model is fitted through `data =` instead of `data$...` terms so that it
# stays tied to the data frame.
library(car)
Anova(lm(price ~ airline + days_left, data = data), type = 3)
## Anova Table (Type III tests)
## 
## Response: price
##                 Sum Sq   Df  F value    Pr(>F)    
## (Intercept) 1.2587e+12    1 9918.083 < 2.2e-16 ***
## airline     4.8450e+10    1  381.771 < 2.2e-16 ***
## days_left   3.1476e+09    1   24.802 6.684e-07 ***
## Residuals   4.1233e+11 3249                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Airline has a significant impact on flight prices, and this effect persists even when considering the influence of ‘days_left’. Both the choice of airline and the number of days left until departure contribute to the variability in flight prices.

4. Effect Of Departure and Arrival Time on Price

# Boxplot of Departure Time vs Price
# departure_time has six levels (its ANOVA term has Df = 5), so six colours
# are supplied; with only five, the palette is recycled and the last box
# repeats the first colour.
boxplot(price ~ departure_time, data = data,
        main = "Boxplot of Departure Time vs. Price",
        xlab = "Departure Time",
        ylab = "Price",
        col = c("blue", "green", "red", "lightblue", "lightgreen", "orange"))

library(car)
# Treat both departure_time and arrival_time as categorical predictors
data$departure_time <- as.factor(data$departure_time)
data$arrival_time <- as.factor(data$arrival_time)

# Two-way ANOVA for Departure Time and Arrival Time vs Price.
# Both terms are bare column names resolved through `data =`; mixing in a
# `data$arrival_time` term would bypass the `data` argument for that term.
# NOTE(review): this is a main-effects model with no
# departure_time:arrival_time interaction term.
two_way_model <- lm(price ~ departure_time + arrival_time, data = data)
Anova(two_way_model, type = 3)
## Anova Table (Type III tests)
## 
## Response: price
##                    Sum Sq   Df   F value    Pr(>F)    
## (Intercept)    4.8653e+11    1 3494.1006 < 2.2e-16 ***
## departure_time 1.5688e+09    5    2.2534    0.0466 *  
## arrival_time   1.0634e+10    5   15.2746 7.239e-15 ***
## Residuals      4.5129e+11 3241                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interaction plot for Departure and Arrival Time: mean price per arrival
# slot, one trace per departure slot. Explicit labels replace the default
# `data$...` expressions on the axes and legend.
interaction.plot(data$arrival_time, data$departure_time, data$price,
                 xlab = "Arrival Time",
                 ylab = "Mean Price",
                 trace.label = "Departure Time")

# Stratifying data by arrival time and performing individual ANOVA
arrival_levels <- levels(data$arrival_time)

#' Print a one-way ANOVA of price on departure_time for one arrival-time level
#'
#' @param level A single value of `arrival_time` selecting the stratum.
#' @param df Data frame with `arrival_time`, `departure_time` and `price`
#'   columns. Defaults to the script-level `data`.
#' @return The `summary()` of the fitted `aov` model, invisibly, so calling the
#'   function for its printed output does not echo anything extra.
perform_anova_summary <- function(level, df = data) {
  subset_data <- subset(df, arrival_time == level)
  anova_summary <- summary(aov(price ~ departure_time, data = subset_data))
  print(paste("Summary for ANOVA -", level))
  print(anova_summary)
  print("----")
  invisible(anova_summary)
}

# The function is called only for its printed output, so a plain loop is used;
# lapply() at top level would also echo the list of return values.
for (arrival_level in arrival_levels) {
  perform_anova_summary(arrival_level)
}
## [1] "Summary for ANOVA - Afternoon"
##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## departure_time   4 1.947e+10 4.867e+09   47.94 <2e-16 ***
## Residuals      425 4.314e+10 1.015e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Early_Morning"
##                 Df    Sum Sq   Mean Sq F value  Pr(>F)   
## departure_time   4 1.991e+09 497788126   4.623 0.00164 **
## Residuals      123 1.325e+10 107686188                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Evening"
##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## departure_time   4 1.129e+10 2.821e+09   24.08 <2e-16 ***
## Residuals      873 1.023e+11 1.172e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Late_Night"
##                Df    Sum Sq   Mean Sq F value   Pr(>F)    
## departure_time  4 4.190e+09 1.048e+09   24.09 3.48e-12 ***
## Residuals      64 2.783e+09 4.349e+07                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Morning"
##                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## departure_time   5 2.582e+10 5.165e+09   46.59 <2e-16 ***
## Residuals      731 8.103e+10 1.108e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"
## [1] "Summary for ANOVA - Night"
##                  Df    Sum Sq   Mean Sq F value Pr(>F)    
## departure_time    4 2.295e+10 5.738e+09   46.27 <2e-16 ***
## Residuals      1005 1.246e+11 1.240e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "----"

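Departure time alone has at most a weak effect on the price of the flight: after adjusting for arrival time it is only marginally significant (p = 0.047), whereas arrival time is strongly significant. If the departure time is “Early Morning”, the arrival time makes little difference to the price. In all other cases, the departure and arrival time combined affect the price of the flight.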