Data Loading

# Read the CSV file into `kaggle`
kaggle <- read_csv(file = "airlines_flights_data.csv")

# Preview the leading rows
head(x = kaggle, n = 6L)
## # A tibble: 6 × 12
##   index airline  flight  source_city departure_time stops arrival_time 
##   <dbl> <chr>    <chr>   <chr>       <chr>          <chr> <chr>        
## 1     0 SpiceJet SG-8709 Delhi       Evening        zero  Night        
## 2     1 SpiceJet SG-8157 Delhi       Early_Morning  zero  Morning      
## 3     2 AirAsia  I5-764  Delhi       Early_Morning  zero  Early_Morning
## 4     3 Vistara  UK-995  Delhi       Morning        zero  Afternoon    
## 5     4 Vistara  UK-963  Delhi       Morning        zero  Morning      
## 6     5 Vistara  UK-945  Delhi       Morning        zero  Afternoon    
## # ℹ 5 more variables: destination_city <chr>, class <chr>, duration <dbl>,
## #   days_left <dbl>, price <dbl>

Airlines in the Dataset

# Distinct airline names and a frequency table of the `airline` column
unique_airlines <- unique(kaggle[["airline"]])
airline_freq <- table(kaggle[["airline"]])

# Report the distinct names first, then the per-airline counts
cat("Unique Airlines:\n")
## Unique Airlines:
print(unique_airlines)
## [1] "SpiceJet"  "AirAsia"   "Vistara"   "GO_FIRST"  "Indigo"    "Air_India"
cat("\nFrequency of each airline:\n")
## 
## Frequency of each airline:
print(airline_freq)
## 
## Air_India   AirAsia  GO_FIRST    Indigo  SpiceJet   Vistara 
##     80892     16098     23173     43120      9011    127859

Number of Flights by Airline (Ascending Order)

# Horizontal bar chart of row counts per airline. The y axis is ordered by
# each airline's number of rows; fill colours are drawn from a repeating
# two-colour palette and the fill legend is hidden.
ggplot(
  kaggle,
  aes(y = reorder(airline, airline, FUN = length), fill = airline)
) +
  geom_bar() +
  scale_x_continuous(
    breaks = seq(0, max(table(kaggle$airline)), by = 20000),
    labels = scales::comma
  ) +
  guides(fill = "none") +
  scale_fill_manual(
    values = rep_len(
      c("lightblue", "lightgreen"),
      length(unique(kaggle$airline))
    )
  ) +
  labs(
    title = "Number of Flights by Airline",
    x = "Number of Flights",
    y = "Airline"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Departure and Arrival Time Analysis

Bar Graphs representing the Departure Time & Arrival Time

# Tally rows per departure-time label, largest count first, and store the
# tally as a data frame with a label column and a numeric count column
departure_counts <- sort(table(kaggle[["departure_time"]]), decreasing = TRUE)
departure_data <- data.frame(
  time = names(departure_counts),
  count = as.numeric(departure_counts)
)

# Same tally for the arrival-time label
arrival_counts <- sort(table(kaggle[["arrival_time"]]), decreasing = TRUE)
arrival_data <- data.frame(
  time = names(arrival_counts),
  count = as.numeric(arrival_counts)
)

# Departure-time bar chart; bars run from the largest count to the smallest
p1 <- ggplot(departure_data, aes(x = reorder(time, -count), y = count)) +
  geom_col(fill = "lightblue") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Number of Flights by Departure Time",
    x = "Departure Time",
    y = "Number of Flights"
  )

# Arrival-time bar chart, built the same way
p2 <- ggplot(arrival_data, aes(x = reorder(time, -count), y = count)) +
  geom_col(fill = "lightgreen") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Number of Flights by Arrival Time",
    x = "Arrival Time",
    y = "Number of Flights"
  )

# Show the two charts side by side
grid.arrange(p1, p2, ncol = 2)

Source City and Destination City Analysis

Bar Graphs representing the Source City & Destination City

# Tally rows per source city, largest count first, and store the tally as a
# data frame with a city column and a numeric count column
source_counts <- sort(table(kaggle[["source_city"]]), decreasing = TRUE)
source_data <- data.frame(
  place = names(source_counts),
  count = as.numeric(source_counts)
)

# Same tally for the destination city
destination_counts <- sort(
  table(kaggle[["destination_city"]]),
  decreasing = TRUE
)
destination_data <- data.frame(
  place = names(destination_counts),
  count = as.numeric(destination_counts)
)

# Horizontal bar chart of counts per source city; y levels are ordered by
# negated count
p1 <- ggplot(source_data, aes(x = count, y = reorder(place, -count))) +
  geom_col(fill = "lightblue") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Number of Flights by Source City",
    x = "Number of Flights",
    y = "Source City"
  )

# Horizontal bar chart of counts per destination city, built the same way
p2 <- ggplot(destination_data, aes(x = count, y = reorder(place, -count))) +
  geom_col(fill = "lightgreen") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Number of Flights by Destination City",
    x = "Number of Flights",
    y = "Destination City"
  )

# Show the two charts side by side
grid.arrange(p1, p2, ncol = 2)

Price Analysis by Airline and Class

Does price vary with airlines?

# Mean ticket price for every airline/class combination.
# `.groups = "drop"` returns an ungrouped tibble: without it summarise()
# keeps the result grouped by `airline` and prints a regrouping message.
# Both verbs are namespace-qualified so a masking package cannot swap in a
# different summarise().
price_summary <- kaggle %>%
  dplyr::group_by(airline, class) %>%
  dplyr::summarise(Price = mean(price), .groups = "drop")

# Dodged bars: one bar per class within each airline
ggplot(price_summary, aes(x = airline, y = Price, fill = class)) +
  geom_col(position = "dodge") +
  labs(
    title = "Mean Price by Airline and Class",
    x = "Airline",
    y = "Mean Price",
    fill = "Class"
  ) +
  scale_fill_manual(values = c("Economy" = "maroon", "Business" = "orange")) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Price Analysis by Time

Does ticket price change based on the departure time and arrival time?

# Display order for the time-of-day labels, used as factor levels
time_order <- c(
  "Early_Morning", "Morning", "Afternoon", "Evening", "Night", "Late_Night"
)

# Mean price per departure-time label, with the label stored as a factor
# whose levels follow `time_order`
departure_price_summary <- kaggle %>%
  group_by(departure_time) %>%
  summarise(mean_price = mean(price)) %>%
  mutate(departure_time = factor(departure_time, levels = time_order))

# One bar per departure-time label, filled by that label
ggplot(
  departure_price_summary,
  aes(x = departure_time, y = mean_price, fill = departure_time)
) +
  geom_col() +
  labs(
    title = "Mean Price by Departure Time",
    x = "Departure Time",
    y = "Mean Price"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Mean price per arrival-time label, with the label stored as a factor whose
# levels follow `time_order` (defined earlier in this script)
arrival_price_summary <- kaggle %>%
  group_by(arrival_time) %>%
  summarise(mean_price = mean(price)) %>%
  mutate(arrival_time = factor(arrival_time, levels = time_order))

# One bar per arrival-time label, filled by that label
ggplot(
  arrival_price_summary,
  aes(x = arrival_time, y = mean_price, fill = arrival_time)
) +
  geom_col() +
  labs(
    title = "Mean Price by Arrival Time",
    x = "Arrival Time",
    y = "Mean Price"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Line plots with confidence intervals faceted by time

# Convert both time-of-day columns to factors whose levels follow
# `time_order` (defined earlier in this script), so the x axis and the facets
# are laid out in that order. This overwrites the columns in `kaggle`.
# The previously declared `time_order2` vector was never referenced and listed
# the labels in a different order, so it has been dropped.
kaggle$arrival_time <- factor(kaggle$arrival_time, levels = time_order)
kaggle$departure_time <- factor(kaggle$departure_time, levels = time_order)

# Mean price per arrival time drawn as a line, with a mean_cl_normal band as
# a ribbon, in one facet per departure time.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# confirm the installed version before switching.
ggplot(kaggle, aes(x = arrival_time, y = price)) +
  stat_summary(
    fun = mean, geom = "line", aes(group = 1),
    size = 1, color = "blue"
  ) +
  stat_summary(
    fun.data = mean_cl_normal, geom = "ribbon", aes(group = 1),
    alpha = 0.3, fill = "lightblue"
  ) +
  facet_wrap(~departure_time, nrow = 1) +
  labs(
    title = "Price by Arrival Time, Faceted by Departure Time",
    x = "Arrival Time",
    y = "Price"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Price Analysis by Source and Destination Cities

How does the price change with the source and destination cities?

# Mean price per source city
price_by_source_city <- kaggle %>%
  dplyr::group_by(source_city) %>%
  summarise(Price = mean(price))

# Mean price per destination city
price_by_dest_city <- kaggle %>%
  dplyr::group_by(destination_city) %>%
  summarise(Price = mean(price))

# Mean price per destination city drawn as a line, with a mean_cl_normal band
# as a ribbon, in one facet per source city. `group = 1` is set once at the
# plot level and inherited by both layers.
ggplot(kaggle, aes(x = destination_city, y = price, group = 1)) +
  stat_summary(fun = mean, geom = "line", size = 1, color = "blue") +
  stat_summary(
    fun.data = mean_cl_normal, geom = "ribbon",
    alpha = 0.3, fill = "lightblue"
  ) +
  facet_wrap(~source_city, nrow = 1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Price by Destination City, Faceted by Source City",
    x = "Destination City",
    y = "Price"
  )

Price Analysis by Days Left

How is the price affected when tickets are bought just 1 or 2 days before departure?

# Mean price for each value of `days_left`
price_by_days_left <- kaggle %>%
  dplyr::group_by(days_left) %>%
  summarise(Price = mean(price))

# Mean price per `days_left` value drawn as a line, with a mean_cl_normal band
# as a ribbon. `group = 1` is set once at the plot level and inherited by
# both layers.
ggplot(kaggle, aes(x = days_left, y = price, group = 1)) +
  stat_summary(fun = mean, geom = "line", size = 1, color = "blue") +
  stat_summary(
    fun.data = mean_cl_normal, geom = "ribbon",
    alpha = 0.5, fill = "lightblue"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Price by Days Left",
    x = "Days Left",
    y = "Price"
  )

Class Comparison Analysis

How does the ticket price vary between Economy and Business class?

# Split the data by travel class. The business subset is selected by an
# explicit match on "Business" rather than "anything that is not Economy",
# so an unexpected class value cannot silently inflate the business figures.
kaggle_business <- kaggle %>%
  filter(class == "Business")

kaggle_economy <- kaggle %>%
  filter(class == "Economy")

# One-row summaries holding the mean price of each subset in column `Price`
economy_mean <- kaggle_economy %>%
  summarise(Price = mean(price))

business_mean <- kaggle_business %>%
  summarise(Price = mean(price))

cat("Average Economy Class Price:", round(economy_mean[["Price"]], digits = 2), "\n")
## Average Economy Class Price: 6572.34
cat("Average Business Class Price:", round(business_mean[["Price"]], digits = 2), "\n")
## Average Business Class Price: 52540.08
cat(
  "Price Difference:",
  round(business_mean[["Price"]] - economy_mean[["Price"]], digits = 2),
  "\n"
)
## Price Difference: 45967.74

Specific Route Analysis

What is the average price of a Vistara Business Class flight from Delhi to Hyderabad?

# Keep only the rows that match all four criteria: airline, source city,
# destination city and class
new_data <- kaggle %>%
  filter(
    airline == "Vistara" &
      source_city == "Delhi" &
      destination_city == "Hyderabad" &
      class == "Business"
  )

# Report the mean price and the row count of that subset
cat(
  "Average Price for Vistara Business Class flight from Delhi to Hyderabad:",
  round(mean(new_data[["price"]]), digits = 2),
  "\n"
)
## Average Price for Vistara Business Class flight from Delhi to Hyderabad: 47939.84
cat("Number of flights in this category:", nrow(new_data), "\n")
## Number of flights in this category: 1660

Summary Statistics

# Headline figures: row count, distinct airlines and cities, mean price and
# the price extremes
cat("Dataset Summary:\n")
## Dataset Summary:
cat("Total number of flights:", nrow(kaggle), "\n")
## Total number of flights: 300153
cat("Number of airlines:", length(unique(kaggle[["airline"]])), "\n")
## Number of airlines: 6
cat("Number of source cities:", length(unique(kaggle[["source_city"]])), "\n")
## Number of source cities: 6
cat("Number of destination cities:", length(unique(kaggle[["destination_city"]])), "\n")
## Number of destination cities: 6
cat("Average ticket price:", round(mean(kaggle[["price"]]), digits = 2), "\n")
## Average ticket price: 20889.66
cat(
  "Price range:",
  round(min(kaggle[["price"]]), digits = 2),
  "to",
  round(max(kaggle[["price"]]), digits = 2),
  "\n"
)
## Price range: 1105 to 123071