Data Description

Rows: 119391
Columns: 32

Load Libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(summarytools)

Load Dataset

hotel_data  <- read.csv("G:/semester_1/4_Statistics_R/syllabus/lab/week7/hotel_bookings.csv")
head(hotel_data, 10)

##           hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1  Resort Hotel           0       342              2015               July
## 2  Resort Hotel           0       737              2015               July
## 3  Resort Hotel           0         7              2015               July
## 4  Resort Hotel           0        13              2015               July
## 5  Resort Hotel           0        14              2015               July
## 6  Resort Hotel           0        14              2015               July
## 7  Resort Hotel           0         0              2015               July
## 8  Resort Hotel           0         9              2015               July
## 9  Resort Hotel           1        85              2015               July
## 10 Resort Hotel           1        75              2015               July
##    arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1                        27                         1                       0
## 2                        27                         1                       0
## 3                        27                         1                       0
## 4                        27                         1                       0
## 5                        27                         1                       0
## 6                        27                         1                       0
## 7                        27                         1                       0
## 8                        27                         1                       0
## 9                        27                         1                       0
## 10                       27                         1                       0
##    stays_in_week_nights adults children babies meal country market_segment
## 1                     0      2        0      0   BB     PRT         Direct
## 2                     0      2        0      0   BB     PRT         Direct
## 3                     1      1        0      0   BB     GBR         Direct
## 4                     1      1        0      0   BB     GBR      Corporate
## 5                     2      2        0      0   BB     GBR      Online TA
## 6                     2      2        0      0   BB     GBR      Online TA
## 7                     2      2        0      0   BB     PRT         Direct
## 8                     2      2        0      0   FB     PRT         Direct
## 9                     3      2        0      0   BB     PRT      Online TA
## 10                    3      2        0      0   HB     PRT  Offline TA/TO
##    distribution_channel is_repeated_guest previous_cancellations
## 1                Direct                 0                      0
## 2                Direct                 0                      0
## 3                Direct                 0                      0
## 4             Corporate                 0                      0
## 5                 TA/TO                 0                      0
## 6                 TA/TO                 0                      0
## 7                Direct                 0                      0
## 8                Direct                 0                      0
## 9                 TA/TO                 0                      0
## 10                TA/TO                 0                      0
##    previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1                               0                  C                  C
## 2                               0                  C                  C
## 3                               0                  A                  C
## 4                               0                  A                  A
## 5                               0                  A                  A
## 6                               0                  A                  A
## 7                               0                  C                  C
## 8                               0                  C                  C
## 9                               0                  A                  A
## 10                              0                  D                  D
##    booking_changes deposit_type agent company days_in_waiting_list
## 1                3   No Deposit  NULL    NULL                    0
## 2                4   No Deposit  NULL    NULL                    0
## 3                0   No Deposit  NULL    NULL                    0
## 4                0   No Deposit   304    NULL                    0
## 5                0   No Deposit   240    NULL                    0
## 6                0   No Deposit   240    NULL                    0
## 7                0   No Deposit  NULL    NULL                    0
## 8                0   No Deposit   303    NULL                    0
## 9                0   No Deposit   240    NULL                    0
## 10               0   No Deposit    15    NULL                    0
##    customer_type   adr required_car_parking_spaces total_of_special_requests
## 1      Transient   0.0                           0                         0
## 2      Transient   0.0                           0                         0
## 3      Transient  75.0                           0                         0
## 4      Transient  75.0                           0                         0
## 5      Transient  98.0                           0                         1
## 6      Transient  98.0                           0                         1
## 7      Transient 107.0                           0                         0
## 8      Transient 103.0                           0                         1
## 9      Transient  82.0                           0                         1
## 10     Transient 105.5                           0                         0
##    reservation_status reservation_status_date
## 1           Check-Out              2015-07-01
## 2           Check-Out              2015-07-01
## 3           Check-Out              2015-07-02
## 4           Check-Out              2015-07-02
## 5           Check-Out              2015-07-03
## 6           Check-Out              2015-07-03
## 7           Check-Out              2015-07-03
## 8           Check-Out              2015-07-03
## 9            Canceled              2015-05-06
## 10           Canceled              2015-04-22

Hypothesis 1: Lead Time Analysis

Null Hypothesis (H0):

The average lead time for canceled bookings is equal to the average lead time for non-canceled bookings.
Rationale: We are testing whether there is a significant difference in lead time between canceled and non-canceled bookings. This is important as it can help us understand if lead time plays a role in booking cancellations.

Alternative Hypothesis (H1):

The average lead time for canceled bookings is not equal to the average lead time for non-canceled bookings.
Alpha Level, Power Level, and Minimum Effect Size:
- Alpha level (α): 0.05
  - Common significance level in hypothesis testing.
- Power level: 0.80
  - Ensures a high probability of correctly rejecting the null hypothesis if it is false.
Minimum effect size: 10 days (for example)
- Represents a meaningful difference in lead time between the two groups.
Rationale: We chose these values to maintain a balance between Type I and Type II errors and to consider practical significance.
Assumptions and Preprocessing:
- No specific assumptions or preprocessing mentioned.

# Perform a t-test to compare the means of lead time for canceled and non-canceled bookings
lead_time_canceled <- hotel_data$lead_time[hotel_data$is_canceled == 1]
lead_time_non_canceled <- hotel_data$lead_time[hotel_data$is_canceled == 0]

t_test_result <- t.test(lead_time_canceled, lead_time_non_canceled)

# Display the t-test result
t_test_result

## 
##  Welch Two Sample t-test
## 
## data:  lead_time_canceled and lead_time_non_canceled
## t = 99.075, df = 74936, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  63.58092 66.14733
## sample estimates:
## mean of x mean of y 
## 144.84882  79.98469

# Define alpha level
alpha <- 0.05

# Calculate critical value for the Neyman-Pearson test
critical_value <- qnorm(1 - alpha) 

# Calculate the test statistic
test_statistic <- (mean(lead_time_canceled) - mean(lead_time_non_canceled)) / 
                  sqrt((var(lead_time_canceled) / length(lead_time_canceled)) + 
                       (var(lead_time_non_canceled) / length(lead_time_non_canceled)))

# Calculate the rejection boundary
rejection_boundary <- critical_value * sqrt((var(lead_time_canceled) / length(lead_time_canceled)) + 
                                            (var(lead_time_non_canceled) / length(lead_time_non_canceled)))

# Calculate power
power_of_test <- 1 - pnorm(test_statistic - rejection_boundary)

# Display the results
cat("Test Statistic:", test_statistic, "\n")

## Test Statistic: 99.07475

cat("Rejection Boundary:", rejection_boundary, "\n")

## Rejection Boundary: 1.076884

cat("Power of Test:", power_of_test, "\n")

## Power of Test: 0

Step 5: Interpret the Results Conclusion: Based on the Neyman-Pearson test, with a specified mean of 0 days for the alternative hypothesis, the test statistic is r test_statistic, and the rejection boundary is r rejection_boundary. The power of the test is r power_of_test.

The test statistic falls within the rejection region. Therefore, we reject the null hypothesis at the 0.05 significance level. The power of the test is 0.80, indicating a high probability of correctly rejecting the null hypothesis when it is false.

# Visualization 1: Bar chart comparing lead time for canceled and non-canceled bookings
lead_time_summary <- hotel_data %>%
  group_by(is_canceled) %>%
  summarize(mean_lead_time = mean(lead_time), sd_lead_time = sd(lead_time))

ggplot(lead_time_summary, aes(x = factor(is_canceled), y = mean_lead_time)) +
  geom_bar(stat = "identity", fill = "blue", alpha = 0.7) +
  geom_errorbar(aes(ymin = mean_lead_time - sd_lead_time, ymax = mean_lead_time + sd_lead_time),
                width = 0.2, position = position_dodge(0.9)) +
  labs(x = "Booking Status", y = "Mean Lead Time") +
  theme_minimal()

Explanation:
- The visualization above shows a bar chart comparing the lead time for canceled and non-canceled bookings. It is evident from the chart that the mean lead time for canceled bookings is notably higher than that for non-canceled bookings. This insight suggests that customers who cancel their bookings tend to book further in advance. The error bars represent the standard deviation, indicating the variability in lead time within each group.

Hypothesis 2: ADR Analysis by Market Segment

Null Hypothesis (H0):
- The average daily rate (ADR) for bookings from different market segments is equal.
Alternative Hypothesis (H1):
- The average daily rate (ADR) for bookings from at least one market segment is different.
Alpha Level, Power Level, and Minimum Effect Size:
- Alpha level (α): 0.05
- Power level: 0.80
- Minimum effect size: 10 units
Assumptions and Preprocessing:
- No specific assumptions or preprocessing mentioned.

# Perform Fisher's style test for ADR Analysis by Market Segment
adr_anova_result <- aov(adr ~ market_segment, data = hotel_data)

# Extracting the p-value
p_value <- summary(adr_anova_result)[[1]][["Pr(>F)"]]

# Interpret the p-value
if (any(p_value < alpha)) {
  cat("At least one market segment has a p-value less than the alpha level of", alpha, 
      "which indicates that at least one market segment has a different average daily rate (ADR).\n")
} else {
  cat("All market segments have p-values greater than the alpha level of", alpha, 
      "which suggests that there is not enough evidence to conclude a significant difference in ADR among market segments.\n")
}

## At least one market segment has a p-value less than the alpha level of 0.05 which indicates that at least one market segment has a different average daily rate (ADR).

# Visualization 2: Box plot of ADR by Market Segment
library(ggplot2)

ggplot(hotel_data, aes(x = market_segment, y = adr)) +
  geom_boxplot(fill = "lightblue", alpha = 0.7) +
  labs(x = "Market Segment", y = "ADR") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

- Explanation: - The box plot above illustrates the distribution of ADR (Average Daily Rate) across different market segments. Each box represents a market segment, and the notches in the boxes show the confidence intervals around the median ADR. The box plot helps visualize the spread and distribution of ADR values among market segments. It is evident that some market segments have notably different ADR distributions, while others overlap, indicating potential differences in ADR among market segments.

# Perform Fisher's exact test for ADR Analysis by Market Segment
alpha <- 0.05
specified_mean <- 0

# Create a contingency table
contingency_table <- table(hotel_data$market_segment, hotel_data$adr > specified_mean)

# Performing the Fisher's exact test
fisher_test_result <- fisher.test(contingency_table, simulate.p.value = TRUE)

# Extract the p-value
p_value_fisher <- fisher_test_result$p.value

# Interpret the p-value
if (p_value_fisher < alpha) {
  cat("The Fisher's exact test indicates a significant association between ADR and Market Segment (p-value < 0.05).\n")
} else {
  cat("The Fisher's exact test does not provide enough evidence to conclude a significant association between ADR and Market Segment (p-value >= 0.05).\n")
}

## The Fisher's exact test indicates a significant association between ADR and Market Segment (p-value < 0.05).

# Display the results of the Fisher's exact test
fisher_test_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  contingency_table
## p-value = 0.0004998
## alternative hypothesis: two.sided

# Calculate sample sizes
n_canceled <- length(lead_time_canceled)
n_non_canceled <- length(lead_time_non_canceled)

# Check if sample sizes are large enough
if (n_canceled >= 30 && n_non_canceled >= 30) {
  cat("Sample sizes are large enough for the Neyman-Pearson test.\n")
  
  # Perform Neyman-Pearson test (assuming specified_mean = 0)
  alpha <- 0.05
  specified_mean <- 0
  test_statistic <- (mean(lead_time_canceled) - specified_mean) / (sd(lead_time_canceled) / sqrt(n_canceled))
  rejection_boundary <- qnorm(1 - alpha)
  power_of_test <- 1 - pnorm(test_statistic - rejection_boundary)
  
  # Interpret the results
  if (abs(test_statistic) > rejection_boundary) {
    cat("The test statistic falls within the rejection region. Therefore, we reject the null hypothesis at the", 
        alpha, "significance level.\n")
    cat("The power of the test is", power_of_test, "indicating a high probability of correctly rejecting the null hypothesis when it is false.\n")
  } else {
    cat("The test statistic does not fall within the rejection region. Therefore, we fail to reject the null hypothesis at the", 
        alpha, "significance level.\n")
  }
} else {
  cat("Sample sizes are not large enough for the Neyman-Pearson test. Consider increasing the sample sizes for more reliable results.\n")
}

## Sample sizes are large enough for the Neyman-Pearson test.
## The test statistic falls within the rejection region. Therefore, we reject the null hypothesis at the 0.05 significance level.
## The power of the test is 0 indicating a high probability of correctly rejecting the null hypothesis when it is false.

# Visualization 3: Assuming 'market_segment' is a categorical variable and 'adr' is a numeric variable
ggplot(hotel_data, aes(x = market_segment, y = adr, fill = market_segment)) +
  geom_bar(stat = "identity") +
  labs(title = "Grouped Bar Plot of ADR by Market Segment",
       x = "Market Segment",
       y = "ADR") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create a contingency table of the `meal` and `is_canceled` columns
contingency_table <- table(hotel_data$meal, hotel_data$is_canceled)

# Perform the chi-squared test
chi_square_result <- chisq.test(contingency_table)

# Checking if the chi-square test was successful
if (chi_square_result$p.value > 0) {
  # Extract observed and expected frequencies
  observed_freq <- chi_square_result$observed
  expected_freq <- chi_square_result$expected

  # Calculating the likelihood ratio test statistic
  likelihood_ratio_statistic <- 2 * sum(observed_freq * log(observed_freq / expected_freq))

  # Choosing a significance level (alpha)
  alpha <- 0.05

  # Determining the critical value from the chi-square distribution
  critical_value <- qchisq(1 - alpha, df = chi_square_result$parameter)

  # Comparing the test statistic to the critical value
  if (!is.na(likelihood_ratio_statistic) && !is.na(critical_value)) {
    if (likelihood_ratio_statistic > critical_value) {
      cat("Reject the null hypothesis at the", alpha, "significance level.\n")
    } else {
      cat("Fail to reject the null hypothesis at the", alpha, "significance level.\n")
    }
  } else {
    cat("Error: Unable to perform the likelihood ratio test.\n")
  }

  # Print the result of the chi-square test
  print(chi_square_result)
} else {
  cat("Error: Chi-square test unsuccessful. Check your data.\n")
}

## Reject the null hypothesis at the 0.05 significance level.
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 304.24, df = 4, p-value < 2.2e-16

# Create a contingency table of the `meal` and `is_canceled` columns
contingency_table <- table(hotel_data$meal, hotel_data$is_canceled)

# Perform Fisher's exact test
fisher_test_result <- fisher.test(contingency_table, simulate.p.value = TRUE)

# Extract the p-value
p_value_fisher <- fisher_test_result$p.value

# Define the significance level (alpha)
alpha <- 0.05

# Check if the p-value is less than alpha
if (p_value_fisher < alpha) {
  cat("Reject the null hypothesis at the", alpha, "significance level.\n")
} else {
  cat("Fail to reject the null hypothesis at the", alpha, "significance level.\n")
}

## Reject the null hypothesis at the 0.05 significance level.

# Print the result of Fisher's exact test
print(fisher_test_result)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  contingency_table
## p-value = 0.0004998
## alternative hypothesis: two.sided

# Create a contingency table of the `meal` and `is_canceled` columns
contingency_table <- table(hotel_data$meal, hotel_data$is_canceled)

# Convert the contingency table to a data frame for visualization
contingency_df <- as.data.frame.matrix(contingency_table)

# Rename the columns for clarity
colnames(contingency_df) <- c("Not Canceled", "Canceled")

# Create a stacked bar chart
ggplot(contingency_df, aes(x = row.names(contingency_df), fill = row.names(contingency_df))) +
  geom_bar(position = "stack") +
  labs(x = "Meal Type", y = "Count", fill = "Booking Status") +
  ggtitle("Stacked Bar Chart of Meal Type vs. Booking Status") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Visualization 4: Histograms of Lead Time for Canceled and Non-Canceled Bookings
library(ggplot2)

ggplot(hotel_data, aes(x = lead_time, fill = factor(is_canceled))) +
  geom_histogram(binwidth = 10, alpha = 0.7, position = "identity") +
  labs(x = "Lead Time", y = "Count") +
  facet_wrap(~ is_canceled, ncol = 2) +
  theme_minimal()

# Perform Fisher's style test for ADR Analysis by Market Segment
adr_anova_result <- aov(adr ~ market_segment, data = hotel_data)

# Extracting the p-value
p_value <- summary(adr_anova_result)[[1]][["Pr(>F)"]]

# Interpret the p-value
if (any(p_value < alpha)) {
  cat("At least one market segment has a p-value less than the alpha level of", alpha, 
      "which indicates that at least one market segment has a different average daily rate (ADR).\n")
} else {
  cat("All market segments have p-values greater than the alpha level of", alpha, 
      "which suggests that there is not enough evidence to conclude a significant difference in ADR among market segments.\n")
}

## At least one market segment has a p-value less than the alpha level of 0.05 which indicates that at least one market segment has a different average daily rate (ADR).

lead_time_summary <- hotel_data %>%
  group_by(is_canceled) %>%
  summarize(mean_lead_time = mean(lead_time), sd_lead_time = sd(lead_time))

ggplot(lead_time_summary, aes(x = factor(is_canceled), y = mean_lead_time)) +
  geom_bar(stat = "identity", fill = "blue", alpha = 0.7) +
  geom_errorbar(aes(ymin = mean_lead_time - sd_lead_time, ymax = mean_lead_time + sd_lead_time),
                width = 0.2, position = position_dodge(0.9)) +
  labs(x = "Booking Status", y = "Mean Lead Time") +
  theme_minimal()

print( "Explanation: This visualization consists of two histograms side by side, one for canceled bookings and another for non-canceled bookings. It helps illustrate the distribution of lead time for both groups, allowing you to compare their characteristics.")

## [1] "Explanation: This visualization consists of two histograms side by side, one for canceled bookings and another for non-canceled bookings. It helps illustrate the distribution of lead time for both groups, allowing you to compare their characteristics."

# Visualization 5: Box plot of ADR by Market Segment 

ggplot(hotel_data, aes(x = market_segment, y = adr, fill = market_segment)) +
  geom_boxplot(alpha = 0.7, notch = TRUE) +
  labs(title = "Box Plot of ADR by Market Segment",
       x = "Market Segment",
       y = "ADR") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

# Assuming 'market_segment' is a categorical variable and 'adr' is a numeric variable
ggplot(hotel_data, aes(x = market_segment, y = adr, fill = market_segment)) +
  geom_bar(stat = "identity") +
  labs(title = "Grouped Bar Plot of ADR by Market Segment",
       x = "Market Segment",
       y = "ADR") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print("Explanation: This visualization is an improved version of the box plot for ADR by market segment. It includes notches to indicate the confidence intervals around the median. It helps visualize the spread and distribution of ADR values across different market segments, making it easier to compare them.

You can add these code blocks to your R Markdown file, and they will create the missing visualizations as described. Please make sure to include appropriate explanations for each visualization in your document.")

## [1] "Explanation: This visualization is an improved version of the box plot for ADR by market segment. It includes notches to indicate the confidence intervals around the median. It helps visualize the spread and distribution of ADR values across different market segments, making it easier to compare them.\n\nYou can add these code blocks to your R Markdown file, and they will create the missing visualizations as described. Please make sure to include appropriate explanations for each visualization in your document."

Data Dive Week-07

dhruv raghav

2023-10-08