Initial setup: configure the session and load the data set file
into the variable hotel_data.
Data set - Hotels: This data comes
from an open hotel booking demand dataset by Antonio, Almeida and
Nunes.
# Show the source code of every chunk in the knitted report
knitr::opts_chunk$set(echo = TRUE)
# Load the 'dplyr' library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the 'ggplot2' library (no plotting call is visible in this part of the
# report; presumably used by chunks not shown here)
library(ggplot2)
# Load the data into hotel_data for further use.
# NOTE: file.choose() asks the user to pick the CSV file interactively, so
# this line needs an interactive session; the commented-out call below is a
# fixed-path alternative.
hotel_data <- read.csv(file.choose())
#hotel_data <- read.csv('C:/Users/amitg/Documents/workspaceR/data/hotels.csv')
The sections below define three pairs of variables:
1. Pair 1: Binary response (is_canceled) and a continuous variable (lead_time)
# Pair 1: keep only the cancellation flag and the booking lead time
pair_1 <- hotel_data[c("is_canceled", "lead_time")]
# Preview the first six rows of pair 1
head(pair_1, n = 6L)
## is_canceled lead_time
## 1 0 342
## 2 0 737
## 3 0 7
## 4 0 13
## 5 0 14
## 6 0 14
2. Pair 2: Continuous Variables
# Pair 2: weekend-night and week-night counts of each booking
pair_2 <- hotel_data[c("stays_in_weekend_nights", "stays_in_week_nights")]
# Preview the first six rows of pair 2
head(pair_2, n = 6L)
## stays_in_weekend_nights stays_in_week_nights
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 2
## 6 0 2
3. Pair 3: Ordered Categorical Variables
The room_type_difference column (in this section) is calculated as
the difference between the assigned and reserved room types. It is an
additional, derived variable built from ‘reserved_room_type’ and
‘assigned_room_type’; pair 3 consists of ‘reserved_room_type’,
‘assigned_room_type’ and ‘room_type_difference’.
#' Convert room type codes to numeric values
#'
#' Maps the room type letters "A" to "H" onto the numbers 1 to 8. The lookup
#' is vectorized, so it can be called on a whole column at once as well as on
#' a single value (e.g. from `sapply()`/`vapply()`).
#'
#' @param room_type Character vector (or factor) of room type codes. The
#'   comparison is exact: case and surrounding whitespace matter.
#' @return A numeric vector with the same length as `room_type`. Codes other
#'   than "A".."H", as well as missing values, yield `NA`.
room_type_numeric <- function(room_type) {
  known_room_types <- c("A", "B", "C", "D", "E", "F", "G", "H")
  # match() gives the position in known_room_types (NA when absent, including
  # for NA input, which would make an `if (room_type == ...)` chain fail).
  # as.numeric() keeps the double return type of the original 1..8 values.
  as.numeric(match(as.character(room_type), known_room_types))
}
# Apply the function to the reserved and assigned room type columns.
# vapply() enforces exactly one numeric value per booking (and returns an
# empty numeric vector, not a list, for empty data); USE.NAMES = FALSE keeps
# the room codes from being attached as element names of the new columns.
hotel_data$reserved_room_numeric <- vapply(
  hotel_data$reserved_room_type, room_type_numeric, numeric(1),
  USE.NAMES = FALSE
)
hotel_data$assigned_room_numeric <- vapply(
  hotel_data$assigned_room_type, room_type_numeric, numeric(1),
  USE.NAMES = FALSE
)
# Calculate room type difference (assigned minus reserved); NA when either
# room type is not one of the recognized codes
hotel_data$room_type_difference <-
  hotel_data$assigned_room_numeric - hotel_data$reserved_room_numeric
# Pair 3: Ordered Categorical Variables with Room Type Difference
pair_3 <- hotel_data[
  , c("reserved_room_type", "assigned_room_type", "room_type_difference")
]
# Display the first few rows of pair 3
head(pair_3)
## reserved_room_type assigned_room_type room_type_difference
## 1 C C 0
## 2 C C 0
## 3 A C 2
## 4 A A 0
## 5 A A 0
## 6 A A 0
# Display the last few rows of pair 3
tail(pair_3)
## reserved_room_type assigned_room_type room_type_difference
## 119385 A A 0
## 119386 A A 0
## 119387 E E 0
## 119388 D D 0
## 119389 A A 0
## 119390 A A 0
Ask:
Since most points fall along the diagonal line, the reserved and assigned room types match closely, indicating that guests generally receive the room type they booked.
In some cases, points are scattered away from the diagonal line, which suggests discrepancies between the reserved and assigned room types. This could indicate instances where guests are moved to a different room type (for example, an upgrade) or where there are issues with room availability.
This section calculates the correlation coefficient for each pair and
builds a confidence interval for the response variable.
1. Calculate the correlation coefficient for each pair of variables.
2. Construct a confidence interval for the response variable
(is_canceled).
# Correlation coefficients (cor() defaults) for the two numeric pairs
cor_pair1 <- with(hotel_data, cor(is_canceled, lead_time))
cor_pair2 <- with(
  hotel_data,
  cor(stays_in_weekend_nights, stays_in_week_nights)
)
#' Cramer's V: strength of association between two categorical variables
#'
#' Computes sqrt(chi^2 / (n * (min(rows, cols) - 1))) from the contingency
#' table of `x` and `y`, using the uncorrected Pearson chi-squared statistic.
#'
#' @param x,y Vectors (character or factor) of equal length.
#' @return A single unnamed numeric value between 0 (no association) and 1
#'   (perfect association). `NA_real_` is returned when the table is
#'   degenerate, i.e. there are no observations or one of the variables takes
#'   fewer than two distinct values.
cramers_v <- function(x, y) {
  confusion_matrix <- table(x, y)
  # Drop categories that never occur (e.g. unused factor levels): their zero
  # expected counts would turn the chi-squared statistic into NaN.
  confusion_matrix <- confusion_matrix[
    rowSums(confusion_matrix) > 0,
    colSums(confusion_matrix) > 0,
    drop = FALSE
  ]
  n <- sum(confusion_matrix)
  k <- min(dim(confusion_matrix)) - 1
  if (n == 0 || k < 1) {
    # V is undefined here; avoid dividing by zero
    return(NA_real_)
  }
  # correct = FALSE: Yates' continuity correction (the chisq.test() default
  # for 2x2 tables) would bias V downwards, e.g. below 1 for a perfect match.
  chi_sq <- chisq.test(confusion_matrix, correct = FALSE)$statistic
  # as.numeric() strips the "X-squared" name carried over from the statistic
  as.numeric(sqrt(chi_sq / (n * k)))
}
# Association between the reserved and the assigned room type (Cramer's V)
cramer_v_pair3 <- with(
  hotel_data,
  cramers_v(reserved_room_type, assigned_room_type)
)
## Warning in chisq.test(confusion_matrix): Chi-squared approximation may be
## incorrect
# Report the two correlation coefficients, then the Cramer's V statistic
print(
  paste("Correlation coefficient for Pair 1:", cor_pair1, sep = " ")
)
## [1] "Correlation coefficient for Pair 1: 0.293123355760716"
print(
  paste("Correlation coefficient for Pair 2:", cor_pair2, sep = " ")
)
## [1] "Correlation coefficient for Pair 2: 0.498968818495529"
print(
  paste("Cramer's V for Pair 3:", cramer_v_pair3, sep = " ")
)
## [1] "Cramer's V for Pair 3: 0.776358336690268"
# 95% confidence interval for the mean of the response variable is_canceled,
# built as mean -/+ z * standard error under a normal approximation
n <- nrow(hotel_data)
is_canceled_mean <- with(hotel_data, mean(is_canceled))
is_canceled_sd <- with(hotel_data, sd(is_canceled))
standard_error <- is_canceled_sd / sqrt(n)
# qnorm(0.975) is the two-sided 95% critical value of the standard normal
lower_bound <- is_canceled_mean - standard_error * qnorm(0.975)
upper_bound <- is_canceled_mean + standard_error * qnorm(0.975)
# Show the interval as "lower - upper"
print(
  paste(
    "95% Confidence Interval for is_canceled:",
    lower_bound, "-", upper_bound
  )
)
## [1] "95% Confidence Interval for is_canceled: 0.367676994663234 - 0.373155570878268"
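1. Correlation Coefficients: is_canceled and lead_time show a weak positive correlation (about 0.29), the weekend-night and week-night counts show a moderate positive correlation (about 0.50), and the reserved and assigned room types are strongly associated (Cramer's V of about 0.78).
Overall, these analyses allow us to better understand the relationships between variables and draw conclusions about cancellations and other relevant factors in the dataset. Further analysis and interpretation may be necessary based on the specific context and goals of the analysis.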
Thank You.!!!