# Install and load required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(summarytools)
# Load the hotel management data
hotel_data <- read.csv("G:/semester_1/4_Statistics_R/syllabus/lab/week3/hotel_bookings.csv")
# Display the first 10 rows of the dataset
head(hotel_data, 10)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## 7 Resort Hotel 0 0 2015 July
## 8 Resort Hotel 0 9 2015 July
## 9 Resort Hotel 1 85 2015 July
## 10 Resort Hotel 1 75 2015 July
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1 27 1 0
## 2 27 1 0
## 3 27 1 0
## 4 27 1 0
## 5 27 1 0
## 6 27 1 0
## 7 27 1 0
## 8 27 1 0
## 9 27 1 0
## 10 27 1 0
## stays_in_week_nights adults children babies meal country market_segment
## 1 0 2 0 0 BB PRT Direct
## 2 0 2 0 0 BB PRT Direct
## 3 1 1 0 0 BB GBR Direct
## 4 1 1 0 0 BB GBR Corporate
## 5 2 2 0 0 BB GBR Online TA
## 6 2 2 0 0 BB GBR Online TA
## 7 2 2 0 0 BB PRT Direct
## 8 2 2 0 0 FB PRT Direct
## 9 3 2 0 0 BB PRT Online TA
## 10 3 2 0 0 HB PRT Offline TA/TO
## distribution_channel is_repeated_guest previous_cancellations
## 1 Direct 0 0
## 2 Direct 0 0
## 3 Direct 0 0
## 4 Corporate 0 0
## 5 TA/TO 0 0
## 6 TA/TO 0 0
## 7 Direct 0 0
## 8 Direct 0 0
## 9 TA/TO 0 0
## 10 TA/TO 0 0
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1 0 C C
## 2 0 C C
## 3 0 A C
## 4 0 A A
## 5 0 A A
## 6 0 A A
## 7 0 C C
## 8 0 C C
## 9 0 A A
## 10 0 D D
## booking_changes deposit_type agent company days_in_waiting_list
## 1 3 No Deposit NULL NULL 0
## 2 4 No Deposit NULL NULL 0
## 3 0 No Deposit NULL NULL 0
## 4 0 No Deposit 304 NULL 0
## 5 0 No Deposit 240 NULL 0
## 6 0 No Deposit 240 NULL 0
## 7 0 No Deposit NULL NULL 0
## 8 0 No Deposit 303 NULL 0
## 9 0 No Deposit 240 NULL 0
## 10 0 No Deposit 15 NULL 0
## customer_type adr required_car_parking_spaces total_of_special_requests
## 1 Transient 0.0 0 0
## 2 Transient 0.0 0 0
## 3 Transient 75.0 0 0
## 4 Transient 75.0 0 0
## 5 Transient 98.0 0 1
## 6 Transient 98.0 0 1
## 7 Transient 107.0 0 0
## 8 Transient 103.0 0 1
## 9 Transient 82.0 0 1
## 10 Transient 105.5 0 0
## reservation_status reservation_status_date
## 1 Check-Out 2015-07-01
## 2 Check-Out 2015-07-01
## 3 Check-Out 2015-07-02
## 4 Check-Out 2015-07-02
## 5 Check-Out 2015-07-03
## 6 Check-Out 2015-07-03
## 7 Check-Out 2015-07-03
## 8 Check-Out 2015-07-03
## 9 Canceled 2015-05-06
## 10 Canceled 2015-04-22
The key objectives of this data dive are as follows:
1: Creating Subsamples: We will create multiple random subsamples from the hotel management data to simulate the process of collecting data from a population.
2: Scrutinizing Subsamples: We will examine these subsamples to understand how they differ from one another and identify potential anomalies.
3: Consistency Across Subsamples: We will explore whether there are any consistent patterns or aspects of the data that are present across all subsamples.
4: Implications for Future Analysis: We will consider how this investigation affects our ability to draw conclusions about the entire dataset.
# Set seed for reproducibility
set.seed(123)
# Define the number of rows in each subsample (50% of the data)
sample_size <- round(0.5 * nrow(hotel_data))
nrow(hotel_data)
## [1] 119390
# Define the columns you want to include
selected_columns <- c(
"hotel", "is_canceled", "lead_time", "arrival_date_year",
"stays_in_weekend_nights", "stays_in_week_nights"
)
# Create a list to store subsamples
subsample_list <- list()
# Create 5-10 random subsamples
num_subsamples <- sample(5:10, 1)
num_subsamples
## [1] 7
for (i in 1:num_subsamples) {
# Randomly sample rows with replacement
sample_indices <- sample(1:nrow(hotel_data), size = sample_size, replace = TRUE)
# Extract the selected columns for the subsample
subsample <- hotel_data[sample_indices, selected_columns]
# Store the subsample in a data frame
subsample_df <- data.frame(subsample)
subsample_df
# Assign a name to the data frame (e.g., df_1, df_2, etc.)
# Store the subsample data frame in the list
subsample_list[[i]] <- subsample_df
subsample_list
}
# Display the first few rows of one of the subsamples (e.g., df_1)
head(subsample_list[[1]])
## hotel is_canceled lead_time arrival_date_year
## 100146 City Hotel 0 153 2016
## 101675 City Hotel 0 83 2016
## 62708 City Hotel 1 559 2017
## 109942 City Hotel 0 1 2017
## 6746 Resort Hotel 0 230 2016
## 16128 Resort Hotel 0 61 2015
## stays_in_weekend_nights stays_in_week_nights
## 100146 2 5
## 101675 2 3
## 62708 0 2
## 109942 0 1
## 6746 2 7
## 16128 2 5
head(subsample)
## hotel is_canceled lead_time arrival_date_year
## 35619 Resort Hotel 0 44 2017
## 89497 City Hotel 0 14 2016
## 12417 Resort Hotel 1 160 2017
## 96204 City Hotel 0 9 2016
## 51678 City Hotel 1 158 2016
## 16476 Resort Hotel 0 35 2015
## stays_in_weekend_nights stays_in_week_nights
## 35619 0 1
## 89497 0 4
## 12417 1 4
## 96204 0 2
## 51678 0 2
## 16476 1 5
head(subsample[1])
## hotel
## 35619 Resort Hotel
## 89497 City Hotel
## 12417 Resort Hotel
## 96204 City Hotel
## 51678 City Hotel
## 16476 Resort Hotel
head(subsample[3])
## lead_time
## 35619 44
## 89497 14
## 12417 160
## 96204 9
## 51678 158
## 16476 35
head(subsample[4])
## arrival_date_year
## 35619 2017
## 89497 2016
## 12417 2017
## 96204 2016
## 51678 2016
## 16476 2015
# Visualizing 'lead_time' in Subsamples
# Number of random samples (between 5 and 10)
num_samples <- sample(5:10, 1)
# Define the percentage of data to include in each subsample (e.g., 50%)
sample_percentage <- 0.5
# Calculate the number of rows to include in each subsample
sample_size <- round(nrow(hotel_data) * sample_percentage)
# Create a list to store random samples
samples <- list()
# Define columns to include in the random samples
columns_to_include <- c(
"hotel", "is_canceled", "lead_time", "arrival_date_year",
"stays_in_weekend_nights", "stays_in_week_nights"
)
# Creating random samples
for (i in 1:num_samples) {
# Randomly sample rows with replacement
sample_indices <- sample(1:nrow(hotel_data), sample_size, replace = TRUE)
# Extract the selected columns for the random sample
random_sample <- hotel_data[sample_indices, columns_to_include]
# Store the random sample in the 'samples' list
samples[[i]] <- random_sample
}
# Create a list to store histogram plots
hist_plots <- list()
# Create histograms for 'lead_time' in each sample
for (i in 1:num_samples) {
hist_plot <- ggplot(samples[[i]], aes(x = lead_time)) +
geom_histogram(binwidth = 20, fill = "blue", color = "black") +
labs(title = paste("Distribution of lead_time - Subsample", i),
x = "lead_time", y = "Frequency") +
theme_minimal()
# Store the histogram plot in the list
hist_plots[[i]] <- hist_plot
}
# Display histograms for each subsample
for (i in 1:num_samples) {
print(hist_plots[[i]])
}
subsample_list_with_lead_time <- list()
# Create 5-10 random subsamples
num_subsamples <- sample(5:10, 1) # Randomly choose the number of subsamples
for (i in 1:num_subsamples) {
# Randomly sample rows with replacement
sample_indices <- sample(1:nrow(hotel_data), size = sample_size, replace = TRUE)
# Extract the selected columns for the subsample, including 'lead_time'
subsample <- hotel_data[sample_indices, c(selected_columns, "lead_time")]
# Store the subsample in a data frame
subsample_df <- data.frame(subsample)
# Assign a name to the data frame (e.g., df_1, df_2, etc.)
# Store the subsample data frame in the list
subsample_list_with_lead_time[[i]] <- subsample_df
}
# Display the first few rows of one of the subsamples (e.g., df_1)
head(subsample_list_with_lead_time[[1]])
## hotel is_canceled lead_time arrival_date_year
## 86206 City Hotel 0 0 2016
## 28164 Resort Hotel 0 132 2016
## 57457 City Hotel 0 418 2016
## 27398 Resort Hotel 0 104 2016
## 62954 City Hotel 1 61 2017
## 43464 City Hotel 1 74 2015
## stays_in_weekend_nights stays_in_week_nights lead_time.1
## 86206 0 1 0
## 28164 1 3 132
## 57457 1 2 418
## 27398 0 5 104
## 62954 0 2 61
## 43464 0 2 74
summary_stats <- list()
box_plots <- list()
# Loop through each subsample
for (i in 1:num_subsamples) {
# Check if 'lead_time' is present in the current subsample
if ("lead_time" %in% colnames(subsample_list_with_lead_time[[i]])) {
# If 'lead_time' is present, calculate summary statistics
summary_stats[[i]] <- summary(subsample_list_with_lead_time[[i]]$lead_time)
# Create a box plot for 'lead_time'
box_plots[[i]] <- ggplot(subsample_list_with_lead_time[[i]], aes(x = factor(1), y = lead_time)) +
geom_boxplot() +
labs(title = paste("Lead Time Distribution for Subsample", i))
# Display summary statistics and box plots
cat(paste("Summary Statistics for Subsample", i, ":\n"))
print(summary_stats[[i]])
print(box_plots[[i]])
} else {
# If 'lead_time' is not present, provide a message
cat("No box plot available for Subsample", i, "(lead_time not found).\n")
}
}
## Summary Statistics for Subsample 1 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 17.0 68.0 103.4 159.0 709.0
## Summary Statistics for Subsample 2 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 18.0 70.0 104.7 161.0 709.0
## Summary Statistics for Subsample 3 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 18.0 68.0 103.1 158.0 629.0
## Summary Statistics for Subsample 4 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 18.0 70.0 104.4 162.0 737.0
## Summary Statistics for Subsample 5 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 18.0 70.0 104.1 161.0 737.0
## Summary Statistics for Subsample 6 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 18.0 69.0 104.4 161.0 629.0
## Summary Statistics for Subsample 7 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 18.0 69.0 104.2 160.0 629.0
## Summary Statistics for Subsample 8 :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 18.0 69.0 104.4 161.0 709.0
# Number of random samples (between 5 and 10)
num_samples <- sample(5:10, 1)
# Define the percentage of data to include in each subsample (e.g., 50%)
sample_percentage <- 0.5
# Calculate the number of rows to include in each subsample
sample_size <- round(nrow(hotel_data) * sample_percentage)
# Create a list to store random samples
samples <- list()
# Define columns to include in the random samples
columns_to_include <- c(
"hotel", "is_canceled", "lead_time", "arrival_date_year",
"stays_in_weekend_nights", "stays_in_week_nights"
)
# Creating random samples
for (i in 1:num_samples) {
# Randomly sample rows with replacement
sample_indices <- sample(1:nrow(hotel_data), sample_size, replace = TRUE)
# Extract the selected columns for the random sample
random_sample <- hotel_data[sample_indices, columns_to_include]
# Store the random sample in the 'samples' list
samples[[i]] <- random_sample
}
# Create a list to store summary statistics for each sample
summary_stats <- list()
# Loop through each random sample
for (i in 1:num_samples) {
summary_stats[[i]] <- summary(samples[[i]])
}
# Display summary statistics for each sample
for (i in 1:num_samples) {
cat(paste("Summary Statistics for Random Sample", i, ":\n"))
print(summary_stats[[i]])
}
## Summary Statistics for Random Sample 1 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.000 Median : 69.0 Median :2016
## Mean :0.371 Mean :103.8 Mean :2016
## 3rd Qu.:1.000 3rd Qu.:159.0 3rd Qu.:2017
## Max. :1.000 Max. :629.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9309 Mean : 2.502
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :19.0000 Max. :50.000
## Summary Statistics for Random Sample 2 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.0000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.0000 Median : 68.0 Median :2016
## Mean :0.3735 Mean :103.6 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160.0 3rd Qu.:2017
## Max. :1.0000 Max. :629.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9334 Mean : 2.506
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :19.0000 Max. :50.000
## Summary Statistics for Random Sample 3 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.000 Median : 69.0 Median :2016
## Mean :0.373 Mean :104.1 Mean :2016
## 3rd Qu.:1.000 3rd Qu.:160.0 3rd Qu.:2017
## Max. :1.000 Max. :629.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9285 Mean : 2.505
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :16.0000 Max. :40.000
## Summary Statistics for Random Sample 4 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.000 Median : 68.0 Median :2016
## Mean :0.369 Mean :102.7 Mean :2016
## 3rd Qu.:1.000 3rd Qu.:159.0 3rd Qu.:2017
## Max. :1.000 Max. :737.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9256 Mean : 2.495
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :19.0000 Max. :50.000
## Summary Statistics for Random Sample 5 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.0000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.0000 Median : 70.0 Median :2016
## Mean :0.3724 Mean :104.6 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:162.0 3rd Qu.:2017
## Max. :1.0000 Max. :629.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9329 Mean : 2.504
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :19.0000 Max. :50.000
## Summary Statistics for Random Sample 6 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.0000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69.0 Median :2016
## Mean :0.3703 Mean :104.2 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:161.0 3rd Qu.:2017
## Max. :1.0000 Max. :737.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 1.0
## Median : 1.0000 Median : 2.0
## Mean : 0.9245 Mean : 2.5
## 3rd Qu.: 2.0000 3rd Qu.: 3.0
## Max. :14.0000 Max. :35.0
## Summary Statistics for Random Sample 7 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.0000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69.0 Median :2016
## Mean :0.3684 Mean :103.8 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160.0 3rd Qu.:2017
## Max. :1.0000 Max. :629.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9314 Mean : 2.501
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :16.0000 Max. :40.000
## Summary Statistics for Random Sample 8 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.0000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69.0 Median :2016
## Mean :0.3708 Mean :103.6 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:159.0 3rd Qu.:2017
## Max. :1.0000 Max. :709.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9205 Mean : 2.493
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :18.0000 Max. :42.000
## Summary Statistics for Random Sample 9 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.0000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69.0 Median :2016
## Mean :0.3699 Mean :103.9 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:161.0 3rd Qu.:2017
## Max. :1.0000 Max. :629.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9228 Mean : 2.499
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :19.0000 Max. :50.000
## Summary Statistics for Random Sample 10 :
## hotel is_canceled lead_time arrival_date_year
## Length:59695 Min. :0.0000 Min. : 0.0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18.0 1st Qu.:2016
## Mode :character Median :0.0000 Median : 70.0 Median :2016
## Mean :0.3706 Mean :104.7 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:161.0 3rd Qu.:2017
## Max. :1.0000 Max. :629.0 Max. :2017
## stays_in_weekend_nights stays_in_week_nights
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 1.0000 Median : 2.000
## Mean : 0.9256 Mean : 2.508
## 3rd Qu.: 2.0000 3rd Qu.: 3.000
## Max. :19.0000 Max. :50.000
# Create a list to store box plots for each sample
box_plots <- list()
# Loop through each random sample
for (i in 1:num_samples) {
# Create box plots for numeric variables (e.g., lead_time)
box_plot <- ggplot(samples[[i]], aes(x = factor(1), y = lead_time)) +
geom_boxplot() +
labs(title = paste("Box Plot of Lead Time for Random Sample", i))
# Store the box plot in the list
box_plots[[i]] <- box_plot
}
box_plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
# Display box plots for each sample
for (i in 1:num_samples) {
print(box_plots[[i]])
}
# Calculate the means of 'lead_time' in each random sample
sample_means <- sapply(samples, function(sample) mean(sample$lead_time, na.rm = TRUE))
sample_means
## [1] 103.8252 103.5865 104.1092 102.7333 104.5528 104.2428 103.8381 103.5522
## [9] 103.8795 104.7286
# Calculate the mean of 'lead_time' in the entire dataset
full_dataset_mean <- mean(hotel_data$lead_time, na.rm = TRUE)
full_dataset_mean
## [1] 104.0114
# Calculate the standard deviations of 'lead_time' in each random sample
sample_std_devs <- sapply(samples, function(sample) sd(sample$lead_time, na.rm = TRUE))
sample_std_devs
## [1] 106.6626 107.0315 107.2264 105.8010 107.0285 106.7365 106.8020 106.5620
## [9] 106.9671 107.2167
# Calculate the standard deviation of 'lead_time' in the entire dataset
full_dataset_std_dev <- sd(hotel_data$lead_time, na.rm = TRUE)
full_dataset_std_dev
## [1] 106.8631
# Print the means and standard deviations
cat("Means of 'lead_time' in Random Samples:", sample_means, "\n")
## Means of 'lead_time' in Random Samples: 103.8252 103.5865 104.1092 102.7333 104.5528 104.2428 103.8381 103.5522 103.8795 104.7286
cat("Mean of 'lead_time' in Full Dataset:", full_dataset_mean, "\n")
## Mean of 'lead_time' in Full Dataset: 104.0114
cat("\n")
cat("Standard Deviations of 'lead_time' in Random Samples:", sample_std_devs, "\n")
## Standard Deviations of 'lead_time' in Random Samples: 106.6626 107.0315 107.2264 105.801 107.0285 106.7365 106.802 106.562 106.9671 107.2167
cat("Standard Deviation of 'lead_time' in Full Dataset:", full_dataset_std_dev, "\n")
## Standard Deviation of 'lead_time' in Full Dataset: 106.8631
# Compare means and standard deviations
cat("\n")
cat("Comparison of Sample Means and Full Dataset Mean:\n")
## Comparison of Sample Means and Full Dataset Mean:
mean_differences <- sample_means - full_dataset_mean
cat("Mean Differences:", mean_differences, "\n")
## Mean Differences: -0.1862049 -0.4248848 0.09778876 -1.278072 0.541377 0.2313845 -0.1733395 -0.4591758 -0.1319625 0.7171874
cat("\n")
cat("Comparison of Sample Standard Deviations and Full Dataset Standard Deviation:\n")
## Comparison of Sample Standard Deviations and Full Dataset Standard Deviation:
std_dev_differences <- sample_std_devs - full_dataset_std_dev
cat("Standard Deviation Differences:", std_dev_differences, "\n")
## Standard Deviation Differences: -0.2004601 0.1683731 0.3633127 -1.06206 0.1654356 -0.1265577 -0.0611144 -0.3010776 0.1039902 0.3536398
# Create a list to store outliers for each sub-sample
outliers_list <- list()
num_samples
## [1] 10
# Detect outliers in each sub-sample using box plots
for (i in 1:num_samples) {
# Create a box plot for 'lead_time'
box_plot <- box_plots[[i]]
# Extract the upper and lower bounds for outliers
upper_bound <- box_plot$data$stats["75%"] + 1.5 * IQR(samples[[i]]$lead_time)
lower_bound <- box_plot$data$stats["25%"] - 1.5 * IQR(samples[[i]]$lead_time)
# Identify outliers in 'lead_time' for the current sub-sample
outliers <- samples[[i]]$lead_time > upper_bound | samples[[i]]$lead_time < lower_bound
# Store the outliers in the list
outliers_list[[i]] <- outliers
}
# Print the number of outliers in each sub-sample
for (i in 1:num_samples) {
cat(paste("Number of Outliers in Subsample", i, ":", sum(outliers_list[[i]]), "\n"))
}
## Number of Outliers in Subsample 1 : 0
## Number of Outliers in Subsample 2 : 0
## Number of Outliers in Subsample 3 : 0
## Number of Outliers in Subsample 4 : 0
## Number of Outliers in Subsample 5 : 0
## Number of Outliers in Subsample 6 : 0
## Number of Outliers in Subsample 7 : 0
## Number of Outliers in Subsample 8 : 0
## Number of Outliers in Subsample 9 : 0
## Number of Outliers in Subsample 10 : 0
# Calculate means of 'lead_time' in each sub-sample
means_lead_time <- sapply(samples, function(sample) mean(sample$lead_time, na.rm = TRUE))
# Compare means across sub-samples
cat("Means of 'lead_time' in Sub-Samples:", means_lead_time, "\n")
## Means of 'lead_time' in Sub-Samples: 103.8252 103.5865 104.1092 102.7333 104.5528 104.2428 103.8381 103.5522 103.8795 104.7286
# Check if means are consistent (within a certain tolerance)
consistent_means <- all(abs(means_lead_time - means_lead_time[1]) < 2)
if (consistent_means) {
cat("Means of 'lead_time' are consistent across all sub-samples.\n")
} else {
cat("Means of 'lead_time' vary across sub-samples.\n")
}
## Means of 'lead_time' are consistent across all sub-samples.
# Create a list to store anomalies for each sub-sample
anomalies_list <- list()
# Define anomaly detection criteria (e.g., values below a certain threshold)
anomaly_threshold <- 10 # Adjust the threshold as needed
# Detect anomalies in each sub-sample
for (i in 1:num_samples)
{
# Extract 'lead_time' from the current sub-sample
lead_time <- samples[[i]]$lead_time
# Identify anomalies in 'lead_time' for the current sub-sample
anomalies <- lead_time > anomaly_threshold
# Store the anomalies in the list
anomalies_list[[i]] <- anomalies
}
# Compare the number of anomalies across sub-samples
cat("\n")
anomaly_counts <- sapply(anomalies_list, sum)
cat("Number of Anomalies in Each Sub-Sample:", anomaly_counts, "\n")
## Number of Anomalies in Each Sub-Sample: 48298 48085 48325 48294 48301 48352 48077 48272 48079 48336
# Check if the number of anomalies is consistent (within a certain tolerance)
consistent_anomalies <- all(abs(anomaly_counts - anomaly_counts[1]) < 5)
if (consistent_anomalies) {
cat("Number of anomalies are consistent across all sub-samples.\n")
} else {
cat("Number of anomalies vary across sub-samples.\n")
}
## Number of anomalies vary across sub-samples.
# Load your dataset
file_path <- "G:/semester_1/4_Statistics_R/syllabus/lab/week3/hotel_bookings.csv"
df <- read.csv(file_path, header = TRUE)
# Check the first few rows to ensure the data loaded correctly
head(df)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1 27 1 0
## 2 27 1 0
## 3 27 1 0
## 4 27 1 0
## 5 27 1 0
## 6 27 1 0
## stays_in_week_nights adults children babies meal country market_segment
## 1 0 2 0 0 BB PRT Direct
## 2 0 2 0 0 BB PRT Direct
## 3 1 1 0 0 BB GBR Direct
## 4 1 1 0 0 BB GBR Corporate
## 5 2 2 0 0 BB GBR Online TA
## 6 2 2 0 0 BB GBR Online TA
## distribution_channel is_repeated_guest previous_cancellations
## 1 Direct 0 0
## 2 Direct 0 0
## 3 Direct 0 0
## 4 Corporate 0 0
## 5 TA/TO 0 0
## 6 TA/TO 0 0
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1 0 C C
## 2 0 C C
## 3 0 A C
## 4 0 A A
## 5 0 A A
## 6 0 A A
## booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1 3 No Deposit NULL NULL 0 Transient
## 2 4 No Deposit NULL NULL 0 Transient
## 3 0 No Deposit NULL NULL 0 Transient
## 4 0 No Deposit 304 NULL 0 Transient
## 5 0 No Deposit 240 NULL 0 Transient
## 6 0 No Deposit 240 NULL 0 Transient
## adr required_car_parking_spaces total_of_special_requests reservation_status
## 1 0 0 0 Check-Out
## 2 0 0 0 Check-Out
## 3 75 0 0 Check-Out
## 4 75 0 0 Check-Out
## 5 98 0 1 Check-Out
## 6 98 0 1 Check-Out
## reservation_status_date
## 1 2015-07-01
## 2 2015-07-01
## 3 2015-07-02
## 4 2015-07-02
## 5 2015-07-03
## 6 2015-07-03
num_subsamples <- 5
subsample_percentage <- 0.5
subsamples <- list()
set.seed(123) # Set a random seed for reproducibility
for (i in 1:num_subsamples) {
subsample <- df[sample(nrow(df), size = floor(nrow(df) * subsample_percentage), replace = TRUE), ]
subsamples[[i]] <- subsample
}
for (i in 1:num_subsamples) {
ggplot(subsamples[[i]], aes(x = lead_time)) +
geom_histogram(binwidth = 20, fill = "blue", color = "black") +
labs(title = paste("Distribution of lead_time - Subsample", i),
x = "lead_time", y = "Frequency") +
theme_minimal()
}
# Calculate the mean lead time for each subsample
mean_lead_time <- sapply(samples, function(df) {
mean(df$lead_time, na.rm = TRUE)
})
# Check if the mean lead time is consistent across all sub-samples
consistent_lead_time <- all(mean_lead_time == mean_lead_time[1])
# Print the results
if (consistent_lead_time) {
cat("The mean lead time is consistent across all sub-samples.\n")
} else {
cat("The mean lead time varies across sub-samples.\n")
}
## The mean lead time varies across sub-samples.
# Calculate quartiles
Q1 <- quantile(hotel_data$lead_time, 0.25)
Q3 <- quantile(hotel_data$lead_time, 0.75)
# Calculate IQR
IQR <- Q3 - Q1
# Identify outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- hotel_data$lead_time[hotel_data$lead_time < lower_bound | hotel_data$lead_time > upper_bound]
# Print the outliers
cat("Outliers in lead_time column:\n")
## Outliers in lead_time column:
print(outliers)
## [1] 737 394 460 381 382 709 468 468 468 468 468 468 468 468 468 468 398 424
## [19] 434 374 406 406 406 406 406 400 379 399 385 422 390 390 394 376 376 376
## [37] 375 385 385 385 397 397 385 385 385 397 397 397 385 385 397 397 397 397
## [55] 397 385 385 385 385 385 397 397 397 385 385 397 397 385 385 397 397 385
## [73] 385 385 542 542 542 542 542 542 542 542 542 542 403 403 383 383 383 383
## [91] 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383
## [109] 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383 383
## [127] 383 383 383 383 383 383 383 383 383 383 383 384 385 393 393 393 393 393
## [145] 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393
## [163] 393 393 393 393 393 435 375 382 386 386 385 385 385 385 385 385 386 386
## [181] 386 386 386 386 386 386 386 386 386 386 386 386 386 378 378 378 378 378
## [199] 378 378 378 378 378 378 378 378 378 378 378 378 378 471 471 471 471 471
## [217] 471 462 462 462 462 462 462 462 462 462 462 462 462 462 462 462 462 462
## [235] 462 462 462 411 411 411 411 411 411 411 411 411 411 411 411 411 411 411
## [253] 411 411 411 411 411 411 450 411 411 411 390 381 378 454 399 468 468 468
## [271] 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468 460
## [289] 460 532 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468
## [307] 468 383 386 383 406 422 445 542 542 542 542 445 445 542 542 542 542 542
## [325] 542 542 542 542 383 383 383 383 383 383 383 383 383 383 383 383 383 383
## [343] 384 383 383 383 383 383 386 386 386 386 386 386 386 386 386 386 386 386
## [361] 390 386 386 389 389 386 386 386 386 386 386 386 386 386 386 386 386 386
## [379] 386 386 386 386 386 386 386 386 386 386 385 386 386 386 386 386 386 386
## [397] 386 386 386 386 386 386 386 388 388 388 388 388 388 388 388 388 388 388
## [415] 388 388 388 388 388 388 388 388 388 379 379 407 407 379 379 393 393 393
## [433] 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393
## [451] 393 393 393 393 393 393 393 393 393 443 443 443 443 443 443 443 443 443
## [469] 443 443 443 443 443 443 443 443 443 443 443 443 443 437 437 437 437 437
## [487] 437 437 437 437 437 437 437 437 437 437 437 437 437 437 437 437 437 437
## [505] 437 437 437 437 437 437 437 451 451 451 451 451 451 451 451 451 451 451
## [523] 451 451 451 451 451 451 451 451 451 451 451 451 451 451 451 451 451 451
## [541] 451 384 384 384 384 384 384 384 384 384 384 384 384 384 384 384 384 384
## [559] 384 384 384 384 384 384 379 379 379 379 379 379 379 379 379 379 379 379
## [577] 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379
## [595] 379 379 391 391 391 391 391 391 391 391 391 391 391 391 391 391 391 391
## [613] 391 391 391 391 391 391 391 386 386 386 386 386 386 386 386 386 386 386
## [631] 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386
## [649] 386 386 386 391 391 391 391 391 391 391 391 391 391 391 391 391 391 391
## [667] 391 391 391 391 391 391 391 391 391 391 391 391 391 391 391 391 391 405
## [685] 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405
## [703] 405 405 405 405 398 398 398 398 398 398 398 398 398 398 398 398 398 398
## [721] 398 398 398 398 398 398 398 398 398 398 398 398 398 398 398 398 398 398
## [739] 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405
## [757] 405 405 405 405 405 405 405 405 405 405 405 405 405 405 412 412 412 412
## [775] 412 412 412 412 412 412 412 412 412 412 412 412 412 412 412 412 412 412
## [793] 412 412 412 412 412 412 412 412 412 412 419 419 419 419 419 419 419 419
## [811] 419 419 419 419 419 419 419 419 419 419 419 419 419 419 419 419 419 419
## [829] 419 419 419 419 419 419 420 420 420 420 420 420 420 420 420 420 420 420
## [847] 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420
## [865] 426 426 426 426 426 426 426 426 426 426 426 426 426 426 426 426 426 426
## [883] 426 426 426 426 426 426 426 426 426 426 426 426 426 426 433 433 433 433
## [901] 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433
## [919] 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433 433
## [937] 433 433 433 433 433 433 433 433 433 433 433 433 422 422 422 422 422 422
## [955] 422 422 422 422 422 422 422 422 422 440 440 440 440 440 440 440 440 440
## [973] 440 440 440 440 440 440 440 440 440 440 440 440 440 440 440 440 440 440
## [991] 440 440 440 440 440 429 429 429 429 429 429 429 429 429 429 429 429 429
## [1009] 429 429 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418
## [1027] 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418
## [1045] 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418 418
## [1063] 418 418 418 418 418 418 418 418 447 447 447 447 447 447 447 447 447 447
## [1081] 447 447 447 447 447 447 447 447 447 447 447 447 447 447 447 447 447 447
## [1099] 447 447 447 447 454 454 454 454 454 454 454 454 454 454 454 454 454 454
## [1117] 454 454 454 454 454 454 454 454 454 454 454 454 454 454 454 454 454 454
## [1135] 443 443 443 443 443 443 443 443 443 443 443 443 443 443 443 461 461 461
## [1153] 461 461 461 461 461 461 461 461 461 461 461 461 461 461 461 461 461 461
## [1171] 461 461 461 461 461 461 461 461 461 461 461 605 605 605 605 605 605 605
## [1189] 605 605 605 605 605 605 605 605 605 605 605 605 605 605 605 605 605 605
## [1207] 605 605 605 605 605 468 468 468 468 468 468 468 468 468 468 468 468 468
## [1225] 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468 468
## [1243] 468 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 386 386
## [1261] 386 386 475 475 475 475 475 475 475 475 475 475 475 475 475 475 475 475
## [1279] 475 475 475 475 475 475 475 475 475 475 475 475 475 475 475 475 464 464
## [1297] 464 464 464 464 464 464 464 464 464 464 464 464 464 482 482 482 482 482
## [1315] 482 482 482 482 482 482 482 482 482 482 482 482 626 626 626 626 626 626
## [1333] 626 626 626 626 626 626 626 626 626 626 626 626 626 626 626 626 626 626
## [1351] 626 626 626 626 626 626 489 489 489 489 489 489 489 489 489 489 489 489
## [1369] 489 489 489 489 489 496 496 496 496 496 496 496 496 496 496 496 496 496
## [1387] 496 496 496 496 503 503 503 503 503 503 503 503 503 503 503 503 503 503
## [1405] 503 503 503 510 510 510 510 510 510 510 510 510 510 510 510 510 510 510
## [1423] 510 510 517 517 517 517 517 517 517 517 517 517 517 517 517 517 517 517
## [1441] 517 524 524 524 524 524 524 524 524 524 524 524 524 524 524 524 524 524
## [1459] 531 531 531 531 531 531 531 531 531 531 531 531 531 531 531 531 531 381
## [1477] 381 538 538 538 538 538 538 538 538 538 538 538 538 538 538 538 538 538
## [1495] 545 545 545 545 545 545 545 545 545 545 545 545 545 545 545 545 545 552
## [1513] 552 552 552 552 552 552 552 552 552 552 552 552 552 552 552 552 559 559
## [1531] 559 559 559 559 559 559 559 559 559 559 559 559 559 559 559 566 566 566
## [1549] 566 566 566 566 566 566 566 566 566 566 566 566 566 566 573 573 573 573
## [1567] 573 573 573 573 573 573 573 573 573 573 573 573 573 580 580 580 580 580
## [1585] 580 580 580 580 580 580 580 580 580 580 580 580 587 587 587 587 587 587
## [1603] 587 587 587 587 587 587 587 587 587 587 587 594 594 594 594 594 594 594
## [1621] 594 594 594 594 594 594 594 594 594 594 601 601 601 601 601 601 601 601
## [1639] 601 601 601 601 601 601 601 601 601 608 608 608 608 608 608 608 608 608
## [1657] 608 608 608 608 608 608 608 608 615 615 615 615 615 615 615 615 615 615
## [1675] 615 615 615 615 615 615 615 622 622 622 622 622 622 622 622 622 622 622
## [1693] 622 622 622 622 622 622 629 629 629 629 629 629 629 629 629 629 629 629
## [1711] 629 629 629 629 629 386 386 386 386 386 386 386 386 386 386 393 393 393
## [1729] 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393 393
## [1747] 393 393 393 393 393 393 396 396 396 396 396 396 396 396 396 396 407 407
## [1765] 407 407 407 407 407 407 407 407 407 407 407 407 407 407 407 407 407 407
## [1783] 407 407 407 407 407 407 407 410 410 410 410 410 410 410 410 410 410 394
## [1801] 394 394 394 394 394 394 394 394 394 395 395 395 395 420 420 420 420 420
## [1819] 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420 420
## [1837] 420 420 420 420 423 423 423 423 423 423 423 423 423 423 408 408 408 408
## [1855] 408 408 408 408 408 408 409 409 409 409 434 434 434 434 437 437 437 437
## [1873] 437 437 437 437 437 437 422 422 422 422 422 422 422 422 422 422 423 423
## [1891] 423 423 423 448 448 448 448 448 451 451 451 451 451 451 451 451 451 451
## [1909] 396 396 396 396 396 396 396 396 462 462 462 462 462 465 465 405 405 405
## [1927] 405 405 405 405 405 465 465 465 465 465 465 465 465 387 450 450 450 450
## [1945] 450 450 450 450 450 450 451 451 451 451 451 451 414 414 414 414 414 414
## [1963] 414 414 414 414 476 476 476 476 476 476 476 476 476 476 476 479 479 479
## [1981] 479 479 479 479 479 479 479 423 423 423 423 423 423 423 423 423 423 423
## [1999] 423 423 423 423 423 423 423 423 464 464 464 464 464 464 464 464 464 464
## [2017] 465 465 465 465 465 467 467 467 467 467 467 467 467 467 467 468 468 490
## [2035] 468 490 490 468 468 490 468 490 490 468 468 490 490 468 468 490 468 468
## [2053] 468 493 493 493 493 493 493 493 493 493 493 389 478 478 478 478 478 478
## [2071] 478 478 478 478 478 479 479 479 504 504 504 504 504 507 507 507 507 507
## [2089] 507 507 507 507 507 457 457 457 457 457 457 457 458 518 518 518 518 521
## [2107] 521 521 521 521 521 521 521 521 521 377 377 377 377 377 377 377 377 377
## [2125] 377 377 377 377 377 377 377 377 377 377 377 374 374 374 374 374 374 374
## [2143] 374 374 374 374 374 374 374 374 374 374 374 374 374 379 379 379 379 379
## [2161] 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379 379
## [2179] 379 379 379 379 379 379 379 379 414 414 414 414 414 414 414 414 414 414
## [2197] 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414
## [2215] 414 414 414 395 395 395 395 395 395 395 395 395 395 395 395 395 395 395
## [2233] 395 395 395 395 395 395 395 395 395 395 395 395 395 395 395 444 444 444
## [2251] 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444
## [2269] 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444 444
## [2287] 444 380 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377
## [2305] 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377
## [2323] 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377
## [2341] 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 377 386
## [2359] 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386
## [2377] 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386
## [2395] 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386 386
## [2413] 386 386 386 386 386 386 386 386 386 386 395 395 394 395 395 395 395 395
## [2431] 395 395 394 395 395 395 395 395 395 395 394 395 395 395 395 409 408 409
## [2449] 409 409 409 409 409 409 409 409 409 409 409 409 409 409 409 409 409 409
## [2467] 409 434 434 434 434 434 434 434 434 434 434 434 434 434 434 434 434 434
## [2485] 434 434 434 434 434 423 423 423 422 423 423 423 423 423 423 423 423 423
## [2503] 422 423 423 423 423 423 423 423 448 448 448 448 448 448 448 448 448 448
## [2521] 448 448 448 448 448 448 448 448 448 448 448 396 396 396 396 396 396 396
## [2539] 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396
## [2557] 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396
## [2575] 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396 396
## [2593] 396 396 396 396 396 396 462 462 462 462 462 462 462 462 462 462 462 462
## [2611] 462 462 462 462 462 462 462 462 405 462 405 405 405 405 405 405 405 405
## [2629] 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405
## [2647] 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405
## [2665] 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405 405
## [2683] 405 405 405 414 414 414 414 414 414 414 451 451 451 451 414 414 414 414
## [2701] 451 451 414 451 468 451 451 414 451 451 414 414 451 451 451 414 414 414
## [2719] 451 414 414 414 451 451 451 414 414 451 451 414 414 414 414 414 414 414
## [2737] 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414
## [2755] 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 414 476 476
## [2773] 476 476 476 476 476 476 476 476 476 476 476 476 476 423 423 423 423 423
## [2791] 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423
## [2809] 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423 423
## [2827] 423 423 423 423 423 423 423 423 423 423 423 423 423 423 381 465 464 464
## [2845] 465 465 465 465 465 465 465 465 465 464 465 464 465 465 463 465 465 465
## [2863] 468 468 468 468 468 468 468 468 468 468 468 468 479 479 479 479 479 478
## [2881] 479 479 479 478 479 479 479 479 479 479 479 479 479 479 479 479 504 504
## [2899] 504 504 504 504 504 504 504 504 504 504 504 504 504 504 504 504 504 504
## [2917] 504 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457
## [2935] 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457
## [2953] 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457 457
## [2971] 457 457 457 457 457 457 457 457 457 457 457 457 518 518 518 518 518 518
## [2989] 518 518 518 518 518 518 518 518 518 518 518 518 518 518 518 518 457
# Create a scatter plot for two numerical variables (e.g., 'lead_time' vs. 'adr')
ggplot(hotel_data, aes(x = lead_time, y = adr)) +
geom_point() +
labs(title = "Scatter Plot of 'lead_time' vs. 'adr'")
# Assuming your data frame is called 'df'
# Filter rows where adr is greater than 3000
anomalies <- df[df$adr > 4000, ]
# View the anomalies
head(anomalies)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 48516 City Hotel 1 35 2016 March
## arrival_date_week_number arrival_date_day_of_month
## 48516 13 25
## stays_in_weekend_nights stays_in_week_nights adults children babies meal
## 48516 0 1 2 0 0 BB
## country market_segment distribution_channel is_repeated_guest
## 48516 PRT Offline TA/TO TA/TO 0
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## 48516 0 0 A
## assigned_room_type booking_changes deposit_type agent company
## 48516 A 1 Non Refund 12 NULL
## days_in_waiting_list customer_type adr required_car_parking_spaces
## 48516 0 Transient 5400 0
## total_of_special_requests reservation_status reservation_status_date
## 48516 0 Canceled 2016-02-19
# Calculate the IQR for the 'lead_time' variable in your full dataset
Q1 <- quantile(df$lead_time, 0.05)
Q3 <- quantile(df$lead_time, 0.90)
IQR_value <- Q3 - Q1
# Define the lower and upper bounds to identify outliers
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Find outliers in the 'lead_time' variable
outliers <- df$lead_time[df$lead_time < lower_bound | df$lead_time > upper_bound]
outliers
## [1] 737 709
boxplot(df$lead_time, main = "Boxplot of lead_time (with Outliers)",
ylab = "lead_time", col = "lightblue")
df_no_outliers <- df[!(df$lead_time < lower_bound | df$lead_time > upper_bound), ]
# Create a
boxplot(df_no_outliers$lead_time, main = "Boxplot of lead_time (without Outliers)",
ylab = "lead_time", col = "lightgreen")
par(mfrow=c(1,2))
# Create a boxplot to visualize outliers in 'lead_time'
boxplot(df$lead_time, main = "Boxplot of lead_time (with Outliers)",
ylab = "lead_time", col = "lightblue")
# Create a
boxplot(df_no_outliers$lead_time, main = "Boxplot of lead_time (without Outliers)",
ylab = "lead_time", col = "lightgreen")
Outliers seem to be present for all the numerical values in the different ‘samples’ dataset. Outliers are there a few places, the defination of outler changes with situation to situation. It is up to the perspective of a person for all the numerical values in the different ‘samples’ dataset.
When we looked at smaller random samples from the “Hotel Management” dataset, we noticed some differences, especially in the lead times and outliers. These differences remind us that we should be cautious when making conclusions based on samples.
One interesting finding is that the outliers in our smaller samples often had the same values across different samples. This suggests that these outliers might not actually be unusual when we consider the entire “Hotel Management” dataset.
In other words, the outliers we saw in our samples might not be outliers in the bigger picture of the data. So, when analyzing data, it’s important to be aware of how random sampling can affect our conclusions.