# title: "Assignment 4 - LDA"
# output: html_notebook

# Attach the packages used by the analysis, one per line. Plain ASCII quotes
# are required here: typographic quotes are a parse error in R.
# The original attached MASS twice; the second call was a no-op.
library("dplyr")
library("stargazer")
library("caret")
library("MASS")
library("ISLR")
library("psych")
library("ggord")
library("devtools")
library("ggplot2")
library("tidyverse")
library("survival")

# Install epitools only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("epitools", quietly = TRUE)) {
  install.packages("epitools")
}
library(epitools)

# Load the hotel bookings data set.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists at exactly this location.
hotel_data <- read.csv("C:/Users/Issac/OneDrive/Desktop/Meharry/MSDS 565 Predictive Modeling and Analytics/Predicitive Modeling/Hotel.csv")
### Visualize the data
# Install the psych package only if it is not already available
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}

# Load the psych package. Attaching it masks `%+%` and `alpha` from ggplot2.
library(psych)

# Scatterplot matrix (with correlations) of columns 15 and 16 of the data
pairs.panels(hotel_data[15:16], gap = 0, bg = c("red", "green"))

# Preview the first rows
head(hotel_data)

# Number of bookings. The data frame is `hotel_data` (there is no object
# called `hotel`), and `status` is a vector, so length() is used: nrow() of a
# vector returns NULL.
length(hotel_data$status)
### Odds Ratio
### Example Approach
# Sample data:
#   a <- 50   # previous cancellations with current cancellations
#   b <- 30   # previous cancellations without current cancellations
#   c <- 20   # no previous cancellations with current cancellations
#   d <- 100  # no previous cancellations without current cancellations
## Create a matrix table <- matrix(c(npc, pc, cc, ncc), ncol=2)##
### Marginal counts
# NOTE: the variable names are kept for compatibility, but `npc` counts the
# bookings that DO have at least one previous cancellation (`!= 0`) and `pc`
# counts the bookings that have none (`== 0`).

### People who do have previous cancellations
npc <- sum(hotel_data$previous_cancellations != 0, na.rm = TRUE)
npc
#> [1] 338

### People who do not have previous cancellations
pc <- sum(hotel_data$previous_cancellations == 0, na.rm = TRUE)

### People who do have current cancellations
cc <- sum(hotel_data$status == "Canceled", na.rm = TRUE)

### People who do not have current cancellations
ncc <- sum(hotel_data$status == "Not_Canceled", na.rm = TRUE)

df1 <- cbind(npc, pc, cc, ncc)
df1
#>      npc    pc    cc   ncc
#> [1,] 338 35937 11885 24390

## 1. Among bookings with previous cancellations, what are the odds of a
##    current cancellation, relative to bookings without previous cancellations?
# An odds ratio needs the four JOINT counts of a 2x2 cross-tabulation. The four
# marginal totals above cannot be combined into one, so the table is built
# directly from the two variables.
cancel_table <- table(
  previous_cancellation = factor(
    hotel_data$previous_cancellations != 0,
    levels = c(TRUE, FALSE),
    labels = c("Yes", "No")
  ),
  current_status = factor(
    hotel_data$status,
    levels = c("Canceled", "Not_Canceled")
  )
)
cancel_table

# Odds ratio = (a * d) / (b * c), with
#   a = previous cancellation & canceled,     b = previous cancellation & not canceled,
#   c = no previous cancellation & canceled,  d = no previous cancellation & not canceled.
# as.numeric() avoids integer overflow in the products.
Odds_Ratio <-
  (as.numeric(cancel_table["Yes", "Canceled"]) *
     as.numeric(cancel_table["No", "Not_Canceled"])) /
  (as.numeric(cancel_table["Yes", "Not_Canceled"]) *
     as.numeric(cancel_table["No", "Canceled"]))
Odds_Ratio
### Which type of room is more likely to have current booking cancelled?
# Booking counts by room type and booking status
room_status_counts <- table(hotel_data$room_type, hotel_data$status)
room_status_counts
#>               Canceled Not_Canceled
#>   Room_Type 1     9072        19058
#>   Room_Type 2      228          464
#>   Room_Type 3        2            5
#>   Room_Type 4     2069         3988
#>   Room_Type 5       72          193
#>   Room_Type 6      406          560
#>   Room_Type 7       36          122

# Cancellation rate per room type: share of "Canceled" within each row.
# (Dividing the two which() index vectors, as before, compared row positions
# rather than counts and only produced a recycling warning.)
room_status_rates <- prop.table(room_status_counts, margin = 1)
newc <- room_status_rates[, "Canceled"]
round(sort(newc, decreasing = TRUE), 3)
## Room type 6 is more likely to have current booking cancelled as it has a 42% cancellation rate.
## Preprocessing
# Remove rows with missing values
hotel_data <- na.omit(hotel_data)

# For every numeric column: turn infinite values into NA, then impute the
# remaining NAs with the column mean. This is done column by column because
# assigning the list returned by lapply() into `hotel_data[is.na(hotel_data)]`
# does not place each column's mean into that column's missing cells.
numeric_cols <- names(hotel_data)[vapply(hotel_data, is.numeric, logical(1))]
for (col_name in numeric_cols) {
  col_values <- hotel_data[[col_name]]
  col_values[is.infinite(col_values)] <- NA
  # Only touch the column when something is missing, so integer columns
  # without NAs keep their type.
  if (anyNA(col_values)) {
    col_values[is.na(col_values)] <- mean(col_values, na.rm = TRUE)
  }
  hotel_data[[col_name]] <- col_values
}
#Pick 5 to 8 predictors, split your data, 70% training and 30% testing, outcome variable is status
# Keep the outcome (`status`) together with the five chosen predictors.
Reduced_set <- hotel_data[
  c(
    "previous_bookings_not_canceled",
    "status",
    "previous_cancellations",
    "avg_room_price",
    "room_type",
    "lead_time"
  )
]
# Show the reduced data set
print(Reduced_set)
### Setting Training and Testing Data
# Install caret only when it is missing. Calling install.packages() on a
# package that is already attached fails with "Updating loaded packages".
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)

# Fix the random seed so the 70/30 split is the same on every run
set.seed(565)

# Determine the number of rows in the dataset
num_rows <- nrow(Reduced_set)

# Calculate the number of rows for the training set (70% of the data)
train_size <- round(0.7 * num_rows)

# Create a random sample of row indices for the training set.
# seq_len() is safe even when the data set is empty, unlike 1:num_rows.
train_indices <- sample(seq_len(num_rows), size = train_size)

# Split the data into training and testing sets
training_data <- Reduced_set[train_indices, ]
testing_data <- Reduced_set[-train_indices, ]
# Print a summary of the training and testing data.
# The "#>" lines are output recorded from an earlier run, kept as comments so
# the file still parses. In that run `status` was numeric and 0 everywhere,
# which suggests the outcome had already been recoded before the split was made.
summary(training_data)
#> previous_bookings_not_canceled status previous_cancellations
#> Min. : 0.0000 Min. :0 Min. : 0.0000
#> 1st Qu.: 0.0000 1st Qu.:0 1st Qu.: 0.0000
#> Median : 0.0000 Median :0 Median : 0.0000
#> Mean : 0.1563 Mean :0 Mean : 0.0239
#> 3rd Qu.: 0.0000 3rd Qu.:0 3rd Qu.: 0.0000
#> Max. :58.0000 Max. :0 Max. :13.0000
#> avg_room_price room_type lead_time
#> Min. : 0.00 Length:25392 Min. : 0.00
#> 1st Qu.: 80.75 Class :character 1st Qu.: 17.00
#> Median : 99.95 Mode :character Median : 57.00
#> Mean :103.51 Mean : 85.59
#> 3rd Qu.:120.60 3rd Qu.:127.00
#> Max. :540.00 Max. :443.00

summary(testing_data)
#> previous_bookings_not_canceled status previous_cancellations
#> Min. : 0.0000 Min. :0 Min. : 0.00000
#> 1st Qu.: 0.0000 1st Qu.:0 1st Qu.: 0.00000
#> Median : 0.0000 Median :0 Median : 0.00000
#> Mean : 0.1466 Mean :0 Mean : 0.02205
#> 3rd Qu.: 0.0000 3rd Qu.:0 3rd Qu.: 0.00000
#> Max. :53.0000 Max. :0 Max. :13.00000
#> avg_room_price room_type lead_time
#> Min. : 0.00 Length:10883 Min. : 0.0
#> 1st Qu.: 80.04 Class :character 1st Qu.: 17.0
#> Median : 99.00 Mode :character Median : 57.0
#> Mean :103.22 Mean : 84.4
#> 3rd Qu.:120.00 3rd Qu.:124.0
#> Max. :349.63 Max. :443.0

# Check the dimensions of the split data
dim(training_data)
#> [1] 25392 6
dim(testing_data)
#> [1] 10883 6
## Build a linear discriminant analysis model, report confusion matrix and accuracy
library("MASS")

# Fit the LDA model on the training split only, so the testing split remains
# unseen data for the evaluation below.
lda_model <- lda(status ~ ., data = training_data)
# Printing the fit shows priors, group means and discriminant coefficients
# (summary() on an lda object only lists its components).
lda_model

# Predict the held-out bookings. `lda2$class` holds the predicted labels and
# `lda2$x` the discriminant scores. (The original referred to an undefined
# object `lda1`.)
lda2 <- predict(lda_model, newdata = testing_data)

# Confusion matrix and accuracy on the testing split
lda_confusion <- table(Predicted = lda2$class, Actual = testing_data$status)
lda_confusion
lda_accuracy <- mean(lda2$class == testing_data$status)
lda_accuracy

## Visualizing the LDA model
# Density of the first discriminant score by actual status. A bare aes() call
# only builds a mapping and draws nothing, so a full ggplot is constructed.
lda_scores <- data.frame(
  LD1 = lda2$x[, 1],
  status = testing_data$status
)
ggplot2::ggplot(lda_scores, ggplot2::aes(x = LD1, fill = status)) +
  ggplot2::geom_density(alpha = 0.5)

# Preview the training data instead of printing every row
head(training_data)
### Build a logistic regression model, accuracy
# Inspect the labels present in the outcome before recoding it
unique(Reduced_set$status)

# Recode the outcome to 1 = "Canceled", 0 = otherwise.
# The labels in this data are "Canceled" / "Not_Canceled". Comparing against
# "yes" matched nothing, turned every row into 0, and left glm() with a
# constant outcome ("algorithm did not converge", all odds ratios equal to 1).
# The is.numeric() guard keeps the recode from being applied twice on a re-run.
if (!is.numeric(Reduced_set$status)) {
  Reduced_set$status <- as.numeric(Reduced_set$status == "Canceled")
}

# Logistic regression of cancellation on all remaining columns
logit1 <- glm(status ~ ., data = Reduced_set, family = binomial)

# Coefficients on the odds-ratio scale
exp(coef(logit1))

# In-sample accuracy at a 0.5 probability cut-off.
# NOTE(review): this is measured on the same rows the model was fit on; use a
# held-out split for an out-of-sample estimate.
logit_prob <- predict(logit1, type = "response")
logit_pred <- as.numeric(logit_prob > 0.5)
logit_confusion <- table(Predicted = logit_pred, Actual = Reduced_set$status)
logit_confusion
logit_accuracy <- mean(logit_pred == Reduced_set$status)
logit_accuracy