Question 1: Spaceship Titanic. Fit a logistic regression model on the training data and report the training classification error rate.

# Load necessary libraries
library(readr)
library(dplyr)
library(caret)

# Load data set (replace the path with your actual file location)
train_data <- read_csv("/Users/valfa/Downloads/spaceship-titanic/train.csv")

# Drop irrelevant columns
train_data <- train_data %>%
  select(-PassengerId, -Name, -Cabin)

# Define numeric and categorical features
numeric_features <- c('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
categorical_features <- c('HomePlanet', 'CryoSleep', 'Destination', 'VIP')

# Impute numeric columns
for (col in numeric_features) {
  train_data[[col]][is.na(train_data[[col]])] <- median(train_data[[col]], na.rm = TRUE)
}

# Impute categorical columns with the most frequent value (the mode)
for (col in categorical_features) {
  mode_val <- names(which.max(table(train_data[[col]])))
  train_data[[col]][is.na(train_data[[col]])] <- mode_val
}

# Convert categorical columns to factors
train_data <- train_data %>%
  mutate(across(all_of(categorical_features), as.factor))

# Convert target to factor
train_data$Transported <- as.factor(train_data$Transported)

# Train logistic regression model
model <- glm(Transported ~ ., data = train_data, family = binomial())

# Make predictions on training data
predictions <- predict(model, train_data, type = "response")
predicted_classes <- predictions > 0.5

# Calculate training error rate
actual_classes <- train_data$Transported == "TRUE"
error_rate <- mean(predicted_classes != actual_classes) 

print(paste("Training classification error rate:", round(error_rate, 4)))
## [1] "Training classification error rate: 0.2163"

Question 2) In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.

a) Fit a logistic regression model that uses income and balance to predict default.
library(ISLR)
set.seed(1)
fit.glm = glm(default ~ income + balance, data = Default, family = "binomial")
summary(fit.glm)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
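The coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to interpret (base R only):

exp(coef(fit.glm))
# e.g., exp(5.647e-03) is about 1.0057, so each extra dollar of balance
# multiplies the odds of default by roughly 1.006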

b) Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps:

b(i) Split the sample set into a training set and a validation set.

train = sample(dim(Default)[1], dim(Default)[1] / 2)

b(ii) Fit a multiple logistic regression model using only the training observations.

fit.glm = glm(default ~ income + balance, data = Default[train,], family = "binomial")
fit.glm = glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
# Both calls above fit the same model on the training observations.
summary(fit.glm)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default, subset = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.194e+01  6.178e-01 -19.333  < 2e-16 ***
## income       3.262e-05  7.024e-06   4.644 3.41e-06 ***
## balance      5.689e-03  3.158e-04  18.014  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1523.8  on 4999  degrees of freedom
## Residual deviance:  803.3  on 4997  degrees of freedom
## AIC: 809.3
## 
## Number of Fisher Scoring iterations: 8

b(iii) Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability exceeds 0.5.

glm.probs = predict(fit.glm, newdata = Default[-train, ], type = "response")
glm.pred = rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Yes"

b(iv) Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.

mean(glm.pred != Default[-train, ]$default)
## [1] 0.0254

c) Repeat the process in b) three times, using three different splits of the observations into a training set and a validation set, and comment on the results obtained.

train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0274

With a different random split, the validation error changes only slightly (2.74% here versus 2.54% above), so the estimate is fairly stable across splits.
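Only one additional split is shown above; a compact sketch that runs all three repetitions in a loop (the seed values are arbitrary):

# Repeat the validation set approach over three different random splits
for (s in 1:3) {
  set.seed(s)
  train <- sample(dim(Default)[1], dim(Default)[1] / 2)
  fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
  probs <- predict(fit, newdata = Default[-train, ], type = "response")
  pred <- ifelse(probs > 0.5, "Yes", "No")
  print(mean(pred != Default[-train, ]$default))  # validation error for this split
}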

Question 3: Caravan

  1. Create a training set consisting of the first 1,000 observations, and a test set consisting of the remaining observations.
library(ISLR2)
data("Caravan")
train = 1:1000
Caravan$Purchase = ifelse(Caravan$Purchase == "Yes", 1, 0)
Caravan.train = Caravan[train,]
Caravan.test = Caravan[-train,]
  2. Fit a boosting model to the training set with Purchase as the response and the other variables as predictors. Use 1,000 trees, and a shrinkage value of 0.01. Which predictors appear to be the most important?
library(gbm)
set.seed(1)
# Note: with a 0/1 response, distribution = "bernoulli" is the standard choice
# for classification; "gaussian" fits the 0/1 indicator by least squares, so
# the predictions below are read as approximate purchase probabilities.
boost.caravan = gbm(Purchase ~ ., data = Caravan.train, distribution = "gaussian", n.trees = 1000, shrinkage = 0.01)
summary(boost.caravan)
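summary() on a gbm fit both draws the relative-influence plot and returns the influence table; to inspect just the top predictors without the plot, the plotit argument of gbm's summary method can be turned off:

head(summary(boost.caravan, plotit = FALSE))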

The predictor variables PPERSAUT and MKOOPKLA appear to be the most important.

  3. Use the boosting model to predict the response on the test data. Predict that a person will make a purchase if the estimated probability of purchase is greater than 20%.
probs.test <- predict(boost.caravan, Caravan.test, n.trees = 1000, type = "response")
pred.test <- ifelse(probs.test > 0.2, 1, 0)
table1 <- table(Caravan.test$Purchase, pred.test)
purchase <- table1[2, 2] / sum(table1[, 2])
print(paste("Fraction of correct predictions:", purchase))
## [1] "Fraction of correct predictions: 0.215686274509804"

About 21.6% of the people predicted to make a purchase actually make one. For comparison, only about 6% of all customers in the Caravan data purchase insurance, so the boosted model identifies a group with a much higher purchase rate than the base rate.

Question 4

Generate a simulated data set with 20 observations in each of three classes (i.e., 60 observations total) and 50 variables, using a mean shift to separate the classes.

set.seed(2)
x <- matrix(rnorm(20 * 3 * 50, mean = 0, sd = 0.001), ncol = 50)
x[1:20, 2] <- 1
x[21:40, 1] <- 2
x[21:40, 2] <- 2
x[41:60, 1] <- 1
data.four <- c(rep(1, 20), rep(2, 20), rep(3, 20))  # true class labels

Perform PCA on the 60 observations and plot the first two principal component score vectors. Use a different color to indicate the observations in each of the three classes.

principal.component <- prcomp(x)
# Color by the true class labels (col = 1:3 would recycle colors point by point)
plot(principal.component$x[, 1:2], col = data.four, xlab = 'Z1', ylab = 'Z2', main = 'First two principal component scores', pch = 19)
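To confirm that two components are enough here, one can check the proportion of variance they explain (with noise of standard deviation 0.001, the first two components should capture essentially all of it):

summary(principal.component)$importance[, 1:3]  # sd, proportion, and cumulative proportion for PC1-PC3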

Perform K-means clustering of the observations with K = 3. How well do the clusters that you obtained in K-means clustering compare to the true class labels? As the table below shows, the clusters recover the true classes perfectly: each class lands entirely in its own cluster, and the cluster numbers are just an arbitrary relabeling.

k.clustering <- kmeans(x, 3, nstart = 20)
table(data.four, k.clustering$cluster)
##          
## data.four  1  2  3
##         1  0  0 20
##         2 20  0  0
##         3  0 20  0

Perform K-means clustering with K = 2. Describe your results. With only two clusters, class 2 gets a cluster to itself, while classes 1 and 3 are merged into the other cluster.

k.clustering2 <- kmeans(x, 2, nstart = 20)
table(data.four, k.clustering2$cluster)
##          
## data.four  1  2
##         1 20  0
##         2  0 20
##         3 20  0

Now perform K-means clustering with K = 4, and describe your results. Classes 2 and 3 each keep their own cluster, while class 1 is split between the two remaining clusters (11 and 9 observations).

k.clustering3 <- kmeans(x, 4, nstart = 20)
table(data.four, k.clustering3$cluster)
##          
## data.four  1  2  3  4
##         1 11  9  0  0
##         2  0  0 20  0
##         3  0  0  0 20

Now perform K-means clustering with K = 3 on the first two principal component score vectors, rather than on the raw data. That is, perform K-means clustering on the 60 × 2 matrix in which the first column is the first principal component score vector and the second column is the second principal component score vector. Comment on the results. Clustering on the score vectors again recovers the true classes perfectly, which makes sense because the first two components capture essentially all of the between-class separation.

k.clustering.final <- kmeans(principal.component$x[,1:2], 3, nstart = 20)
table(data.four, k.clustering.final$cluster)
##          
## data.four  1  2  3
##         1  0  0 20
##         2  0 20  0
##         3 20  0  0

Using the scale() function, perform K-means clustering with K = 3 on the data after scaling each variable to have standard deviation one. How do these results compare to those obtained in (b)? Explain. Scaling makes the results much worse than in (b): the 48 noise variables have tiny standard deviations, so rescaling every variable to standard deviation one inflates the noise relative to the two mean-shifted variables, and the clusters no longer line up with the true classes.

k.clustering.final2 <- kmeans(scale(x), 3, nstart = 20)
table(data.four, k.clustering.final2$cluster)
##          
## data.four  1  2  3
##         1  9  2  9
##         2  2 18  0
##         3  7  1 12
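To see why scaling hurts here, compare the per-column standard deviations before and after scale(): in the raw data the two mean-shifted columns dominate, while after scaling the 48 pure-noise columns carry just as much weight:

round(apply(x, 2, sd)[1:5], 3)         # raw: columns 1-2 carry the class signal
round(apply(scale(x), 2, sd)[1:5], 3)  # scaled: every column has sd 1, noise included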