# Load necessary libraries
library(readr)
library(dplyr)
library(caret)
# Load data set (replace the path with your actual file location)
train_data <- read_csv("/Users/valfa/Downloads/spaceship-titanic/train.csv")
# Drop irrelevant columns
train_data <- train_data %>%
select(-PassengerId, -Name, -Cabin)
# Define numeric and categorical features
numeric_features <- c('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
categorical_features <- c('HomePlanet', 'CryoSleep', 'Destination', 'VIP')
# Impute numeric columns
for (col in numeric_features) {
train_data[[col]][is.na(train_data[[col]])] <- median(train_data[[col]], na.rm = TRUE)
}
# Impute categorical columns
for (col in categorical_features) {
train_data[[col]][is.na(train_data[[col]])] <- as.character(stats::na.omit(train_data[[col]])[which.max(table(train_data[[col]]))])
}
# Convert categorical columns to factors
train_data <- train_data %>%
mutate(across(all_of(categorical_features), as.factor))
# Convert target to factor
train_data$Transported <- as.factor(train_data$Transported)
# Train logistic regression model
model <- glm(Transported ~ ., data = train_data, family = binomial())
# Make predictions on training data
predictions <- predict(model, train_data, type = "response")
predicted_classes <- ifelse(predictions > 0.5, TRUE, FALSE)
# Calculate training error rate
actual_classes <- train_data$Transported == "TRUE"
error_rate <- mean(predicted_classes != actual_classes)
print(paste("Training classification error rate:", round(error_rate, 4)))
## [1] "Training classification error rate: 0.2163"
probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.
knitr::opts_chunk$set(echo = TRUE)
library(ISLR)
set.seed(1)
fit.glm = glm(default ~ income + balance, data = Default, family = "binomial")
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
(bi) Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps:
train = sample(dim(Default)[1], dim(Default)[1] / 2)
bii)
fit.glm = glm(default ~ income + balance, data = Default[train,], family = "binomial")
fit.glm = glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
#Both above formulas have the same outcome.
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default, subset = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.194e+01 6.178e-01 -19.333 < 2e-16 ***
## income 3.262e-05 7.024e-06 4.644 3.41e-06 ***
## balance 5.689e-03 3.158e-04 18.014 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1523.8 on 4999 degrees of freedom
## Residual deviance: 803.3 on 4997 degrees of freedom
## AIC: 809.3
##
## Number of Fisher Scoring iterations: 8
biii)
glm.probs = predict(fit.glm, newdata = Default[-train, ], type="response")
glm.pred=rep("No",5000)
glm.pred[glm.probs>0.5] = "Yes"
biv)
mean(glm.pred != Default[-train, ]$default)
## [1] 0.0254
bc)
train <- sample(dim(Default)[1], dim(Default)[1] / 2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(fit.glm, newdata = Default[-train, ], type = "response")
pred.glm <- rep("No", length(probs))
pred.glm[probs > 0.5] <- "Yes"
mean(pred.glm != Default[-train, ]$default)
## [1] 0.0274
library(ISLR2)
data("Caravan")
train = 1:1000
Caravan$Purchase = ifelse(Caravan$Purchase == "Yes", 1, 0)
Caravan.train = Caravan[train,]
Caravan.test = Caravan[-train,]
library(gbm)
set.seed(1)
boost.caravan = gbm(Purchase ~ ., data = Caravan.train, distribution = "gaussian", n.trees = 1000, shrinkage = 0.01)
summary(boost.caravan)
The predictor variables PPERSAUT and MKOOKPKLA are the most important.
probs.test <- predict(boost.caravan, Caravan.test, n.trees = 1000, type = "response")
pred.test <- ifelse(probs.test > 0.2, 1, 0)
table1 <- table(Caravan.test$Purchase, pred.test)
purchase <- table1[2, 2] / sum(table1[, 2])
print(paste("Fraction of correct predictions:", purchase))
## [1] "Fraction of correct predictions: 0.215686274509804"
The fraction of correct predictions is around 21.37%
set.seed(2)
x <- matrix(rnorm(20 * 3 * 50, mean = 0, sd = 0.001), ncol = 50)
x[1:20, 2] <- 1
x[21:40, 1] <- 2
x[21:40, 2] <- 2
x[41:60, 1] <- 1
data.four <- c(rep(1, 20), rep(2, 20), rep(3, 20))
Perform PCA on the 60 observations and plot the first two principal component score vectors. Use a different color to indicate the observations in each of the three classes.
principal.component <- prcomp(x)
plot(principal.component$x[,1:2], col = 1:3, xlab = 'Z1', ylab = 'Z2', main = 'first two principal component', pch = 19)
Perform K-means clustering of the observations with K = 3. How well do
the clusters that you obtained in K-means clustering compare to the true
class labels? clustered/distributed evenly
k.clustering <- kmeans(x, 3, nstart = 20)
table(data.four, k.clustering$cluster)
##
## data.four 1 2 3
## 1 0 0 20
## 2 20 0 0
## 3 0 20 0
Perform K-means clustering with K = 2. Describe your results. 2 can be seen in the first cluster and the other is in the second cluster
k.clustering2 <- kmeans(x, 2, nstart = 20)
table(data.four, k.clustering2$cluster)
##
## data.four 1 2
## 1 20 0
## 2 0 20
## 3 20 0
Now perform K-means clustering with K = 4, and describe your results. the first 20 is split amongst the first and second clusters
k.clustering3 <- kmeans(x, 4, nstart = 20)
table(data.four, k.clustering3$cluster)
##
## data.four 1 2 3 4
## 1 11 9 0 0
## 2 0 0 20 0
## 3 0 0 0 20
Now perform K-means clustering with K = 3 on the first two principal component score vectors, rather than on the raw data. That is, perform K-means clustering on the 60 2 matrix of which the first column is the first principal component score vector, and the second column is the second principal component score vector. Comment on the results. four are not correctly labeled
k.clustering.final <- kmeans(principal.component$x[,1:2], 3, nstart = 20)
table(data.four, k.clustering.final$cluster)
##
## data.four 1 2 3
## 1 0 0 20
## 2 0 20 0
## 3 20 0 0
Using the scale() function, perform K-means clustering with K = 3 on the data after scaling each variable to have standard deviation one. How do these results compare to those obtained in (b)? Explain. scaling made it worse because the distance between observations increased
k.clustering.final2 <- kmeans(scale(x), 3, nstart = 20)
table(data.four, k.clustering.final2$cluster)
##
## data.four 1 2 3
## 1 9 2 9
## 2 2 18 0
## 3 7 1 12