R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(MASS)
data("Boston")
Boston$crim_above_median <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
set.seed(123)
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
split <- sample.split(Boston$crim_above_median, SplitRatio = 0.7)
train <- subset(Boston, split == TRUE)
test <- subset(Boston, split == FALSE)
logistic_model <- glm(crim_above_median ~ ., data = train, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model)
## 
## Call:
## glm(formula = crim_above_median ~ ., family = binomial, data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.315e+03  1.581e+05  -0.008    0.993
## crim         6.665e+02  1.726e+04   0.039    0.969
## zn           1.249e+00  1.170e+02   0.011    0.991
## indus        8.851e-01  5.064e+02   0.002    0.999
## chas        -3.317e+00  1.364e+04   0.000    1.000
## nox          3.191e+01  7.608e+04   0.000    1.000
## rm          -1.782e+01  3.197e+03  -0.006    0.996
## age          3.317e-01  1.055e+02   0.003    0.997
## dis         -1.472e+01  2.631e+03  -0.006    0.996
## rad          1.160e+01  1.402e+03   0.008    0.993
## tax         -3.735e-01  2.555e+01  -0.015    0.988
## ptratio      1.262e+01  1.530e+03   0.008    0.993
## black        2.615e+00  3.093e+02   0.008    0.993
## lstat        1.458e-01  2.383e+02   0.001    1.000
## medv         2.255e+00  5.640e+02   0.004    0.997
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4.9075e+02  on 353  degrees of freedom
## Residual deviance: 8.6635e-06  on 339  degrees of freedom
## AIC: 30
## 
## Number of Fisher Scoring iterations: 25
# Predict on the test set
logistic_preds <- predict(logistic_model, newdata = test, type = "response")
logistic_pred_class <- ifelse(logistic_preds > 0.5, 1, 0)
library(MASS)
lda_model <- lda(crim_above_median ~ ., data = train)
lda_preds <- predict(lda_model, newdata = test)
lda_pred_class <- lda_preds$class
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
nb_model <- naiveBayes(crim_above_median ~ ., data = train)
nb_preds <- predict(nb_model, newdata = test)
library(class)
train_X <- train[, -which(names(train) == "crim_above_median")]
test_X <- test[, -which(names(test) == "crim_above_median")]
train_y <- train$crim_above_median

# Scale the predictors
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X)

knn_preds <- knn(train = train_X_scaled, test = test_X_scaled, cl = train_y, k = 5)
# Confusion matrix and accuracy for Logistic Regression
logistic_cm <- table(test$crim_above_median, logistic_pred_class)
logistic_accuracy <- sum(diag(logistic_cm)) / sum(logistic_cm)
print(paste("Logistic Regression Accuracy:", logistic_accuracy))
## [1] "Logistic Regression Accuracy: 0.980263157894737"
# Confusion matrix and accuracy for LDA
lda_cm <- table(test$crim_above_median, lda_pred_class)
lda_accuracy <- sum(diag(lda_cm)) / sum(lda_cm)
print(paste("LDA Accuracy:", lda_accuracy))
## [1] "LDA Accuracy: 0.881578947368421"
# Confusion matrix and accuracy for Naive Bayes
nb_cm <- table(test$crim_above_median, nb_preds)
nb_accuracy <- sum(diag(nb_cm)) / sum(nb_cm)
print(paste("Naive Bayes Accuracy:", nb_accuracy))
## [1] "Naive Bayes Accuracy: 0.947368421052632"
# Confusion matrix and accuracy for KNN
knn_cm <- table(test$crim_above_median, knn_preds)
knn_accuracy <- sum(diag(knn_cm)) / sum(knn_cm)
print(paste("KNN Accuracy:", knn_accuracy))
## [1] "KNN Accuracy: 0.901315789473684"
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
data("Default")
set.seed(123)
logistic_model <- glm(default ~ income + balance, data = Default, family = binomial)
summary(logistic_model)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
set.seed(123)
library(caTools)
split <- sample.split(Default$default, SplitRatio = 0.7)
train <- subset(Default, split == TRUE)
test <- subset(Default, split == FALSE)

# Fit the model on the training set
logistic_model_train <- glm(default ~ income + balance, data = train, family = binomial)

# Predict on the validation set
test_probs <- predict(logistic_model_train, newdata = test, type = "response")
test_preds <- ifelse(test_probs > 0.5, "Yes", "No")

# Calculate the validation set error
test_actual <- test$default
conf_matrix <- table(test_actual, test_preds)
validation_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Validation Error:", validation_error))
## [1] "Validation Error: 0.0246666666666666"
# Function to perform the validation set approach
validation_set_approach <- function(seed) {
  set.seed(seed)
  split <- sample.split(Default$default, SplitRatio = 0.7)
  train <- subset(Default, split == TRUE)
  test <- subset(Default, split == FALSE)
  
  logistic_model_train <- glm(default ~ income + balance, data = train, family = binomial)
  test_probs <- predict(logistic_model_train, newdata = test, type = "response")
  test_preds <- ifelse(test_probs > 0.5, "Yes", "No")
  test_actual <- test$default
  conf_matrix <- table(test_actual, test_preds)
  validation_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
  return(validation_error)
}

# Perform the process three times with different seeds
set.seed(123)
seeds <- sample(1:1000, 3)
errors <- sapply(seeds, validation_set_approach)
print(errors)
## [1] 0.02600000 0.02566667 0.02866667
mean_error <- mean(errors)
print(paste("Mean Validation Error:", mean_error))
## [1] "Mean Validation Error: 0.0267777777777778"
# Add the dummy variable for student
set.seed(123)
split <- sample.split(Default$default, SplitRatio = 0.7)
train <- subset(Default, split == TRUE)
test <- subset(Default, split == FALSE)

logistic_model_student <- glm(default ~ income + balance + student, data = train, family = binomial)
test_probs_student <- predict(logistic_model_student, newdata = test, type = "response")
test_preds_student <- ifelse(test_probs_student > 0.5, "Yes", "No")

conf_matrix_student <- table(test$default, test_preds_student)
validation_error_student <- 1 - sum(diag(conf_matrix_student)) / sum(conf_matrix_student)
print(paste("Validation Error with Student Variable:", validation_error_student))
## [1] "Validation Error with Student Variable: 0.0243333333333333"
library(ISLR)
data("College")
set.seed(123)
library(caTools)
split <- sample.split(College$Apps, SplitRatio = 0.7)
train <- subset(College, split == TRUE)
test <- subset(College, split == FALSE)
linear_model <- lm(Apps ~ ., data = train)
test_preds_linear <- predict(linear_model, newdata = test)
test_error_linear <- mean((test$Apps - test_preds_linear)^2)
print(paste("Test Error for Linear Model:", test_error_linear))
## [1] "Test Error for Linear Model: 1304671.1253516"
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x_train <- model.matrix(Apps ~ ., train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., test)[, -1]
y_test <- test$Apps

# Perform cross-validation to choose the best lambda
set.seed(123)
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min

# Fit the ridge regression model
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = best_lambda_ridge)
test_preds_ridge <- predict(ridge_model, s = best_lambda_ridge, newx = x_test)
test_error_ridge <- mean((y_test - test_preds_ridge)^2)
print(paste("Test Error for Ridge Regression:", test_error_ridge))
## [1] "Test Error for Ridge Regression: 1313120.09915936"
# Perform cross-validation to choose the best lambda
set.seed(123)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min

# Fit the lasso model
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda_lasso)
test_preds_lasso <- predict(lasso_model, s = best_lambda_lasso, newx = x_test)
test_error_lasso <- mean((y_test - test_preds_lasso)^2)

# Number of non-zero coefficients
num_nonzero_coef <- sum(coef(lasso_model, s = best_lambda_lasso) != 0)

print(paste("Test Error for Lasso:", test_error_lasso))
## [1] "Test Error for Lasso: 1311182.68289331"
print(paste("Number of Non-zero Coefficients in Lasso:", num_nonzero_coef))
## [1] "Number of Non-zero Coefficients in Lasso: 13"
library(pls)
## Warning: package 'pls' was built under R version 4.3.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(123)
pcr_model <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")

# Select the number of components
optimal_M <- which.min(pcr_model$validation$PRESS)
test_preds_pcr <- predict(pcr_model, newdata = test, ncomp = optimal_M)
test_error_pcr <- mean((test$Apps - test_preds_pcr)^2)
print(paste("Test Error for PCR:", test_error_pcr))
## [1] "Test Error for PCR: 1304671.1253516"
print(paste("Optimal Number of Components for PCR:", optimal_M))
## [1] "Optimal Number of Components for PCR: 17"
set.seed(123)
pls_model <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls_model, val.type = "MSEP")

# Select the number of components
optimal_M_pls <- which.min(pls_model$validation$PRESS)
test_preds_pls <- predict(pls_model, newdata = test, ncomp = optimal_M_pls)
test_error_pls <- mean((test$Apps - test_preds_pls)^2)
print(paste("Test Error for PLS:", test_error_pls))
## [1] "Test Error for PLS: 1304653.05383128"
print(paste("Optimal Number of Components for PLS:", optimal_M_pls))
## [1] "Optimal Number of Components for PLS: 16"
# Summary of results
results <- data.frame(
  Method = c("Linear Model", "Ridge Regression", "Lasso", "PCR", "PLS"),
  TestError = c(test_error_linear, test_error_ridge, test_error_lasso, test_error_pcr, test_error_pls)
)
print(results)
##             Method TestError
## 1     Linear Model   1304671
## 2 Ridge Regression   1313120
## 3            Lasso   1311183
## 4              PCR   1304671
## 5              PLS   1304653

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.