# Load necessary libraries
library(MASS) # For Boston dataset
library(caret) # For modeling functions
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071) # For Naive Bayes classifier
library(glmnet) # For ridge and lasso regression
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls) # For PCR and PLS regression
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Chapter 4: Predicting Crime Rate Using the Boston Data Set
cat("Chapter 4: Predicting Crime Rate Using the Boston Data Set\n")
## Chapter 4: Predicting Crime Rate Using the Boston Data Set
# Load Boston dataset
data(Boston)
boston <- Boston
# Create binary response variable for crime rate
median_crime <- median(boston$crim)
boston$crime_binary <- ifelse(boston$crim > median_crime, 1, 0)
# Split data into training and test sets
set.seed(123) # Set seed for reproducibility
trainIndex <- createDataPartition(boston$crime_binary, p = 0.7, list = FALSE)
train_data <- boston[trainIndex, ]
test_data <- boston[-trainIndex, ]
# Logistic Regression
# Note: the predictors include crim itself, from which crime_binary was derived,
# so the two classes are almost perfectly separable. That data leakage is what
# triggers the warnings below and inflates the test accuracy.
logit_model <- glm(crime_binary ~ ., data = train_data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logit_pred <- predict(logit_model, newdata = test_data, type = "response")
logit_acc <- mean((logit_pred > 0.5) == test_data$crime_binary)
cat("Logistic Regression Accuracy:", logit_acc, "\n")
## Logistic Regression Accuracy: 0.9866667
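# As a sanity check, the model can be refit without crim; this is a minimal
# sketch, and its accuracy will be lower than the leakage-inflated figure above.
logit_model_nocrim <- glm(crime_binary ~ . - crim, data = train_data, family = binomial)
logit_pred_nocrim <- predict(logit_model_nocrim, newdata = test_data, type = "response")
cat("Logistic Regression Accuracy (without crim):",
    mean((logit_pred_nocrim > 0.5) == test_data$crime_binary), "\n")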
# LDA
lda_model <- lda(crime_binary ~ ., data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)$class
lda_acc <- mean(lda_pred == test_data$crime_binary)
cat("LDA Accuracy:", lda_acc, "\n")
## LDA Accuracy: 0.8666667
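# predict() on an lda fit also returns per-class posterior probabilities, so a
# threshold other than 0.5 can be applied when one error type is costlier; a
# minimal sketch (the 0.5 threshold reproduces the $class labels used above):
lda_post <- predict(lda_model, newdata = test_data)$posterior[, "1"]
lda_pred_strict <- ifelse(lda_post > 0.7, 1, 0) # flag only high-confidence tracts
cat("LDA Accuracy at 0.7 threshold:", mean(lda_pred_strict == test_data$crime_binary), "\n")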
# Naive Bayes
nb_model <- naiveBayes(crime_binary ~ ., data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
nb_acc <- mean(nb_pred == test_data$crime_binary)
cat("Naive Bayes Accuracy:", nb_acc, "\n")
## Naive Bayes Accuracy: 0.9666667
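# e1071's naiveBayes can also return class probabilities via type = "raw"; a
# minimal sketch for inspecting how confident the classifier is on each tract:
nb_prob <- predict(nb_model, newdata = test_data, type = "raw")
head(nb_prob) # one row per test observation, one column per class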
# KNN (Classification)
train_data$crime_binary <- as.factor(train_data$crime_binary) # caret treats a factor outcome as classification, not regression
test_data$crime_binary <- as.factor(test_data$crime_binary)
knn_model <- train(crime_binary ~ ., data = train_data, method = "knn")
knn_pred <- predict(knn_model, newdata = test_data)
knn_acc <- mean(knn_pred == test_data$crime_binary)
cat("KNN Accuracy:", knn_acc, "\n\n")
## KNN Accuracy: 0.9333333
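# KNN is distance-based, so unscaled predictors let large-scale variables (e.g.
# tax) dominate the distance. A minimal sketch of centering/scaling and tuning
# k with caret; the tuneGrid values are illustrative, not definitive:
knn_tuned <- train(crime_binary ~ ., data = train_data, method = "knn",
                   preProcess = c("center", "scale"),
                   tuneGrid = data.frame(k = seq(1, 19, by = 2)))
cat("Best k:", knn_tuned$bestTune$k, "\n")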
# Chapter 5: Logistic Regression for Default Prediction
cat("\nChapter 5: Logistic Regression for Default Prediction\n")
##
## Chapter 5: Logistic Regression for Default Prediction
# Simulated Data for Default Prediction (Income, Balance, Default)
set.seed(456) # Set seed for reproducibility
n <- 1000 # Number of observations
income <- rnorm(n, mean = 50000, sd = 10000)
balance <- rnorm(n, mean = 1500, sd = 500)
default <- sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.2, 0.8))
default_df <- data.frame(income, balance, default)
# Convert default to binary response variable
default_df$default_binary <- ifelse(default_df$default == "Yes", 1, 0)
# Split data into training and validation sets
trainIndex <- createDataPartition(default_df$default_binary, p = 0.7, list = FALSE)
train_data <- default_df[trainIndex, ]
validation_data <- default_df[-trainIndex, ]
# Logistic Regression with income and balance
logit_model <- glm(default_binary ~ income + balance, data = train_data, family = binomial)
logit_pred <- predict(logit_model, newdata = validation_data, type = "response")
logit_pred_class <- ifelse(logit_pred > 0.5, "Yes", "No")
# Compare predicted labels against the Yes/No column, not the 0/1 column:
# comparing "Yes"/"No" strings with 0/1 counts every prediction as an error
# and reports a spurious validation error of 1.
validation_error <- mean(logit_pred_class != validation_data$default)
cat("Validation Set Error (Logistic Regression):", validation_error, "\n")
# Logistic Regression with income, balance, and student (dummy variable)
default_df$student <- sample(c(0, 1), n, replace = TRUE)
trainIndex <- createDataPartition(default_df$default_binary, p = 0.7, list = FALSE)
train_data <- default_df[trainIndex, ]
validation_data <- default_df[-trainIndex, ]
logit_model_student <- glm(default_binary ~ income + balance + student, data = train_data, family = binomial)
logit_pred_student <- predict(logit_model_student, newdata = validation_data, type = "response")
logit_pred_class_student <- ifelse(logit_pred_student > 0.5, "Yes", "No")
validation_error_student <- mean(logit_pred_class_student != validation_data$default) # compare like types, as above
cat("Validation Set Error with Student (Logistic Regression):", validation_error_student, "\n\n")
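# Because default was simulated independently of the predictors, both fits
# should hover near the ~20% base rate of "Yes". A minimal sketch of checking
# this with 10-fold cross-validation instead of a single split, using
# boot::cv.glm (boot ships with base R); the cost function counts
# misclassifications at the 0.5 threshold:
library(boot)
cost_misclass <- function(actual, prob) mean(abs(actual - prob) > 0.5)
full_fit <- glm(default_binary ~ income + balance, data = default_df, family = binomial)
cv_error <- cv.glm(default_df, full_fit, cost = cost_misclass, K = 10)$delta[1]
cat("10-fold CV Error (Logistic Regression):", cv_error, "\n")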
# Chapter 6: Regression Analysis for College Applications
cat("\nChapter 6: Regression Analysis for College Applications\n")
##
## Chapter 6: Regression Analysis for College Applications
# Simulated Data for College Applications (SAT, GPA, Acceptance Rate, Applications)
set.seed(789) # Set seed for reproducibility
n <- 1000 # Number of observations
SAT <- rnorm(n, mean = 1200, sd = 200)
GPA <- rnorm(n, mean = 3.5, sd = 0.3)
AcceptanceRate <- runif(n, min = 0.1, max = 0.9)
Apps <- rpois(n, lambda = 500)
college_df <- data.frame(SAT, GPA, AcceptanceRate, Apps)
# Split data into training and test sets
trainIndex <- createDataPartition(college_df$Apps, p = 0.7, list = FALSE)
train_data <- college_df[trainIndex, ]
test_data <- college_df[-trainIndex, ]
# Linear Regression
linear_model <- lm(Apps ~ ., data = train_data)
linear_pred <- predict(linear_model, newdata = test_data)
linear_mse <- mean((test_data$Apps - linear_pred)^2)
cat("Linear Regression Test Error:", linear_mse, "\n")
## Linear Regression Test Error: 517.4696
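# Because the simulated predictors carry no signal about Apps, a useful
# benchmark is simply predicting the training mean; a minimal sketch, whose
# error should land close to the regression models above:
baseline_mse <- mean((test_data$Apps - mean(train_data$Apps))^2)
cat("Mean-only Baseline Test Error:", baseline_mse, "\n")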
# Ridge Regression
ridge_model <- cv.glmnet(as.matrix(train_data[, -4]), train_data$Apps, alpha = 0)
best_lambda <- ridge_model$lambda.min
ridge_pred <- predict(ridge_model, newx = as.matrix(test_data[, -4]), s = best_lambda)
ridge_mse <- mean((test_data$Apps - ridge_pred)^2)
cat("Ridge Regression Test Error:", ridge_mse, "\n")
## Ridge Regression Test Error: 518.7761
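# glmnet needs a numeric matrix; with factor predictors the idiomatic
# preparation is model.matrix rather than as.matrix. A minimal sketch (all
# predictors here are already numeric, so both give the same matrix):
x_train <- model.matrix(Apps ~ ., data = train_data)[, -1] # drop intercept column
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
# cv.glmnet(x_train, train_data$Apps, alpha = 0) would then replace the as.matrix call above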
# Lasso Regression
lasso_model <- cv.glmnet(as.matrix(train_data[, -4]), train_data$Apps, alpha = 1)
best_lambda_lasso <- lasso_model$lambda.min
lasso_pred <- predict(lasso_model, newx = as.matrix(test_data[, -4]), s = best_lambda_lasso)
lasso_mse <- mean((test_data$Apps - lasso_pred)^2)
# Count non-zero coefficients at the lambda actually used for prediction,
# excluding the intercept; coef() on a cv.glmnet fit defaults to lambda.1se,
# which need not match the lambda.min used above.
num_nonzero_coef <- sum(coef(lasso_model, s = best_lambda_lasso) != 0) - 1
cat("Lasso Regression Test Error:", lasso_mse, "\n")
## Lasso Regression Test Error: 518.8514
cat("Number of Non-zero Coefficients (Lasso):", num_nonzero_coef, "\n")
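# cv.glmnet records two candidate penalties: lambda.min (lowest CV error) and
# lambda.1se (sparsest model within one standard error of it). A minimal
# sketch comparing how many coefficients survive at each:
cat("Non-zero at lambda.min:", sum(coef(lasso_model, s = "lambda.min") != 0) - 1, "\n")
cat("Non-zero at lambda.1se:", sum(coef(lasso_model, s = "lambda.1se") != 0) - 1, "\n")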
# PCR
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Choose the number of components with the lowest cross-validated PRESS.
# (The validation object has no $cp element, so the original indexing returned
# NULL, ncomp was empty, and the test error came out NaN.)
best_comp <- which.min(pcr_model$validation$PRESS)
pcr_pred <- predict(pcr_model, newdata = test_data, ncomp = best_comp)
pcr_mse <- mean((test_data$Apps - pcr_pred)^2)
cat("PCR Test Error (M selected):", pcr_mse, "\n")
cat("Number of Components selected:", best_comp, "\n")
# PLS
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
best_comp_pls <- which.min(pls_model$validation$PRESS) # same PRESS-based fix as for PCR
pls_pred <- predict(pls_model, newdata = test_data, ncomp = best_comp_pls)
pls_mse <- mean((test_data$Apps - pls_pred)^2)
cat("PLS Test Error (M selected):", pls_mse, "\n")
cat("Number of Components selected:", best_comp_pls, "\n")
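# The pls package also provides helpers for choosing M: validationplot() draws
# the cross-validation curve and selectNcomp() applies a one-standard-error
# rule. A minimal sketch:
validationplot(pcr_model, val.type = "MSEP")
best_comp_1se <- selectNcomp(pcr_model, method = "onesigma")
cat("PCR components by one-sigma rule:", best_comp_1se, "\n")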
# Commentary on the results
cat("\nCommentary on Results:\n")
##
## Commentary on Results:
cat("Linear Regression serves as a baseline model, predicting college applications with moderate accuracy.\n")
## Linear Regression serves as a baseline model, predicting college applications with moderate accuracy.
cat("Regularization techniques like Ridge and Lasso help control model complexity and improve generalization.\n")
## Regularization techniques like Ridge and Lasso help control model complexity and improve generalization.
cat("PCR and PLS offer dimensionality reduction, with PLS often providing competitive predictive performance.\n")
## PCR and PLS offer dimensionality reduction, with PLS often providing competitive predictive performance.