# Load necessary libraries
library(MASS) # For Boston dataset
library(caret) # For modeling functions
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071) # For Naive Bayes classifier
library(glmnet) # For ridge and lasso regression
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls) # For PCR and PLS regression
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Chapter 4: Predicting Crime Rate Using the Boston Data Set
cat("Chapter 4: Predicting Crime Rate Using the Boston Data Set\n")
## Chapter 4: Predicting Crime Rate Using the Boston Data Set
# Load Boston dataset
data(Boston)
boston <- Boston
# Create binary response variable for crime rate
median_crime <- median(boston$crim)
boston$crime_binary <- ifelse(boston$crim > median_crime, 1, 0)
# Split data into training and test sets
set.seed(123) # Set seed for reproducibility
trainIndex <- createDataPartition(boston$crime_binary, p = 0.7, list = FALSE)
train_data <- boston[trainIndex, ]
test_data <- boston[-trainIndex, ]
# Logistic Regression
# Note: the predictors include crim itself, from which crime_binary was derived,
# so the two classes are almost perfectly separable. That data leakage is what
# triggers the warnings below and inflates the test accuracy.
logit_model <- glm(crime_binary ~ ., data = train_data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logit_pred <- predict(logit_model, newdata = test_data, type = "response")
logit_acc <- mean((logit_pred > 0.5) == test_data$crime_binary)
cat("Logistic Regression Accuracy:", logit_acc, "\n")
## Logistic Regression Accuracy: 0.9866667
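# As a sanity check, the model can be refit without crim; this is a minimal
# sketch, and its accuracy will be lower than the leakage-inflated figure above.
logit_model_nocrim <- glm(crime_binary ~ . - crim, data = train_data, family = binomial)
logit_pred_nocrim <- predict(logit_model_nocrim, newdata = test_data, type = "response")
cat("Logistic Regression Accuracy (without crim):",
    mean((logit_pred_nocrim > 0.5) == test_data$crime_binary), "\n")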
# LDA
lda_model <- lda(crime_binary ~ ., data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)$class
lda_acc <- mean(lda_pred == test_data$crime_binary)
cat("LDA Accuracy:", lda_acc, "\n")
## LDA Accuracy: 0.8666667
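# predict() on an lda fit also returns per-class posterior probabilities, so a
# threshold other than 0.5 can be applied when one error type is costlier; a
# minimal sketch (the 0.5 threshold reproduces the $class labels used above):
lda_post <- predict(lda_model, newdata = test_data)$posterior[, "1"]
lda_pred_strict <- ifelse(lda_post > 0.7, 1, 0) # flag only high-confidence tracts
cat("LDA Accuracy at 0.7 threshold:", mean(lda_pred_strict == test_data$crime_binary), "\n")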
# Naive Bayes
nb_model <- naiveBayes(crime_binary ~ ., data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
nb_acc <- mean(nb_pred == test_data$crime_binary)
cat("Naive Bayes Accuracy:", nb_acc, "\n")
## Naive Bayes Accuracy: 0.9666667
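# e1071's naiveBayes can also return class probabilities via type = "raw"; a
# minimal sketch for inspecting how confident the classifier is on each tract:
nb_prob <- predict(nb_model, newdata = test_data, type = "raw")
head(nb_prob) # one row per test observation, one column per class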
# KNN (Classification)
train_data$crime_binary <- as.factor(train_data$crime_binary) # caret treats a factor outcome as classification, not regression
test_data$crime_binary <- as.factor(test_data$crime_binary)
knn_model <- train(crime_binary ~ ., data = train_data, method = "knn")
knn_pred <- predict(knn_model, newdata = test_data)
knn_acc <- mean(knn_pred == test_data$crime_binary)
cat("KNN Accuracy:", knn_acc, "\n\n")
## KNN Accuracy: 0.9333333
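# KNN is distance-based, so unscaled predictors let large-scale variables (e.g.
# tax) dominate the distance. A minimal sketch of centering/scaling and tuning
# k with caret; the tuneGrid values are illustrative, not definitive:
knn_tuned <- train(crime_binary ~ ., data = train_data, method = "knn",
                   preProcess = c("center", "scale"),
                   tuneGrid = data.frame(k = seq(1, 19, by = 2)))
cat("Best k:", knn_tuned$bestTune$k, "\n")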
# Chapter 5: Logistic Regression for Default Prediction
cat("\nChapter 5: Logistic Regression for Default Prediction\n")
##
## Chapter 5: Logistic Regression for Default Prediction
# Simulated Data for Default Prediction (Income, Balance, Default)
set.seed(456) # Set seed for reproducibility
n <- 1000 # Number of observations
income <- rnorm(n, mean = 50000, sd = 10000)
balance <- rnorm(n, mean = 1500, sd = 500)
default <- sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.2, 0.8))
default_df <- data.frame(income, balance, default)
# Convert default to binary response variable
default_df$default_binary <- ifelse(default_df$default == "Yes", 1, 0)
# Split data into training and validation sets
trainIndex <- createDataPartition(default_df$default_binary, p = 0.7, list = FALSE)
train_data <- default_df[trainIndex, ]
validation_data <- default_df[-trainIndex, ]
# Logistic Regression with income and balance
logit_model <- glm(default_binary ~ income + balance, data = train_data, family = binomial)
logit_pred <- predict(logit_model, newdata = validation_data, type = "response")
logit_pred_class <- ifelse(logit_pred > 0.5, "Yes", "No")
# Compare predicted labels against the Yes/No column, not the 0/1 column:
# comparing "Yes"/"No" strings with 0/1 counts every prediction as an error
# and reports a spurious validation error of 1.
validation_error <- mean(logit_pred_class != validation_data$default)
cat("Validation Set Error (Logistic Regression):", validation_error, "\n")
# Logistic Regression with income, balance, and student (dummy variable)
default_df$student <- sample(c(0, 1), n, replace = TRUE)
trainIndex <- createDataPartition(default_df$default_binary, p = 0.7, list = FALSE)
train_data <- default_df[trainIndex, ]
validation_data <- default_df[-trainIndex, ]
logit_model_student <- glm(default_binary ~ income + balance + student, data = train_data, family = binomial)
logit_pred_student <- predict(logit_model_student, newdata = validation_data, type = "response")
logit_pred_class_student <- ifelse(logit_pred_student > 0.5, "Yes", "No")
validation_error_student <- mean(logit_pred_class_student != validation_data$default) # compare like types, as above
cat("Validation Set Error with Student (Logistic Regression):", validation_error_student, "\n\n")
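# Because default was simulated independently of the predictors, both fits
# should hover near the ~20% base rate of "Yes". A minimal sketch of checking
# this with 10-fold cross-validation instead of a single split, using
# boot::cv.glm (boot ships with base R); the cost function counts
# misclassifications at the 0.5 threshold:
library(boot)
cost_misclass <- function(actual, prob) mean(abs(actual - prob) > 0.5)
full_fit <- glm(default_binary ~ income + balance, data = default_df, family = binomial)
cv_error <- cv.glm(default_df, full_fit, cost = cost_misclass, K = 10)$delta[1]
cat("10-fold CV Error (Logistic Regression):", cv_error, "\n")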
# Chapter 6: Regression Analysis for College Applications
cat("\nChapter 6: Regression Analysis for College Applications\n")
##
## Chapter 6: Regression Analysis for College Applications
# Simulated Data for College Applications (SAT, GPA, Acceptance Rate, Applications)
set.seed(789) # Set seed for reproducibility
n <- 1000 # Number of observations
SAT <- rnorm(n, mean = 1200, sd = 200)
GPA <- rnorm(n, mean = 3.5, sd = 0.3)
AcceptanceRate <- runif(n, min = 0.1, max = 0.9)
Apps <- rpois(n, lambda = 500)
college_df <- data.frame(SAT, GPA, AcceptanceRate, Apps)
# Split data into training and test sets
trainIndex <- createDataPartition(college_df$Apps, p = 0.7, list = FALSE)
train_data <- college_df[trainIndex, ]
test_data <- college_df[-trainIndex, ]
# Linear Regression
linear_model <- lm(Apps ~ ., data = train_data)
linear_pred <- predict(linear_model, newdata = test_data)
linear_mse <- mean((test_data$Apps - linear_pred)^2)
cat("Linear Regression Test Error:", linear_mse, "\n")
## Linear Regression Test Error: 517.4696
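# Because the simulated predictors carry no signal about Apps, a useful
# benchmark is simply predicting the training mean; a minimal sketch, whose
# error should land close to the regression models above:
baseline_mse <- mean((test_data$Apps - mean(train_data$Apps))^2)
cat("Mean-only Baseline Test Error:", baseline_mse, "\n")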
# Ridge Regression
ridge_model <- cv.glmnet(as.matrix(train_data[, -4]), train_data$Apps, alpha = 0)
best_lambda <- ridge_model$lambda.min
ridge_pred <- predict(ridge_model, newx = as.matrix(test_data[, -4]), s = best_lambda)
ridge_mse <- mean((test_data$Apps - ridge_pred)^2)
cat("Ridge Regression Test Error:", ridge_mse, "\n")
## Ridge Regression Test Error: 518.7761
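# glmnet needs a numeric matrix; with factor predictors the idiomatic
# preparation is model.matrix rather than as.matrix. A minimal sketch (all
# predictors here are already numeric, so both give the same matrix):
x_train <- model.matrix(Apps ~ ., data = train_data)[, -1] # drop intercept column
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
# cv.glmnet(x_train, train_data$Apps, alpha = 0) would then replace the as.matrix call above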
# Lasso Regression
lasso_model <- cv.glmnet(as.matrix(train_data[, -4]), train_data$Apps, alpha = 1)
best_lambda_lasso <- lasso_model$lambda.min
lasso_pred <- predict(lasso_model, newx = as.matrix(test_data[, -4]), s = best_lambda_lasso)
lasso_mse <- mean((test_data$Apps - lasso_pred)^2)
# Count non-zero coefficients at the lambda actually used for prediction,
# excluding the intercept; coef() on a cv.glmnet fit defaults to lambda.1se,
# which need not match the lambda.min used above.
num_nonzero_coef <- sum(coef(lasso_model, s = best_lambda_lasso) != 0) - 1
cat("Lasso Regression Test Error:", lasso_mse, "\n")
## Lasso Regression Test Error: 518.8514
cat("Number of Non-zero Coefficients (Lasso):", num_nonzero_coef, "\n")
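# cv.glmnet records two candidate penalties: lambda.min (lowest CV error) and
# lambda.1se (sparsest model within one standard error of it). A minimal
# sketch comparing how many coefficients survive at each:
cat("Non-zero at lambda.min:", sum(coef(lasso_model, s = "lambda.min") != 0) - 1, "\n")
cat("Non-zero at lambda.1se:", sum(coef(lasso_model, s = "lambda.1se") != 0) - 1, "\n")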
# PCR
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Choose the number of components with the lowest cross-validated PRESS.
# (The validation object has no $cp element, so the original indexing returned
# NULL, ncomp was empty, and the test error came out NaN.)
best_comp <- which.min(pcr_model$validation$PRESS)
pcr_pred <- predict(pcr_model, newdata = test_data, ncomp = best_comp)
pcr_mse <- mean((test_data$Apps - pcr_pred)^2)
cat("PCR Test Error (M selected):", pcr_mse, "\n")
cat("Number of Components selected:", best_comp, "\n")
# PLS
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
best_comp_pls <- which.min(pls_model$validation$PRESS) # same PRESS-based fix as for PCR
pls_pred <- predict(pls_model, newdata = test_data, ncomp = best_comp_pls)
pls_mse <- mean((test_data$Apps - pls_pred)^2)
cat("PLS Test Error (M selected):", pls_mse, "\n")
cat("Number of Components selected:", best_comp_pls, "\n")
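# The pls package also provides helpers for choosing M: validationplot() draws
# the cross-validation curve and selectNcomp() applies a one-standard-error
# rule. A minimal sketch:
validationplot(pcr_model, val.type = "MSEP")
best_comp_1se <- selectNcomp(pcr_model, method = "onesigma")
cat("PCR components by one-sigma rule:", best_comp_1se, "\n")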
# Commentary on the results
cat("\nCommentary on Results:\n")
##
## Commentary on Results:
cat("Linear Regression serves as a baseline model, predicting college applications with moderate accuracy.\n")
## Linear Regression serves as a baseline model, predicting college applications with moderate accuracy.
cat("Regularization techniques like Ridge and Lasso help control model complexity and improve generalization.\n")
## Regularization techniques like Ridge and Lasso help control model complexity and improve generalization.
cat("PCR and PLS offer dimensionality reduction, with PLS often providing competitive predictive performance.\n")
## PCR and PLS offer dimensionality reduction, with PLS often providing competitive predictive performance.