# Load necessary libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(class)
### Chapter 4: Predicting Crime Rate Using the Boston Data Set ###
# Classifies each observation as having a crime rate above ("High") or not
# above ("Low") the median of `crim`, then compares logistic regression, LDA,
# naive Bayes and KNN by accuracy on a held-out test set.

# Load Boston dataset
data(Boston)
boston_df <- Boston

# Create binary response variable: "High" when crim exceeds its median
median_crim <- median(boston_df$crim)
boston_df$crime_binary <- ifelse(boston_df$crim > median_crim, "High", "Low")
# Ensure 'crime_binary' is a factor; "Low" is the first (reference) level, so
# glm() models the probability of "High"
boston_df$crime_binary <- factor(
  boston_df$crime_binary,
  levels = c("Low", "High")
)

# `crime_binary` is a deterministic function of `crim`, so `crim` must not be
# used as a predictor: it leaks the answer and perfectly separates the classes.
# An explicit formula is used instead of `. - crim` because model.frame() keeps
# subtracted variables as columns, and naiveBayes() uses every non-response
# column of the model frame.
predictor_names <- setdiff(names(boston_df), c("crim", "crime_binary"))
crime_formula <- reformulate(predictor_names, response = "crime_binary")

# Split data into training and test sets (stratified on the response)
set.seed(123)
train_index <- createDataPartition(
  boston_df$crime_binary,
  p = 0.7,
  list = FALSE
)
train_data <- boston_df[train_index, ]
test_data <- boston_df[-train_index, ]

# NOTE: the accuracies captured from the earlier render are omitted below;
# they were obtained with `crim` among the predictors and no longer apply.
# Re-run the script to regenerate them.

# Logistic Regression
logit_model <- glm(crime_formula, data = train_data, family = "binomial")
logit_pred <- predict(logit_model, newdata = test_data, type = "response")
logit_class <- ifelse(logit_pred > 0.5, "High", "Low")
logit_acc <- mean(logit_class == test_data$crime_binary)
cat("Logistic Regression Accuracy:", logit_acc, "\n")

# LDA
lda_model <- lda(crime_formula, data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)
lda_acc <- mean(lda_pred$class == test_data$crime_binary)
cat("LDA Accuracy:", lda_acc, "\n")

# Naive Bayes
nb_model <- naiveBayes(crime_formula, data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
nb_acc <- mean(nb_pred == test_data$crime_binary)
cat("Naive Bayes Accuracy:", nb_acc, "\n")

# KNN
# KNN is distance based, so predictors are standardized first. The test set is
# scaled with the training set's centers and spreads so that no test-set
# information enters the preprocessing.
knn_train_x <- scale(train_data[, predictor_names])
knn_test_x <- scale(
  test_data[, predictor_names],
  center = attr(knn_train_x, "scaled:center"),
  scale = attr(knn_train_x, "scaled:scale")
)
knn_model <- knn(
  train = knn_train_x,
  test = knn_test_x,
  cl = train_data$crime_binary,
  k = 5
)
knn_acc <- mean(knn_model == test_data$crime_binary)
cat("KNN Accuracy:", knn_acc, "\n")
### Chapter 5: Logistic Regression for Default Prediction ###
# Fits logistic regressions for `default` on a simulated data set and reports
# validation-set accuracy, first with income + balance and then with the
# student indicator added.

# Generate example data for demonstration
# Every column is drawn independently of the others, so the predictors carry
# no real signal about `default` in this simulated data.
set.seed(123)
Default <- data.frame(
  default = sample(c("Yes", "No"), 1000, replace = TRUE, prob = c(0.05, 0.95)),
  income = rnorm(1000, mean = 45, sd = 15),
  balance = rgamma(1000, shape = 10, rate = 0.1),
  student = sample(c("Yes", "No"), 1000, replace = TRUE, prob = c(0.1, 0.9))
)
# Convert 'default' to factor; "No" is the reference level, so glm() models
# the probability of "Yes"
Default$default <- factor(Default$default, levels = c("No", "Yes"))
# Convert 'student' to dummy variable
Default$student <- ifelse(Default$student == "Yes", 1, 0)

# Split data into training and validation sets
set.seed(123)
train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_index, ]
val_data <- Default[-train_index, ]

# Predictions are converted to a factor with the same levels as the response
# before tabulating. Without this, a class that is never predicted is dropped
# from the table, the table stops being square, and diag() no longer picks the
# correctly classified cells.
class_levels <- levels(Default$default)

# Logistic Regression Model
log_reg <- glm(
  default ~ income + balance,
  data = train_data,
  family = "binomial"
)
# Predict on validation set
val_data$pred_default <- predict(log_reg, newdata = val_data, type = "response")
val_data$pred_class <- ifelse(val_data$pred_default > 0.5, "Yes", "No")
# Evaluate Model
conf_matrix <- table(
  Predicted = factor(val_data$pred_class, levels = class_levels),
  Actual = val_data$default
)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Logistic Regression Accuracy:", accuracy, "\n")
## Logistic Regression Accuracy: 0.9498328

# Repeat with student variable
log_reg_student <- glm(
  default ~ income + balance + student,
  data = train_data,
  family = "binomial"
)
val_data$pred_default_student <- predict(
  log_reg_student,
  newdata = val_data,
  type = "response"
)
val_data$pred_class_student <- ifelse(
  val_data$pred_default_student > 0.5,
  "Yes",
  "No"
)
# Evaluate Model with student variable
conf_matrix_student <- table(
  Predicted = factor(val_data$pred_class_student, levels = class_levels),
  Actual = val_data$default
)
accuracy_student <- sum(diag(conf_matrix_student)) / sum(conf_matrix_student)
cat("Logistic Regression Accuracy with Student:", accuracy_student, "\n")
## Logistic Regression Accuracy with Student: 0.9498328
### Chapter 6: Regression Analysis for College Applications ###
# Compares ordinary least squares, ridge and lasso regression for predicting
# `Apps` on a simulated data set, using test-set mean squared error.

# Simulated stand-in data: 100 rows, each column drawn on its own.
# The column order below fixes the order of random draws; keep it as is.
set.seed(123)
College <- data.frame(
  Apps        = rpois(100, lambda = 500),
  Accept      = rpois(100, lambda = 400),
  Enroll      = rpois(100, lambda = 300),
  Top10perc   = round(runif(100, min = 0, max = 100)),
  Top25perc   = round(runif(100, min = 0, max = 100)),
  F_Undergrad = rpois(100, lambda = 2000),
  P_Undergrad = rpois(100, lambda = 500),
  Outstate    = rnorm(100, mean = 10000, sd = 2000),
  Room_Board  = rnorm(100, mean = 5000, sd = 1000),
  Books       = rnorm(100, mean = 600, sd = 100),
  Personal    = rnorm(100, mean = 3000, sd = 500),
  PhD         = round(runif(100, min = 0, max = 100)),
  Terminal    = round(runif(100, min = 0, max = 100)),
  S_F_Ratio   = runif(100, min = 0, max = 20),
  perc_alumni = round(runif(100, min = 0, max = 100)),
  Expend      = rnorm(100, mean = 10000, sd = 2000),
  Grad_Rate   = round(runif(100, min = 0, max = 100))
)

# 70/30 train/test partition on the response
set.seed(123)
train_index <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]

# Mean squared error between observed and predicted values
test_mse <- function(observed, predicted) {
  mean((observed - predicted)^2)
}

# Resampling scheme shared by the two penalized fits
cv_control <- trainControl(method = "cv")

# Ordinary least squares on all predictors
lr <- lm(Apps ~ ., data = train_data)
lr_pred <- predict(lr, newdata = test_data)
lr_error <- test_mse(test_data$Apps, lr_pred)
cat("Linear Regression Test Error:", lr_error, "\n")
## Linear Regression Test Error: 428.868

# Ridge regression, penalty chosen by cross-validation
ridge_cv <- train(
  Apps ~ .,
  data = train_data,
  method = "ridge",
  trControl = cv_control
)
ridge_pred <- predict(ridge_cv, newdata = test_data)
ridge_error <- test_mse(test_data$Apps, ridge_pred)
cat("Ridge Regression Test Error:", ridge_error, "\n")
## Ridge Regression Test Error: 428.8613

# Lasso regression, penalty chosen by cross-validation
lasso_cv <- train(
  Apps ~ .,
  data = train_data,
  method = "lasso",
  trControl = cv_control
)
lasso_pred <- predict(lasso_cv, newdata = test_data)
lasso_error <- test_mse(test_data$Apps, lasso_pred)
cat("Lasso Regression Test Error:", lasso_error, "\n")
## Lasso Regression Test Error: 338.8139
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Example chunk: print summary statistics for each column of `cars`
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.