Question 1
# Step 1: Load the required packages and the dataset
library(ISLR)   # Auto data set
library(MASS)   # lda()
library(class)  # knn()
data(Auto)
mpg_median <- median(Auto$mpg)
mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- mpg01
# Boxplots of each predictor grouped by mpg01
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")

boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")

boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")

boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")

boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")

boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01")

boxplot(origin ~ mpg01, data = Auto, main = "Origin vs mpg01")

# Cylinders, displacement, horsepower, and weight are typically lower for cars with high mpg (mpg01 = 1).
# Year and origin also show some separation between the two groups.
# Acceleration shows little association with mpg01.
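# As a quick numeric check of these impressions (a sketch, not part of the original analysis),
# the group means of each predictor can be compared by mpg01:
aggregate(cbind(cylinders, displacement, horsepower, weight,
                acceleration, year, origin) ~ mpg01, data = Auto, FUN = mean)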
set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto)/2)
train <- Auto[train_index, ]
test <- Auto[-train_index, ]
lda_fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight + year + origin, data = train)
lda_pred <- predict(lda_fit, test)
lda_class <- lda_pred$class
lda_error <- mean(lda_class != test$mpg01)
lda_error
## [1] 0.1173469
glm_fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight + year + origin,
               data = train, family = binomial)
glm_probs <- predict(glm_fit, test, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
glm_error <- mean(glm_pred != test$mpg01)
glm_error
## [1] 0.09183673
# Scale the predictors (KNN is distance-based); apply the training means and SDs to the test set
train_X <- scale(train[, c("cylinders", "displacement", "horsepower", "weight", "year", "origin")])
test_X <- scale(test[, c("cylinders", "displacement", "horsepower", "weight", "year", "origin")],
                center = attr(train_X, "scaled:center"),
                scale = attr(train_X, "scaled:scale"))
train_y <- train$mpg01
# Try K = 1 through 20 and record the test error for each
knn_errors <- c()
for (k in 1:20) {
  knn_pred <- knn(train_X, test_X, train_y, k = k)
  error <- mean(knn_pred != test$mpg01)
  knn_errors[k] <- error
}
# K with the lowest test error
best_k <- which.min(knn_errors)
best_k
## [1] 1
knn_errors[best_k]
## [1] 0.09183673
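# To visualize how the test error changes with K (a sketch, not part of the original output):
plot(1:20, knn_errors, type = "b", pch = 19,
     xlab = "K", ylab = "Test Error", main = "KNN Test Error by K")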
#summary table
test_errors <- data.frame(
  Method = c("LDA", "Logistic Regression", paste("KNN (K =", best_k, ")")),
  Test_Error = c(lda_error, glm_error, knn_errors[best_k])
)
print(test_errors)
## Method Test_Error
## 1 LDA 0.11734694
## 2 Logistic Regression 0.09183673
## 3 KNN (K = 1 ) 0.09183673
Question 2
data("College")
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# The data set contains 777 US colleges and 18 variables, such as whether the college is private, the number of students enrolled, and out-of-state tuition.
set.seed(1)
train_index <- sample(1:nrow(College), nrow(College)/2)
train <- College[train_index, ]
test <- College[-train_index, ]
#forward stepwise selection
regfit_fwd <- regsubsets(Outstate ~ ., data = train, method = "forward")
summary_fwd <- summary(regfit_fwd)
par(mfrow = c(1, 3))
plot(summary_fwd$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
plot(summary_fwd$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
plot(summary_fwd$adjr2, xlab = "Number of Variables", ylab = "Adjusted R²", type = "b")

# Assuming the eight variables selected by forward stepwise are Private, Room.Board, PhD, Terminal, S.F.Ratio, perc.alumni, Expend, and Grad.Rate, these are used in the GAM below.
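# The selection can be confirmed directly (a sketch; regsubsets() considers models of up to
# 8 variables by default, so the 8-variable model's coefficients are extracted here):
names(coef(regfit_fwd, id = 8))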
gam_model <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(Terminal) +
                   s(S.F.Ratio) + s(perc.alumni) + s(Expend) + s(Grad.Rate),
                 data = train)
par(mfrow = c(2, 4))
plot(gam_model, se = TRUE, col = "blue")

# Each plot shows the estimated effect of one predictor on out-of-state tuition.
# The wider the confidence band, the more uncertainty in that region; for example, the effect of Room.Board is estimated more precisely than those of PhD or Terminal.
preds <- predict(gam_model, newdata = test)
#RMSE calculation
rmse <- sqrt(mean((test$Outstate - preds)^2))
rmse
## [1] 1829.689
# This is the root mean squared error (RMSE): on average, the model's predicted out-of-state tuition differs from the actual tuition by roughly $1,830.
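# For context, a baseline comparison (a sketch; lm_fit and lm_rmse are not part of the original
# analysis): RMSE of an ordinary linear model with the same eight predictors.
lm_fit <- lm(Outstate ~ Private + Room.Board + PhD + Terminal +
               S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = train)
lm_rmse <- sqrt(mean((test$Outstate - predict(lm_fit, newdata = test))^2))
lm_rmse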
summary(gam_model)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(Terminal) +
## s(S.F.Ratio) + s(perc.alumni) + s(Expend) + s(Grad.Rate),
## data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6446.90 -1128.59 -20.65 1211.88 6812.89
##
## (Dispersion Parameter for gaussian family taken to be 3682843)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1318456803 on 357.9997 degrees of freedom
## AIC: 6998.121
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1700512563 1700512563 461.7391 < 2.2e-16 ***
## s(Room.Board) 1 1689412313 1689412313 458.7250 < 2.2e-16 ***
## s(PhD) 1 349662610 349662610 94.9437 < 2.2e-16 ***
## s(Terminal) 1 18562912 18562912 5.0404 0.02537 *
## s(S.F.Ratio) 1 152293040 152293040 41.3520 4.070e-10 ***
## s(perc.alumni) 1 248087751 248087751 67.3631 4.128e-15 ***
## s(Expend) 1 558930934 558930934 151.7662 < 2.2e-16 ***
## s(Grad.Rate) 1 86968217 86968217 23.6144 1.764e-06 ***
## Residuals 358 1318456803 3682843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 1.6469 0.17822
## s(PhD) 3 2.1718 0.09101 .
## s(Terminal) 3 2.1322 0.09582 .
## s(S.F.Ratio) 3 3.0363 0.02919 *
## s(perc.alumni) 3 0.4348 0.72826
## s(Expend) 3 20.0512 4.911e-12 ***
## s(Grad.Rate) 3 0.9436 0.41964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In the "Anova for Nonparametric Effects" table, a small p-value indicates evidence that the smooth term is non-linear.
# Based on the GAM output, there is strong evidence of a non-linear relationship between the response (out-of-state tuition) and both Expend and S.F.Ratio, given their small p-values. There is also weak evidence of non-linearity for PhD and Terminal.
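# One way to check the non-linearity more formally (a sketch; gam_linear below is a hypothetical
# comparison model, not part of the original analysis): refit the GAM with a linear Expend term
# and compare it to the full model with an F-test.
gam_linear <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(Terminal) +
                    s(S.F.Ratio) + s(perc.alumni) + Expend + s(Grad.Rate),
                  data = train)
anova(gam_linear, gam_model, test = "F")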
Question 3
library(dplyr)    # group_by(), summarise()
library(ggplot2)  # ggplot()
desktop_path <- file.path(path.expand("~"), "Desktop")
cc_data <- read.csv(file.path(desktop_path, "cc_data.csv"))
cc_data_clean <- na.omit(cc_data[ , !(names(cc_data) %in% "CUST_ID")])
# Scale the features so each has mean 0 and standard deviation 1
cc_data_scaled <- scale(cc_data_clean)
wss <- numeric(20)
for (k in 1:20) {
  set.seed(123)
  kmeans_result <- kmeans(cc_data_scaled, centers = k, nstart = 20)
  wss[k] <- kmeans_result$tot.withinss
}
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 431800)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
#elbow plot
plot(1:20, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of Clusters",
     ylab = "Total Within-Cluster Sum of Squares",
     main = "Elbow Method for Choosing K")

# The elbow occurs at roughly K = 7 or 8, so K = 8 clusters are used in the final model.
set.seed(123)
k <- 8
final_kmeans <- kmeans(cc_data_scaled, centers = k, nstart = 20)
table(final_kmeans$cluster)
##
## 1 2 3 4 5 6 7 8
## 855 590 1068 348 2692 24 1959 1100
# Cluster sizes vary widely, from as small as 24 observations (Cluster 6) to as many as 2692 (Cluster 5).
# Most clusters are moderately sized.
# Cluster 6 is very small; it may represent a very distinct or outlier group.
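# A quick look at cluster 6's centroid on the scaled features (a sketch, not part of the
# original output) helps check whether it really is a distinct group:
round(final_kmeans$centers[6, ], 2)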
# Profile the clusters using the unscaled data
cc_data_clean$Cluster <- as.factor(final_kmeans$cluster)
cluster_summary <- cc_data_clean %>%
  group_by(Cluster) %>%
  summarise(Balance = mean(BALANCE),
            CreditLimit = mean(CREDIT_LIMIT),
            Payments = mean(PAYMENTS))
cluster_summary_long <- tidyr::pivot_longer(cluster_summary,
                                            cols = -Cluster,
                                            names_to = "Feature",
                                            values_to = "Average")
ggplot(cluster_summary_long, aes(x = Cluster, y = Average, fill = Feature)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Balance, Credit Limit, and Payments by Cluster") +
  theme_minimal()

# Cluster 6 stands out: its average balance, credit limit, and payments are much higher than those of the other clusters. This group appears to carry high credit card balances but also makes large payments on them, likely a wealthier subset of customers compared with the other clusters.