Question 1
# Step 1: Load the required packages and the dataset
library(ISLR)   # Auto data set
library(MASS)   # lda()
library(class)  # knn()
data(Auto)
mpg_median <- median(Auto$mpg)
mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- mpg01
# Boxplots of each predictor grouped by mpg01
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")

boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")

boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")

boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")

boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")

boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01")

boxplot(origin ~ mpg01, data = Auto, main = "Origin vs mpg01")

# Cylinders, displacement, horsepower, and weight are typically lower for cars with high mpg (mpg01 = 1).
# Year and origin also show some separation between the two groups.
# Acceleration shows little association with mpg01.
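# As a quick numeric check of these impressions (a sketch, not part of the original analysis),
# the group means of each predictor can be compared by mpg01:
aggregate(cbind(cylinders, displacement, horsepower, weight,
                acceleration, year, origin) ~ mpg01, data = Auto, FUN = mean)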
set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto)/2)
train <- Auto[train_index, ]
test <- Auto[-train_index, ]
lda_fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight + year + origin, data = train)
lda_pred <- predict(lda_fit, test)
lda_class <- lda_pred$class
lda_error <- mean(lda_class != test$mpg01)
lda_error
## [1] 0.1173469
glm_fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight + year + origin,
               data = train, family = binomial)
glm_probs <- predict(glm_fit, test, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
glm_error <- mean(glm_pred != test$mpg01)
glm_error
## [1] 0.09183673
# Scale the predictors (KNN is distance-based); apply the training means and SDs to the test set
train_X <- scale(train[, c("cylinders", "displacement", "horsepower", "weight", "year", "origin")])
test_X <- scale(test[, c("cylinders", "displacement", "horsepower", "weight", "year", "origin")],
                center = attr(train_X, "scaled:center"),
                scale = attr(train_X, "scaled:scale"))
train_y <- train$mpg01
# Try K = 1 through 20 and record the test error for each
knn_errors <- c()
for (k in 1:20) {
  knn_pred <- knn(train_X, test_X, train_y, k = k)
  error <- mean(knn_pred != test$mpg01)
  knn_errors[k] <- error
}
# K with the lowest test error
best_k <- which.min(knn_errors)
best_k
## [1] 1
knn_errors[best_k]
## [1] 0.09183673
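# To visualize how the test error changes with K (a sketch, not part of the original output):
plot(1:20, knn_errors, type = "b", pch = 19,
     xlab = "K", ylab = "Test Error", main = "KNN Test Error by K")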
#summary table
test_errors <- data.frame(
  Method = c("LDA", "Logistic Regression", paste("KNN (K =", best_k, ")")),
  Test_Error = c(lda_error, glm_error, knn_errors[best_k])
)
print(test_errors)
## Method Test_Error
## 1 LDA 0.11734694
## 2 Logistic Regression 0.09183673
## 3 KNN (K = 1 ) 0.09183673
Question 2
data("College")
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
# The data set contains 777 US colleges and 18 variables, such as whether the college is private, the number of students enrolled, and out-of-state tuition.
set.seed(1)
train_index <- sample(1:nrow(College), nrow(College)/2)
train <- College[train_index, ]
test <- College[-train_index, ]
#forward stepwise selection
regfit_fwd <- regsubsets(Outstate ~ ., data = train, method = "forward")
summary_fwd <- summary(regfit_fwd)
par(mfrow = c(1, 3))
plot(summary_fwd$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
plot(summary_fwd$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
plot(summary_fwd$adjr2, xlab = "Number of Variables", ylab = "Adjusted R²", type = "b")

# Assuming the eight variables selected by forward stepwise are Private, Room.Board, PhD, Terminal, S.F.Ratio, perc.alumni, Expend, and Grad.Rate, these are used in the GAM below.
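# The selection can be confirmed directly (a sketch; regsubsets() considers models of up to
# 8 variables by default, so the 8-variable model's coefficients are extracted here):
names(coef(regfit_fwd, id = 8))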
gam_model <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(Terminal) +
                   s(S.F.Ratio) + s(perc.alumni) + s(Expend) + s(Grad.Rate),
                 data = train)
par(mfrow = c(2, 4))
plot(gam_model, se = TRUE, col = "blue")

# Each plot shows the estimated effect of one predictor on out-of-state tuition.
# The wider the confidence band, the more uncertainty in that region; for example, the effect of Room.Board is estimated more precisely than those of PhD or Terminal.
preds <- predict(gam_model, newdata = test)
#RMSE calculation
rmse <- sqrt(mean((test$Outstate - preds)^2))
rmse
## [1] 1829.689
# This is the root mean squared error (RMSE): on average, the model's predicted out-of-state tuition differs from the actual tuition by roughly $1,830.
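# For context, a baseline comparison (a sketch; lm_fit and lm_rmse are not part of the original
# analysis): RMSE of an ordinary linear model with the same eight predictors.
lm_fit <- lm(Outstate ~ Private + Room.Board + PhD + Terminal +
               S.F.Ratio + perc.alumni + Expend + Grad.Rate, data = train)
lm_rmse <- sqrt(mean((test$Outstate - predict(lm_fit, newdata = test))^2))
lm_rmse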
summary(gam_model)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(Terminal) +
## s(S.F.Ratio) + s(perc.alumni) + s(Expend) + s(Grad.Rate),
## data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6446.90 -1128.59 -20.65 1211.88 6812.89
##
## (Dispersion Parameter for gaussian family taken to be 3682843)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1318456803 on 357.9997 degrees of freedom
## AIC: 6998.121
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1700512563 1700512563 461.7391 < 2.2e-16 ***
## s(Room.Board) 1 1689412313 1689412313 458.7250 < 2.2e-16 ***
## s(PhD) 1 349662610 349662610 94.9437 < 2.2e-16 ***
## s(Terminal) 1 18562912 18562912 5.0404 0.02537 *
## s(S.F.Ratio) 1 152293040 152293040 41.3520 4.070e-10 ***
## s(perc.alumni) 1 248087751 248087751 67.3631 4.128e-15 ***
## s(Expend) 1 558930934 558930934 151.7662 < 2.2e-16 ***
## s(Grad.Rate) 1 86968217 86968217 23.6144 1.764e-06 ***
## Residuals 358 1318456803 3682843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 1.6469 0.17822
## s(PhD) 3 2.1718 0.09101 .
## s(Terminal) 3 2.1322 0.09582 .
## s(S.F.Ratio) 3 3.0363 0.02919 *
## s(perc.alumni) 3 0.4348 0.72826
## s(Expend) 3 20.0512 4.911e-12 ***
## s(Grad.Rate) 3 0.9436 0.41964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In the "Anova for Nonparametric Effects" table, a small p-value indicates evidence that the smooth term is non-linear.
# Based on the GAM output, there is strong evidence of a non-linear relationship between the response (out-of-state tuition) and both Expend and S.F.Ratio, given their small p-values. There is also weak evidence of non-linearity for PhD and Terminal.
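# One way to check the non-linearity more formally (a sketch; gam_linear below is a hypothetical
# comparison model, not part of the original analysis): refit the GAM with a linear Expend term
# and compare it to the full model with an F-test.
gam_linear <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(Terminal) +
                    s(S.F.Ratio) + s(perc.alumni) + Expend + s(Grad.Rate),
                  data = train)
anova(gam_linear, gam_model, test = "F")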
Question 3
library(dplyr)    # group_by(), summarise()
library(ggplot2)  # ggplot()
desktop_path <- file.path(path.expand("~"), "Desktop")
cc_data <- read.csv(file.path(desktop_path, "cc_data.csv"))
cc_data_clean <- na.omit(cc_data[ , !(names(cc_data) %in% "CUST_ID")])
# Scale the features so each has mean 0 and standard deviation 1
cc_data_scaled <- scale(cc_data_clean)
wss <- numeric(20)
for (k in 1:20) {
  set.seed(123)
  kmeans_result <- kmeans(cc_data_scaled, centers = k, nstart = 20)
  wss[k] <- kmeans_result$tot.withinss
}
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 431800)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
#elbow plot
plot(1:20, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of Clusters",
     ylab = "Total Within-Cluster Sum of Squares",
     main = "Elbow Method for Choosing K")

# The elbow occurs at roughly K = 7 or 8, so K = 8 clusters are used in the final model.
set.seed(123)
k <- 8
final_kmeans <- kmeans(cc_data_scaled, centers = k, nstart = 20)
table(final_kmeans$cluster)
##
## 1 2 3 4 5 6 7 8
## 855 590 1068 348 2692 24 1959 1100
# Cluster sizes vary widely, from as small as 24 observations (Cluster 6) to as many as 2692 (Cluster 5).
# Most clusters are moderately sized.
# Cluster 6 is very small; it may represent a very distinct or outlier group.
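# A quick look at cluster 6's centroid on the scaled features (a sketch, not part of the
# original output) helps check whether it really is a distinct group:
round(final_kmeans$centers[6, ], 2)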
# Profile the clusters using the unscaled data
cc_data_clean$Cluster <- as.factor(final_kmeans$cluster)
cluster_summary <- cc_data_clean %>%
  group_by(Cluster) %>%
  summarise(Balance = mean(BALANCE),
            CreditLimit = mean(CREDIT_LIMIT),
            Payments = mean(PAYMENTS))
cluster_summary_long <- tidyr::pivot_longer(cluster_summary,
                                            cols = -Cluster,
                                            names_to = "Feature",
                                            values_to = "Average")
ggplot(cluster_summary_long, aes(x = Cluster, y = Average, fill = Feature)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Balance, Credit Limit, and Payments by Cluster") +
  theme_minimal()

# Cluster 6 stands out: its average balance, credit limit, and payments are much higher than those of the other clusters. This group appears to carry high credit card balances but also makes large payments on them, likely a wealthier subset of customers compared with the other clusters.