Week 4 Homework

Question 9.1 - PCA Regression

uscrime <- read.table("C:/Users/moham/Downloads/Georgia Tech University/WK4/week 4 hw-summer 2026/week 4 data-summer/data 9.1/uscrime.txt", header=TRUE)
pca <- prcomp(uscrime[, 1:15], scale.=TRUE)
summary(pca)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.4534 1.6739 1.4160 1.07806 0.97893 0.74377 0.56729
## Proportion of Variance 0.4013 0.1868 0.1337 0.07748 0.06389 0.03688 0.02145
## Cumulative Proportion  0.4013 0.5880 0.7217 0.79920 0.86308 0.89996 0.92142
##                            PC8     PC9    PC10    PC11    PC12    PC13   PC14
## Standard deviation     0.55444 0.48493 0.44708 0.41915 0.35804 0.26333 0.2418
## Proportion of Variance 0.02049 0.01568 0.01333 0.01171 0.00855 0.00462 0.0039
## Cumulative Proportion  0.94191 0.95759 0.97091 0.98263 0.99117 0.99579 0.9997
##                           PC15
## Standard deviation     0.06793
## Proportion of Variance 0.00031
## Cumulative Proportion  1.00000

pc_scores_2 <- as.data.frame(pca$x[, 1:2])
pc_scores_2$Crime <- uscrime$Crime
lm_pca2 <- lm(Crime ~ ., data=pc_scores_2)
summary(lm_pca2)

## 
## Call:
## lm(formula = Crime ~ ., data = pc_scores_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -575.1 -222.1    4.4  159.6  905.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   905.09      49.52  18.279  < 2e-16 ***
## PC1            65.22      20.40   3.197  0.00257 ** 
## PC2           -70.08      29.90  -2.344  0.02367 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 339.5 on 44 degrees of freedom
## Multiple R-squared:  0.2631, Adjusted R-squared:  0.2296 
## F-statistic: 7.856 on 2 and 44 DF,  p-value: 0.001209

beta_pc2 <- coef(lm_pca2)[-1]
rotation2 <- pca$rotation[, 1:2]
beta_scaled2 <- rotation2 %*% beta_pc2
beta_orig2 <- beta_scaled2 / pca$scale
intercept2 <- coef(lm_pca2)[1] - sum(beta_orig2 * pca$center)
cat("Intercept:", intercept2, "\n")

## Intercept: 1484.748

print(beta_orig2)

##                 [,1]
## M      -1.926244e+01
## So     -2.187904e+01
## Ed      6.353881e+00
## Po1     1.313555e+01
## Po2     1.386953e+01
## LF     -2.696512e+02
## M.F    -6.803086e+00
## Pop     1.053809e+00
## NW     -3.079661e-01
## U1      1.151190e+02
## U2      2.461094e+01
## Wealth  3.126959e-02
## Ineq   -5.496028e+00
## Prob   -1.230546e+03
## Time    3.569499e+00

new_city <- data.frame(M=14.0, So=0, Ed=10.0, Po1=12.0, Po2=15.5,
  LF=0.640, M.F=94.0, Pop=150, NW=1.1, U1=0.120, U2=3.6,
  Wealth=3200, Ineq=20.1, Prob=0.04, Time=39.0)
new_scaled <- as.matrix(sweep(sweep(new_city, 2, pca$center, "-"), 2, pca$scale, "/"))
new_pc2 <- as.data.frame(new_scaled %*% pca$rotation[, 1:2])
colnames(new_pc2) <- c("PC1", "PC2")
predict(lm_pca2, newdata=new_pc2)

##        1 
## 1178.877

The PCA model predicted crime rate of 1,179 for the new city, compared to 155 from the full OLS model. The PCA model is more realistic as it avoids overfitting.

Question 10.1a - Regression Tree

library(rpart)

## Warning: package 'rpart' was built under R version 4.5.3

tree_model <- rpart(Crime ~ ., data=uscrime, method="anova")
printcp(tree_model)

## 
## Regression tree:
## rpart(formula = Crime ~ ., data = uscrime, method = "anova")
## 
## Variables actually used in tree construction:
## [1] NW  Po1 Pop
## 
## Root node error: 6880928/47 = 146403
## 
## n= 47 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.362963      0   1.00000 1.05318 0.26538
## 2 0.148143      1   0.63704 0.99112 0.23354
## 3 0.051732      2   0.48889 1.08641 0.25840
## 4 0.010000      3   0.43716 1.07598 0.25934

plot(tree_model)
text(tree_model, cex=0.8)

Takeaway 1: Police spending (Po1) is the strongest predictor. Cities with Po1 >= 7.65 have much higher crime — likely reverse causation.

Takeaway 2: Within high-spending cities, NW >= 7.65 predicts dramatically higher crime, reflecting historical inequality patterns.

Question 10.1b - Random Forest

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.5.3

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

set.seed(42)
rf_model <- randomForest(Crime ~ ., data=uscrime, ntree=500, importance=TRUE)
print(rf_model)

## 
## Call:
##  randomForest(formula = Crime ~ ., data = uscrime, ntree = 500,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 87379.48
##                     % Var explained: 40.32

importance(rf_model)

##            %IncMSE IncNodePurity
## M       3.00690191     225937.17
## So     -0.01356852      22096.35
## Ed      1.80268010     204621.10
## Po1    10.83366782    1164661.94
## Po2    10.47727967    1116121.11
## LF      2.15720728     258079.02
## M.F     1.46736557     315056.16
## Pop    -0.83229685     323553.45
## NW      9.10386424     530706.09
## U1     -1.22865150     135559.27
## U2      1.15212612     196488.67
## Wealth  4.60644846     726978.33
## Ineq    1.34905118     214692.60
## Prob    8.48888959     882981.35
## Time    0.93682829     186298.58

varImpPlot(rf_model)

Question 10.2 - Logistic Regression Situation

I used to work for the Los Angeles Department of Public Social Services identifying fraudulent benefit cases. Logistic regression would predict whether a transaction is fraudulent (yes/no). Predictors: transaction location, amount, state, average amount, transaction time.

Question 10.3 - German Credit Logistic Regression

germancredit <- read.table("C:/Users/moham/Downloads/Georgia Tech University/WK4/week 4 hw-summer 2026/week 4 data-summer/data 10.3/germancredit.txt", header=FALSE)
gc <- germancredit
gc$response <- ifelse(gc$V21 == 1, 1, 0)
logit_model <- glm(response ~ . - V21, data=gc, family=binomial(link="logit"))
summary(logit_model)

## 
## Call:
## glm(formula = response ~ . - V21, family = binomial(link = "logit"), 
##     data = gc)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.005e-01  1.084e+00  -0.369 0.711869    
## V1A12        3.749e-01  2.179e-01   1.720 0.085400 .  
## V1A13        9.657e-01  3.692e-01   2.616 0.008905 ** 
## V1A14        1.712e+00  2.322e-01   7.373 1.66e-13 ***
## V2          -2.786e-02  9.296e-03  -2.997 0.002724 ** 
## V3A31       -1.434e-01  5.489e-01  -0.261 0.793921    
## V3A32        5.861e-01  4.305e-01   1.362 0.173348    
## V3A33        8.532e-01  4.717e-01   1.809 0.070470 .  
## V3A34        1.436e+00  4.399e-01   3.264 0.001099 ** 
## V4A41        1.666e+00  3.743e-01   4.452 8.51e-06 ***
## V4A410       1.489e+00  7.764e-01   1.918 0.055163 .  
## V4A42        7.916e-01  2.610e-01   3.033 0.002421 ** 
## V4A43        8.916e-01  2.471e-01   3.609 0.000308 ***
## V4A44        5.228e-01  7.623e-01   0.686 0.492831    
## V4A45        2.164e-01  5.500e-01   0.393 0.694000    
## V4A46       -3.628e-02  3.965e-01  -0.092 0.927082    
## V4A48        2.059e+00  1.212e+00   1.699 0.089297 .  
## V4A49        7.401e-01  3.339e-01   2.216 0.026668 *  
## V5          -1.283e-04  4.444e-05  -2.887 0.003894 ** 
## V6A62        3.577e-01  2.861e-01   1.250 0.211130    
## V6A63        3.761e-01  4.011e-01   0.938 0.348476    
## V6A64        1.339e+00  5.249e-01   2.551 0.010729 *  
## V6A65        9.467e-01  2.625e-01   3.607 0.000310 ***
## V7A72        6.691e-02  4.270e-01   0.157 0.875475    
## V7A73        1.828e-01  4.105e-01   0.445 0.656049    
## V7A74        8.310e-01  4.455e-01   1.866 0.062110 .  
## V7A75        2.766e-01  4.134e-01   0.669 0.503410    
## V8          -3.301e-01  8.828e-02  -3.739 0.000185 ***
## V9A92        2.755e-01  3.865e-01   0.713 0.476040    
## V9A93        8.161e-01  3.799e-01   2.148 0.031718 *  
## V9A94        3.671e-01  4.537e-01   0.809 0.418448    
## V10A102     -4.360e-01  4.101e-01  -1.063 0.287700    
## V10A103      9.786e-01  4.243e-01   2.307 0.021072 *  
## V11         -4.776e-03  8.641e-02  -0.055 0.955920    
## V12A122     -2.814e-01  2.534e-01  -1.111 0.266630    
## V12A123     -1.945e-01  2.360e-01  -0.824 0.409743    
## V12A124     -7.304e-01  4.245e-01  -1.721 0.085308 .  
## V13          1.454e-02  9.222e-03   1.576 0.114982    
## V14A142      1.232e-01  4.119e-01   0.299 0.764878    
## V14A143      6.463e-01  2.391e-01   2.703 0.006871 ** 
## V15A152      4.436e-01  2.347e-01   1.890 0.058715 .  
## V15A153      6.839e-01  4.770e-01   1.434 0.151657    
## V16         -2.721e-01  1.895e-01  -1.436 0.151109    
## V17A172     -5.361e-01  6.796e-01  -0.789 0.430160    
## V17A173     -5.547e-01  6.549e-01  -0.847 0.397015    
## V17A174     -4.795e-01  6.623e-01  -0.724 0.469086    
## V18         -2.647e-01  2.492e-01  -1.062 0.288249    
## V19A192      3.000e-01  2.013e-01   1.491 0.136060    
## V20A202      1.392e+00  6.258e-01   2.225 0.026095 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1221.73  on 999  degrees of freedom
## Residual deviance:  895.82  on 951  degrees of freedom
## AIC: 993.82
## 
## Number of Fisher Scoring iterations: 5

pred_probs <- predict(logit_model, type="response")
thresholds <- seq(0.01, 0.99, by=0.01)
costs <- sapply(thresholds, function(t) {
  pc <- ifelse(pred_probs >= t, 1, 0)
  FP <- sum(pc == 1 & gc$response == 0)
  FN <- sum(pc == 0 & gc$response == 1)
  5*FP + 1*FN
})
best_threshold <- thresholds[which.min(costs)]
cat("Optimal threshold:", best_threshold, "\n")

## Optimal threshold: 0.87

pred_opt <- ifelse(pred_probs >= best_threshold, 1, 0)
table(Predicted=pred_opt, Actual=gc$response)

##          Actual
## Predicted   0   1
##         0 280 357
##         1  20 343

The optimal threshold is 0.87, reducing total cost from 774 to 457 — a 41% improvement over the default 0.5 threshold.