uscrime <- read.table("C:/Users/moham/Downloads/Georgia Tech University/WK4/week 4 hw-summer 2026/week 4 data-summer/data 9.1/uscrime.txt", header=TRUE)
pca <- prcomp(uscrime[, 1:15], scale.=TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4534 1.6739 1.4160 1.07806 0.97893 0.74377 0.56729
## Proportion of Variance 0.4013 0.1868 0.1337 0.07748 0.06389 0.03688 0.02145
## Cumulative Proportion 0.4013 0.5880 0.7217 0.79920 0.86308 0.89996 0.92142
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.55444 0.48493 0.44708 0.41915 0.35804 0.26333 0.2418
## Proportion of Variance 0.02049 0.01568 0.01333 0.01171 0.00855 0.00462 0.0039
## Cumulative Proportion 0.94191 0.95759 0.97091 0.98263 0.99117 0.99579 0.9997
## PC15
## Standard deviation 0.06793
## Proportion of Variance 0.00031
## Cumulative Proportion 1.00000
pc_scores_2 <- as.data.frame(pca$x[, 1:2])
pc_scores_2$Crime <- uscrime$Crime
lm_pca2 <- lm(Crime ~ ., data=pc_scores_2)
summary(lm_pca2)
##
## Call:
## lm(formula = Crime ~ ., data = pc_scores_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -575.1 -222.1 4.4 159.6 905.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 905.09 49.52 18.279 < 2e-16 ***
## PC1 65.22 20.40 3.197 0.00257 **
## PC2 -70.08 29.90 -2.344 0.02367 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 339.5 on 44 degrees of freedom
## Multiple R-squared: 0.2631, Adjusted R-squared: 0.2296
## F-statistic: 7.856 on 2 and 44 DF, p-value: 0.001209
beta_pc2 <- coef(lm_pca2)[-1]
rotation2 <- pca$rotation[, 1:2]
beta_scaled2 <- rotation2 %*% beta_pc2
beta_orig2 <- beta_scaled2 / pca$scale
intercept2 <- coef(lm_pca2)[1] - sum(beta_orig2 * pca$center)
cat("Intercept:", intercept2, "\n")
## Intercept: 1484.748
print(beta_orig2)
## [,1]
## M -1.926244e+01
## So -2.187904e+01
## Ed 6.353881e+00
## Po1 1.313555e+01
## Po2 1.386953e+01
## LF -2.696512e+02
## M.F -6.803086e+00
## Pop 1.053809e+00
## NW -3.079661e-01
## U1 1.151190e+02
## U2 2.461094e+01
## Wealth 3.126959e-02
## Ineq -5.496028e+00
## Prob -1.230546e+03
## Time 3.569499e+00
new_city <- data.frame(M=14.0, So=0, Ed=10.0, Po1=12.0, Po2=15.5,
LF=0.640, M.F=94.0, Pop=150, NW=1.1, U1=0.120, U2=3.6,
Wealth=3200, Ineq=20.1, Prob=0.04, Time=39.0)
new_scaled <- as.matrix(sweep(sweep(new_city, 2, pca$center, "-"), 2, pca$scale, "/"))
new_pc2 <- as.data.frame(new_scaled %*% pca$rotation[, 1:2])
colnames(new_pc2) <- c("PC1", "PC2")
predict(lm_pca2, newdata=new_pc2)
## 1
## 1178.877
The PCA model predicted crime rate of 1,179 for the new city, compared to 155 from the full OLS model. The PCA model is more realistic as it avoids overfitting.
library(rpart)
## Warning: package 'rpart' was built under R version 4.5.3
tree_model <- rpart(Crime ~ ., data=uscrime, method="anova")
printcp(tree_model)
##
## Regression tree:
## rpart(formula = Crime ~ ., data = uscrime, method = "anova")
##
## Variables actually used in tree construction:
## [1] NW Po1 Pop
##
## Root node error: 6880928/47 = 146403
##
## n= 47
##
## CP nsplit rel error xerror xstd
## 1 0.362963 0 1.00000 1.05318 0.26538
## 2 0.148143 1 0.63704 0.99112 0.23354
## 3 0.051732 2 0.48889 1.08641 0.25840
## 4 0.010000 3 0.43716 1.07598 0.25934
plot(tree_model)
text(tree_model, cex=0.8)
Takeaway 1: Police spending (Po1) is the strongest predictor. Cities with Po1 >= 7.65 have much higher crime — likely reverse causation.
Takeaway 2: Within high-spending cities, NW >= 7.65 predicts dramatically higher crime, reflecting historical inequality patterns.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
set.seed(42)
rf_model <- randomForest(Crime ~ ., data=uscrime, ntree=500, importance=TRUE)
print(rf_model)
##
## Call:
## randomForest(formula = Crime ~ ., data = uscrime, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 87379.48
## % Var explained: 40.32
importance(rf_model)
## %IncMSE IncNodePurity
## M 3.00690191 225937.17
## So -0.01356852 22096.35
## Ed 1.80268010 204621.10
## Po1 10.83366782 1164661.94
## Po2 10.47727967 1116121.11
## LF 2.15720728 258079.02
## M.F 1.46736557 315056.16
## Pop -0.83229685 323553.45
## NW 9.10386424 530706.09
## U1 -1.22865150 135559.27
## U2 1.15212612 196488.67
## Wealth 4.60644846 726978.33
## Ineq 1.34905118 214692.60
## Prob 8.48888959 882981.35
## Time 0.93682829 186298.58
varImpPlot(rf_model)
I used to work for the Los Angeles Department of Public Social Services identifying fraudulent benefit cases. Logistic regression would predict whether a transaction is fraudulent (yes/no). Predictors: transaction location, amount, state, average amount, transaction time.
germancredit <- read.table("C:/Users/moham/Downloads/Georgia Tech University/WK4/week 4 hw-summer 2026/week 4 data-summer/data 10.3/germancredit.txt", header=FALSE)
gc <- germancredit
gc$response <- ifelse(gc$V21 == 1, 1, 0)
logit_model <- glm(response ~ . - V21, data=gc, family=binomial(link="logit"))
summary(logit_model)
##
## Call:
## glm(formula = response ~ . - V21, family = binomial(link = "logit"),
## data = gc)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.005e-01 1.084e+00 -0.369 0.711869
## V1A12 3.749e-01 2.179e-01 1.720 0.085400 .
## V1A13 9.657e-01 3.692e-01 2.616 0.008905 **
## V1A14 1.712e+00 2.322e-01 7.373 1.66e-13 ***
## V2 -2.786e-02 9.296e-03 -2.997 0.002724 **
## V3A31 -1.434e-01 5.489e-01 -0.261 0.793921
## V3A32 5.861e-01 4.305e-01 1.362 0.173348
## V3A33 8.532e-01 4.717e-01 1.809 0.070470 .
## V3A34 1.436e+00 4.399e-01 3.264 0.001099 **
## V4A41 1.666e+00 3.743e-01 4.452 8.51e-06 ***
## V4A410 1.489e+00 7.764e-01 1.918 0.055163 .
## V4A42 7.916e-01 2.610e-01 3.033 0.002421 **
## V4A43 8.916e-01 2.471e-01 3.609 0.000308 ***
## V4A44 5.228e-01 7.623e-01 0.686 0.492831
## V4A45 2.164e-01 5.500e-01 0.393 0.694000
## V4A46 -3.628e-02 3.965e-01 -0.092 0.927082
## V4A48 2.059e+00 1.212e+00 1.699 0.089297 .
## V4A49 7.401e-01 3.339e-01 2.216 0.026668 *
## V5 -1.283e-04 4.444e-05 -2.887 0.003894 **
## V6A62 3.577e-01 2.861e-01 1.250 0.211130
## V6A63 3.761e-01 4.011e-01 0.938 0.348476
## V6A64 1.339e+00 5.249e-01 2.551 0.010729 *
## V6A65 9.467e-01 2.625e-01 3.607 0.000310 ***
## V7A72 6.691e-02 4.270e-01 0.157 0.875475
## V7A73 1.828e-01 4.105e-01 0.445 0.656049
## V7A74 8.310e-01 4.455e-01 1.866 0.062110 .
## V7A75 2.766e-01 4.134e-01 0.669 0.503410
## V8 -3.301e-01 8.828e-02 -3.739 0.000185 ***
## V9A92 2.755e-01 3.865e-01 0.713 0.476040
## V9A93 8.161e-01 3.799e-01 2.148 0.031718 *
## V9A94 3.671e-01 4.537e-01 0.809 0.418448
## V10A102 -4.360e-01 4.101e-01 -1.063 0.287700
## V10A103 9.786e-01 4.243e-01 2.307 0.021072 *
## V11 -4.776e-03 8.641e-02 -0.055 0.955920
## V12A122 -2.814e-01 2.534e-01 -1.111 0.266630
## V12A123 -1.945e-01 2.360e-01 -0.824 0.409743
## V12A124 -7.304e-01 4.245e-01 -1.721 0.085308 .
## V13 1.454e-02 9.222e-03 1.576 0.114982
## V14A142 1.232e-01 4.119e-01 0.299 0.764878
## V14A143 6.463e-01 2.391e-01 2.703 0.006871 **
## V15A152 4.436e-01 2.347e-01 1.890 0.058715 .
## V15A153 6.839e-01 4.770e-01 1.434 0.151657
## V16 -2.721e-01 1.895e-01 -1.436 0.151109
## V17A172 -5.361e-01 6.796e-01 -0.789 0.430160
## V17A173 -5.547e-01 6.549e-01 -0.847 0.397015
## V17A174 -4.795e-01 6.623e-01 -0.724 0.469086
## V18 -2.647e-01 2.492e-01 -1.062 0.288249
## V19A192 3.000e-01 2.013e-01 1.491 0.136060
## V20A202 1.392e+00 6.258e-01 2.225 0.026095 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1221.73 on 999 degrees of freedom
## Residual deviance: 895.82 on 951 degrees of freedom
## AIC: 993.82
##
## Number of Fisher Scoring iterations: 5
pred_probs <- predict(logit_model, type="response")
thresholds <- seq(0.01, 0.99, by=0.01)
costs <- sapply(thresholds, function(t) {
pc <- ifelse(pred_probs >= t, 1, 0)
FP <- sum(pc == 1 & gc$response == 0)
FN <- sum(pc == 0 & gc$response == 1)
5*FP + 1*FN
})
best_threshold <- thresholds[which.min(costs)]
cat("Optimal threshold:", best_threshold, "\n")
## Optimal threshold: 0.87
pred_opt <- ifelse(pred_probs >= best_threshold, 1, 0)
table(Predicted=pred_opt, Actual=gc$response)
## Actual
## Predicted 0 1
## 0 280 357
## 1 20 343
The optimal threshold is 0.87, reducing total cost from 774 to 457 — a 41% improvement over the default 0.5 threshold.