crime <- read.delim('/Users/helenalindsay/Documents/Fall_23/ISYE6501/hw5/uscrime.txt')

Q10.1

Regression tree model

#install.packages("rpart")
library(rpart)
tree_model <- rpart(Crime ~ ., data = crime)
print(tree_model)
## n= 47 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 47 6880928.0  905.0851  
##   2) Po1< 7.65 23  779243.5  669.6087  
##     4) Pop< 22.5 12  243811.0  550.5000 *
##     5) Pop>=22.5 11  179470.7  799.5455 *
##   3) Po1>=7.65 24 3604162.0 1130.7500  
##     6) NW< 7.65 10  557574.9  886.9000 *
##     7) NW>=7.65 14 2027225.0 1304.9290 *
Visualize the model
plot(tree_model,uniform=TRUE, margin=0.1, compress=TRUE, cex=0.8, width=20, height=10)
text(tree_model)

Access model performance
# Assess model performance on training data
actual <- crime$Crime
predicted <- predict(tree_model, newdata = crime)
mse <- mean((actual - predicted)^2)
mae <- mean(abs(actual - predicted))
rsquared <- 1 - (sum((actual - predicted)^2) / sum((actual - mean(actual))^2))

print(paste("MSE:", mse))
## [1] "MSE: 64001.7352307267"
print(paste("MAE:", mae))
## [1] "MAE: 186.982315556784"
print(paste("R-squared:", rsquared))
## [1] "R-squared: 0.562837788062114"

Random forest model

#install.packages("randomForest")
library(randomForest)

set.seed(131)
rf_model <- randomForest(Crime ~ ., data = crime) # ntree=300


print(rf_model)
## 
## Call:
##  randomForest(formula = Crime ~ ., data = crime) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 82510.74
##                     % Var explained: 43.64
Visualize the model
plot(rf_model)

Access model performance
# Assess model performance on training data
actual <- crime$Crime
predicted <- predict(rf_model, newdata = crime)
mse <- mean((actual - predicted)^2)
mae <- mean(abs(actual - predicted))
rsquared <- 1 - (sum((actual - predicted)^2) / sum((actual - mean(actual))^2))

print(paste("MSE:", mse))
## [1] "MSE: 14936.2581706394"
print(paste("MAE:", mae))
## [1] "MAE: 85.2924248226951"
print(paste("R-squared:", rsquared))
## [1] "R-squared: 0.897978271426346"

Q10.3

credit <- read.table("/Users/helenalindsay/Documents/Fall_23/ISYE6501/hw5/germancredit.txt")
credit$V21 <- ifelse(credit$V21 == 1, 1, 0) # good = 1, bad = 0
credit$V21 <- as.factor(credit$V21)
credit_model <- glm(V21 ~ ., data=credit, family=binomial(link='logit'))
summary(credit_model)
## 
## Call:
## glm(formula = V21 ~ ., family = binomial(link = "logit"), data = credit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6116  -0.7095   0.3752   0.6994   2.3410  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.005e-01  1.084e+00  -0.369 0.711869    
## V1A12        3.749e-01  2.179e-01   1.720 0.085400 .  
## V1A13        9.657e-01  3.692e-01   2.616 0.008905 ** 
## V1A14        1.712e+00  2.322e-01   7.373 1.66e-13 ***
## V2          -2.786e-02  9.296e-03  -2.997 0.002724 ** 
## V3A31       -1.434e-01  5.489e-01  -0.261 0.793921    
## V3A32        5.861e-01  4.305e-01   1.362 0.173348    
## V3A33        8.532e-01  4.717e-01   1.809 0.070470 .  
## V3A34        1.436e+00  4.399e-01   3.264 0.001099 ** 
## V4A41        1.666e+00  3.743e-01   4.452 8.51e-06 ***
## V4A410       1.489e+00  7.764e-01   1.918 0.055163 .  
## V4A42        7.916e-01  2.610e-01   3.033 0.002421 ** 
## V4A43        8.916e-01  2.471e-01   3.609 0.000308 ***
## V4A44        5.228e-01  7.623e-01   0.686 0.492831    
## V4A45        2.164e-01  5.500e-01   0.393 0.694000    
## V4A46       -3.628e-02  3.965e-01  -0.092 0.927082    
## V4A48        2.059e+00  1.212e+00   1.699 0.089297 .  
## V4A49        7.401e-01  3.339e-01   2.216 0.026668 *  
## V5          -1.283e-04  4.444e-05  -2.887 0.003894 ** 
## V6A62        3.577e-01  2.861e-01   1.250 0.211130    
## V6A63        3.761e-01  4.011e-01   0.938 0.348476    
## V6A64        1.339e+00  5.249e-01   2.551 0.010729 *  
## V6A65        9.467e-01  2.625e-01   3.607 0.000310 ***
## V7A72        6.691e-02  4.270e-01   0.157 0.875475    
## V7A73        1.828e-01  4.105e-01   0.445 0.656049    
## V7A74        8.310e-01  4.455e-01   1.866 0.062110 .  
## V7A75        2.766e-01  4.134e-01   0.669 0.503410    
## V8          -3.301e-01  8.828e-02  -3.739 0.000185 ***
## V9A92        2.755e-01  3.865e-01   0.713 0.476040    
## V9A93        8.161e-01  3.799e-01   2.148 0.031718 *  
## V9A94        3.671e-01  4.537e-01   0.809 0.418448    
## V10A102     -4.360e-01  4.101e-01  -1.063 0.287700    
## V10A103      9.786e-01  4.243e-01   2.307 0.021072 *  
## V11         -4.776e-03  8.641e-02  -0.055 0.955920    
## V12A122     -2.814e-01  2.534e-01  -1.111 0.266630    
## V12A123     -1.945e-01  2.360e-01  -0.824 0.409743    
## V12A124     -7.304e-01  4.245e-01  -1.721 0.085308 .  
## V13          1.454e-02  9.222e-03   1.576 0.114982    
## V14A142      1.232e-01  4.119e-01   0.299 0.764878    
## V14A143      6.463e-01  2.391e-01   2.703 0.006871 ** 
## V15A152      4.436e-01  2.347e-01   1.890 0.058715 .  
## V15A153      6.839e-01  4.770e-01   1.434 0.151657    
## V16         -2.721e-01  1.895e-01  -1.436 0.151109    
## V17A172     -5.361e-01  6.796e-01  -0.789 0.430160    
## V17A173     -5.547e-01  6.549e-01  -0.847 0.397015    
## V17A174     -4.795e-01  6.623e-01  -0.724 0.469086    
## V18         -2.647e-01  2.492e-01  -1.062 0.288249    
## V19A192      3.000e-01  2.013e-01   1.491 0.136060    
## V20A202      1.392e+00  6.258e-01   2.225 0.026095 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1221.73  on 999  degrees of freedom
## Residual deviance:  895.82  on 951  degrees of freedom
## AIC: 993.82
## 
## Number of Fisher Scoring iterations: 5

Access model performance

# Assess model performance on training data
fitted.results <- predict(credit_model,newdata=credit,type='response')
fitted.results <- ifelse(fitted.results > 0.7,1,0)
credit$predicted <- fitted.results
misClasificError <- mean(fitted.results != credit$V21)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.756"

ROC curve

#install.packages('ROCR')
library(ROCR)
pr <- prediction(fitted.results, credit$V21)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7609524