library(tidyverse)
library(plyr)
library(PerformanceAnalytics)
library(MASS)
library(forecast)
library(kableExtra)

Data Exploration

# Load the training data once. `crimeboxcox` starts as a copy of `crime`;
# selected columns are overwritten with Box-Cox transformed values later on.
# R's copy-on-modify semantics keep `crime` unchanged when that happens, so
# there is no need to read the CSV a second time.
crime <- read_csv("crime-training-data_modified.csv")
crimeboxcox <- crime

# Quick structural overview: column names, types and the first few values.
glimpse(crime)
## Observations: 466
## Variables: 13
## $ zn      <dbl> 0, 0, 0, 30, 0, 0, 0, 0, 0, 80, 22, 0, 0, 22, 0, 0, 10...
## $ indus   <dbl> 19.58, 19.58, 18.10, 4.93, 2.46, 8.56, 18.10, 18.10, 5...
## $ chas    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ nox     <dbl> 0.605, 0.871, 0.740, 0.428, 0.488, 0.520, 0.693, 0.693...
## $ rm      <dbl> 7.929, 5.403, 6.485, 6.393, 7.155, 6.781, 5.453, 4.519...
## $ age     <dbl> 96.2, 100.0, 100.0, 7.8, 92.2, 71.3, 100.0, 100.0, 38....
## $ dis     <dbl> 2.0459, 1.3216, 1.9784, 7.0355, 2.7006, 2.8561, 1.4896...
## $ rad     <dbl> 5, 5, 24, 6, 3, 5, 24, 24, 5, 1, 7, 5, 24, 7, 3, 3, 5,...
## $ tax     <dbl> 403, 403, 666, 300, 193, 384, 666, 666, 224, 315, 330,...
## $ ptratio <dbl> 14.7, 14.7, 20.2, 16.6, 17.8, 20.9, 20.2, 20.2, 20.2, ...
## $ lstat   <dbl> 3.70, 26.82, 18.85, 5.19, 4.82, 7.67, 30.59, 36.98, 5....
## $ medv    <dbl> 50.0, 13.4, 15.4, 23.7, 37.9, 26.5, 5.0, 7.0, 22.2, 20...
## $ target  <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...
# Per-column summary: missing counts, mean/sd, quantiles and inline histograms.
skimr::skim(crime)
Data summary
Name crime
Number of rows 466
Number of columns 13
_______________________
Column type frequency:
numeric 13
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
zn 0 1 11.58 23.36 0.00 0.00 0.00 16.25 100.00 ▇▁▁▁▁
indus 0 1 11.11 6.85 0.46 5.15 9.69 18.10 27.74 ▇▆▁▇▁
chas 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
nox 0 1 0.55 0.12 0.39 0.45 0.54 0.62 0.87 ▇▇▅▃▁
rm 0 1 6.29 0.70 3.86 5.89 6.21 6.63 8.78 ▁▂▇▂▁
age 0 1 68.37 28.32 2.90 43.88 77.15 94.10 100.00 ▂▂▂▃▇
dis 0 1 3.80 2.11 1.13 2.10 3.19 5.21 12.13 ▇▅▂▁▁
rad 0 1 9.53 8.69 1.00 4.00 5.00 24.00 24.00 ▇▂▁▁▃
tax 0 1 409.50 167.90 187.00 281.00 334.50 666.00 711.00 ▇▇▅▁▇
ptratio 0 1 18.40 2.20 12.60 16.90 18.90 20.20 22.00 ▁▃▅▅▇
lstat 0 1 12.63 7.10 1.73 7.04 11.35 16.93 37.97 ▇▇▅▂▁
medv 0 1 22.59 9.24 5.00 17.02 21.20 25.00 50.00 ▂▇▅▁▁
target 0 1 0.49 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
# Pairwise correlation chart for every column (PerformanceAnalytics), with
# per-variable histograms enabled.
chart.Correlation(crime, histogram = TRUE)

We can see a lot in this graph. First, we notice which variables are more or less correlated with our target variable. Second, we can see the distribution of the variables to start to think about which might be candidates for transformation. Since no data is missing, we don’t need to impute any values, but we can reduce the influence of outliers and make skewed data more normally distributed.

Data Preparation

# Estimate a Box-Cox lambda for every column. vapply() guarantees one numeric
# value per column (sapply() would silently change its return type on empty or
# ragged results) and keeps the column names on the result.
# Note: forecast::BoxCox.lambda() searches lambda in [-1, 2] by default, so
# estimates of about -1 or 2 sit on the search boundary.
crimelambda <- vapply(crime, BoxCox.lambda, numeric(1))
crimelambda
##          zn       indus        chas         nox          rm         age 
##  0.07538486 -0.08779326  0.47220206 -0.99992425  0.28832202  1.99992425 
##         dis         rad         tax     ptratio       lstat        medv 
## -0.61031032 -0.33539473 -0.99992425  1.99992425 -0.17920211 -0.09049268 
##      target 
##  0.46812429

The best candidates to transform are chas, nox, age, tax, and ptratio because their estimated lambdas are within 0.03 of a multiple of 0.5. Now let’s round the lambda values to make the transformations simpler.

# Round each estimated lambda to the nearest multiple of 0.5 (plyr::round_any).
newlambda <- round_any(crimelambda, 0.5)

These are the new lambdas we will use. Let’s transform the data.

# Apply the one-parameter Box-Cox transform to the selected columns, taking
# each lambda from the rounded estimates in `newlambda` instead of re-typing
# it by hand:
#   lambda != 0:  (x^lambda - 1) / lambda
#   lambda == 0:  log(x)
# The rounded lambdas are exact multiples of 0.5, so comparing with 0 is safe.
boxcox_cols <- c("chas", "nox", "age", "tax", "ptratio")
for (col in boxcox_cols) {
  lambda <- newlambda[[col]]
  crimeboxcox[[col]] <- if (lambda == 0) {
    log(crime[[col]])
  } else {
    (crime[[col]]^lambda - 1) / lambda
  }
}

Next, we’re going to use principal component analysis to reduce the dimensions of the transformed variables, creating linearly uncorrelated variables called principal components.

# Predictor columns are everything except the response; selecting by name
# avoids depending on `target` being the 13th column.
predictors <- setdiff(names(crimeboxcox), "target")

# PCA on the Box-Cox transformed predictors, without and with scaling to unit
# variance. prcomp() centers by default in both cases. The scaling argument
# is spelled `scale.` (with the trailing dot) rather than relying on partial
# argument matching of `scale`.
prccrime <- prcomp(crimeboxcox[, predictors])
prccrimescale <- prcomp(crimeboxcox[, predictors], scale. = TRUE)

# Data frames of component scores with the response appended as `target`,
# ready for model fitting: `pca` is unscaled, `pca1` is scaled.
pca <- data.frame(prccrime$x, target = crime$target)
pca1 <- data.frame(prccrimescale$x, target = crime$target)

# Regress the response on all unscaled principal components to see which
# components carry signal.
summary(lm(target ~ ., data = pca))
## 
## Call:
## lm(formula = target ~ ., data = pca)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65346 -0.18837 -0.02771  0.12643  0.99695 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.914e-01  1.404e-02  35.012  < 2e-16 ***
## PC1         -1.885e-04  8.098e-06 -23.281  < 2e-16 ***
## PC2         -1.426e-03  3.688e-04  -3.866 0.000127 ***
## PC3         -1.306e-03  7.624e-04  -1.712 0.087521 .  
## PC4          5.729e-03  1.616e-03   3.544 0.000434 ***
## PC5         -2.553e-02  2.017e-03 -12.660  < 2e-16 ***
## PC6          1.939e-03  3.427e-03   0.566 0.571884    
## PC7         -2.223e-03  4.020e-03  -0.553 0.580629    
## PC8          1.776e-02  1.301e-02   1.365 0.172921    
## PC9          2.318e-02  2.829e-02   0.820 0.412919    
## PC10         2.336e-02  3.067e-02   0.762 0.446677    
## PC11         8.143e-01  9.671e-02   8.420    5e-16 ***
## PC12         7.212e+01  2.504e+01   2.880 0.004162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.303 on 453 degrees of freedom
## Multiple R-squared:  0.6429, Adjusted R-squared:  0.6335 
## F-statistic: 67.97 on 12 and 453 DF,  p-value: < 2.2e-16
# PC1 and PC5 have the largest absolute t values in the regression above.
# Plot them against each other, coloured by class, to see how well they
# separate the two groups.
# The second column holds component 5, so it is named PC5 (it was previously
# labelled PC2, which mislabelled the plot axis). The 0/1 response is turned
# into a factor so ggplot draws a discrete legend, not a colour gradient.
prccrime_num <- data.frame(
  PC1 = prccrime$x[, "PC1"],
  PC5 = prccrime$x[, "PC5"],
  classification = factor(crime$target)
)

ggplot(prccrime_num, aes(x = PC1, y = PC5, colour = classification)) +
  geom_point()

Looking for explanation of variance.

# Variance of each unscaled principal component (squared standard deviation).
pr_var <- (prccrime$sdev)^2
# Each component's share of the total variance.
pr_varex <- pr_var / sum(pr_var)
# Cumulative share of variance explained as components are added.
plot(cumsum(pr_varex), type = "b")

It appears that the first principal component alone explains 99.94% of the variance. We’ll eliminate columns in the next stage.

Build Models

Let’s see how a simple linear model will work with our four variable sets.

# Fit one full linear model per variable set:
#   crimemodel      - original predictors
#   boxcoxmodel     - Box-Cox transformed predictors
#   pcamodel        - unscaled principal components
#   pcascaledmodel  - scaled principal components
# NOTE(review): `target` is a 0/1 indicator, so these are linear probability
# models; consider glm(family = binomial) if a logistic fit is intended.
crimemodel     <- lm(target ~ ., data = crime)
boxcoxmodel    <- lm(target ~ ., data = crimeboxcox)
pcamodel       <- lm(target ~ ., data = pca)
pcascaledmodel <- lm(target ~ ., data = pca1)

# Coefficient table and fit statistics for the model on the original data.
summary(crimemodel)
## 
## Call:
## lm(formula = target ~ ., data = crime)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59701 -0.21505 -0.04691  0.14908  0.88702 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.6013725  0.3594901  -4.455 1.06e-05 ***
## zn          -0.0009668  0.0009442  -1.024 0.306432    
## indus        0.0031277  0.0042909   0.729 0.466433    
## chas         0.0059892  0.0588402   0.102 0.918970    
## nox          1.9722476  0.2632648   7.491 3.60e-13 ***
## rm           0.0249823  0.0315042   0.793 0.428202    
## age          0.0031738  0.0009045   3.509 0.000495 ***
## dis          0.0125382  0.0141433   0.887 0.375814    
## rad          0.0207000  0.0043384   4.771 2.47e-06 ***
## tax         -0.0002787  0.0002617  -1.065 0.287396    
## ptratio      0.0115287  0.0093460   1.234 0.218013    
## lstat        0.0045124  0.0038923   1.159 0.246935    
## medv         0.0089246  0.0029992   2.976 0.003080 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.312 on 453 degrees of freedom
## Multiple R-squared:  0.6213, Adjusted R-squared:  0.6112 
## F-statistic: 61.92 on 12 and 453 DF,  p-value: < 2.2e-16
# Coefficient table and fit statistics for the model on the Box-Cox data.
summary(boxcoxmodel)
## 
## Call:
## lm(formula = target ~ ., data = crimeboxcox)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65346 -0.18837 -0.02771  0.12643  0.99695 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.152e+01  2.496e+01  -2.866 0.004356 ** 
## zn          -6.125e-04  8.986e-04  -0.682 0.495801    
## indus       -3.648e-03  3.856e-03  -0.946 0.344578    
## chas         2.180e-02  2.840e-02   0.767 0.443222    
## nox          7.977e-01  9.668e-02   8.251 1.72e-15 ***
## rm           2.142e-02  3.059e-02   0.700 0.484285    
## age          4.490e-05  1.539e-05   2.918 0.003697 ** 
## dis          3.039e-02  1.429e-02   2.126 0.034062 *  
## rad          1.080e-02  2.953e-03   3.657 0.000285 ***
## tax          7.212e+01  2.504e+01   2.880 0.004162 ** 
## ptratio      6.275e-04  4.948e-04   1.268 0.205345    
## lstat        4.888e-03  3.845e-03   1.271 0.204349    
## medv         1.068e-02  2.905e-03   3.678 0.000264 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.303 on 453 degrees of freedom
## Multiple R-squared:  0.6429, Adjusted R-squared:  0.6335 
## F-statistic: 67.97 on 12 and 453 DF,  p-value: < 2.2e-16
# Coefficient table and fit statistics for the unscaled-PCA model.
summary(pcamodel)
## 
## Call:
## lm(formula = target ~ ., data = pca)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65346 -0.18837 -0.02771  0.12643  0.99695 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.914e-01  1.404e-02  35.012  < 2e-16 ***
## PC1         -1.885e-04  8.098e-06 -23.281  < 2e-16 ***
## PC2         -1.426e-03  3.688e-04  -3.866 0.000127 ***
## PC3         -1.306e-03  7.624e-04  -1.712 0.087521 .  
## PC4          5.729e-03  1.616e-03   3.544 0.000434 ***
## PC5         -2.553e-02  2.017e-03 -12.660  < 2e-16 ***
## PC6          1.939e-03  3.427e-03   0.566 0.571884    
## PC7         -2.223e-03  4.020e-03  -0.553 0.580629    
## PC8          1.776e-02  1.301e-02   1.365 0.172921    
## PC9          2.318e-02  2.829e-02   0.820 0.412919    
## PC10         2.336e-02  3.067e-02   0.762 0.446677    
## PC11         8.143e-01  9.671e-02   8.420    5e-16 ***
## PC12         7.212e+01  2.504e+01   2.880 0.004162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.303 on 453 degrees of freedom
## Multiple R-squared:  0.6429, Adjusted R-squared:  0.6335 
## F-statistic: 67.97 on 12 and 453 DF,  p-value: < 2.2e-16
# Coefficient table and fit statistics for the scaled-PCA model.
summary(pcascaledmodel)
## 
## Call:
## lm(formula = target ~ ., data = pca1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65346 -0.18837 -0.02771  0.12643  0.99695 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.491416   0.014036  35.012  < 2e-16 ***
## PC1         -0.140708   0.005708 -24.651  < 2e-16 ***
## PC2         -0.114038   0.010985 -10.381  < 2e-16 ***
## PC3         -0.085691   0.013672  -6.268 8.54e-10 ***
## PC4          0.005231   0.014990   0.349  0.72726    
## PC5          0.038018   0.016285   2.334  0.02001 *  
## PC6         -0.028862   0.022369  -1.290  0.19761    
## PC7         -0.051765   0.025181  -2.056  0.04038 *  
## PC8         -0.036403   0.027194  -1.339  0.18137    
## PC9         -0.078168   0.030867  -2.532  0.01166 *  
## PC10         0.032285   0.032921   0.981  0.32728    
## PC11        -0.110317   0.035440  -3.113  0.00197 ** 
## PC12         0.239088   0.043220   5.532 5.36e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.303 on 453 degrees of freedom
## Multiple R-squared:  0.6429, Adjusted R-squared:  0.6335 
## F-statistic: 67.97 on 12 and 453 DF,  p-value: < 2.2e-16

Stepwise feature selection

# Backward stepwise selection by AIC (MASS::stepAIC), starting from the full
# model on the original predictors.
stepcrime <- stepAIC(crimemodel, direction="backward")
## Start:  AIC=-1072.6
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - chas     1    0.0010 44.110 -1074.6
## - indus    1    0.0517 44.161 -1074.0
## - rm       1    0.0612 44.171 -1074.0
## - dis      1    0.0765 44.186 -1073.8
## - zn       1    0.1021 44.211 -1073.5
## - tax      1    0.1105 44.220 -1073.4
## - lstat    1    0.1309 44.240 -1073.2
## - ptratio  1    0.1482 44.258 -1073.0
## <none>                 44.109 -1072.6
## - medv     1    0.8622 44.972 -1065.6
## - age      1    1.1988 45.308 -1062.1
## - rad      1    2.2167 46.326 -1051.8
## - nox      1    5.4647 49.574 -1020.2
## 
## Step:  AIC=-1074.59
## target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio + 
##     lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - indus    1    0.0540 44.164 -1076.0
## - rm       1    0.0609 44.171 -1076.0
## - dis      1    0.0771 44.188 -1075.8
## - zn       1    0.1022 44.213 -1075.5
## - tax      1    0.1143 44.225 -1075.4
## - lstat    1    0.1308 44.241 -1075.2
## - ptratio  1    0.1474 44.258 -1075.0
## <none>                 44.110 -1074.6
## - medv     1    0.8851 44.995 -1067.3
## - age      1    1.2031 45.313 -1064.0
## - rad      1    2.2364 46.347 -1053.5
## - nox      1    5.5026 49.613 -1021.8
## 
## Step:  AIC=-1076.02
## target ~ zn + nox + rm + age + dis + rad + tax + ptratio + lstat + 
##     medv
## 
##           Df Sum of Sq    RSS     AIC
## - rm       1    0.0508 44.215 -1077.5
## - dis      1    0.0571 44.221 -1077.4
## - tax      1    0.0681 44.232 -1077.3
## - zn       1    0.1246 44.289 -1076.7
## - lstat    1    0.1384 44.303 -1076.6
## - ptratio  1    0.1745 44.339 -1076.2
## <none>                 44.164 -1076.0
## - medv     1    0.8980 45.062 -1068.6
## - age      1    1.1987 45.363 -1065.5
## - rad      1    2.2339 46.398 -1055.0
## - nox      1    6.3555 50.520 -1015.4
## 
## Step:  AIC=-1077.49
## target ~ zn + nox + age + dis + rad + tax + ptratio + lstat + 
##     medv
## 
##           Df Sum of Sq    RSS     AIC
## - dis      1    0.0574 44.272 -1078.9
## - tax      1    0.0742 44.289 -1078.7
## - lstat    1    0.0990 44.314 -1078.4
## - zn       1    0.1094 44.325 -1078.3
## - ptratio  1    0.1651 44.380 -1077.8
## <none>                 44.215 -1077.5
## - medv     1    1.2223 45.437 -1066.8
## - age      1    1.3556 45.571 -1065.4
## - rad      1    2.3831 46.598 -1055.0
## - nox      1    6.3272 50.542 -1017.2
## 
## Step:  AIC=-1078.88
## target ~ zn + nox + age + rad + tax + ptratio + lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - zn       1    0.0631 44.336 -1080.2
## - lstat    1    0.0806 44.353 -1080.0
## - tax      1    0.0923 44.365 -1079.9
## - ptratio  1    0.1570 44.429 -1079.2
## <none>                 44.272 -1078.9
## - medv     1    1.1833 45.456 -1068.6
## - age      1    1.3078 45.580 -1067.3
## - rad      1    2.4310 46.704 -1056.0
## - nox      1    6.9646 51.237 -1012.8
## 
## Step:  AIC=-1080.22
## target ~ nox + age + rad + tax + ptratio + lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - lstat    1    0.0800 44.416 -1081.4
## - tax      1    0.1266 44.462 -1080.9
## <none>                 44.336 -1080.2
## - ptratio  1    0.2574 44.593 -1079.5
## - medv     1    1.1597 45.495 -1070.2
## - age      1    1.5927 45.928 -1065.8
## - rad      1    2.4722 46.808 -1056.9
## - nox      1    7.9023 52.238 -1005.8
## 
## Step:  AIC=-1081.38
## target ~ nox + age + rad + tax + ptratio + medv
## 
##           Df Sum of Sq    RSS     AIC
## - tax      1    0.1256 44.541 -1082.1
## <none>                 44.416 -1081.4
## - ptratio  1    0.2325 44.648 -1080.9
## - medv     1    1.3210 45.737 -1069.7
## - age      1    2.0695 46.485 -1062.2
## - rad      1    2.5773 46.993 -1057.1
## - nox      1    8.0243 52.440 -1006.0
## 
## Step:  AIC=-1082.06
## target ~ nox + age + rad + ptratio + medv
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 44.541 -1082.1
## - ptratio  1    0.2101 44.751 -1081.9
## - medv     1    1.5678 46.109 -1067.9
## - age      1    2.0560 46.597 -1063.0
## - rad      1    5.1739 49.715 -1032.8
## - nox      1    7.9634 52.505 -1007.4
# Backward stepwise selection by AIC, starting from the full Box-Cox model.
stepboxcox <- stepAIC(boxcoxmodel, direction="backward")
## Start:  AIC=-1100.05
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - zn       1    0.0427 41.629 -1101.6
## - rm       1    0.0450 41.631 -1101.5
## - chas     1    0.0541 41.640 -1101.4
## - indus    1    0.0822 41.669 -1101.1
## - ptratio  1    0.1477 41.734 -1100.4
## - lstat    1    0.1483 41.735 -1100.4
## <none>                 41.586 -1100.0
## - dis      1    0.4148 42.001 -1097.4
## - tax      1    0.7616 42.348 -1093.6
## - age      1    0.7817 42.368 -1093.4
## - rad      1    1.2278 42.814 -1088.5
## - medv     1    1.2416 42.828 -1088.3
## - nox      1    6.2502 47.837 -1036.8
## 
## Step:  AIC=-1101.57
## target ~ indus + chas + nox + rm + age + dis + rad + tax + ptratio + 
##     lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - rm       1    0.0379 41.667 -1103.2
## - chas     1    0.0564 41.685 -1102.9
## - indus    1    0.0791 41.708 -1102.7
## - lstat    1    0.1361 41.765 -1102.0
## <none>                 41.629 -1101.6
## - ptratio  1    0.2072 41.836 -1101.3
## - dis      1    0.3723 42.001 -1099.4
## - tax      1    0.7232 42.352 -1095.5
## - age      1    0.7881 42.417 -1094.8
## - rad      1    1.2000 42.829 -1090.3
## - medv     1    1.2050 42.834 -1090.3
## - nox      1    6.5638 48.193 -1035.3
## 
## Step:  AIC=-1103.15
## target ~ indus + chas + nox + age + dis + rad + tax + ptratio + 
##     lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - chas     1    0.0544 41.721 -1104.5
## - indus    1    0.0936 41.761 -1104.1
## - lstat    1    0.1042 41.771 -1104.0
## <none>                 41.667 -1103.2
## - ptratio  1    0.1970 41.864 -1103.0
## - dis      1    0.3765 42.043 -1101.0
## - tax      1    0.7220 42.389 -1097.1
## - age      1    0.8900 42.557 -1095.3
## - rad      1    1.3129 42.980 -1090.7
## - medv     1    1.5940 43.261 -1087.7
## - nox      1    6.5562 48.223 -1037.0
## 
## Step:  AIC=-1104.54
## target ~ indus + nox + age + dis + rad + tax + ptratio + lstat + 
##     medv
## 
##           Df Sum of Sq    RSS     AIC
## - indus    1    0.0808 41.802 -1105.6
## - lstat    1    0.1049 41.826 -1105.4
## <none>                 41.721 -1104.5
## - ptratio  1    0.1825 41.904 -1104.5
## - dis      1    0.3805 42.102 -1102.3
## - tax      1    0.6976 42.419 -1098.8
## - age      1    0.8991 42.620 -1096.6
## - rad      1    1.3233 43.045 -1092.0
## - medv     1    1.6933 43.415 -1088.0
## - nox      1    6.6221 48.344 -1037.9
## 
## Step:  AIC=-1105.64
## target ~ nox + age + dis + rad + tax + ptratio + lstat + medv
## 
##           Df Sum of Sq    RSS     AIC
## - lstat    1    0.0930 41.895 -1106.6
## - ptratio  1    0.1513 41.954 -1106.0
## <none>                 41.802 -1105.6
## - dis      1    0.4754 42.278 -1102.4
## - tax      1    0.6177 42.420 -1100.8
## - age      1    0.8904 42.693 -1097.8
## - rad      1    1.3799 43.182 -1092.5
## - medv     1    1.7464 43.549 -1088.6
## - nox      1    6.6881 48.490 -1038.5
## 
## Step:  AIC=-1106.6
## target ~ nox + age + dis + rad + tax + ptratio + medv
## 
##           Df Sum of Sq    RSS     AIC
## - ptratio  1    0.1246 42.020 -1107.2
## <none>                 41.895 -1106.6
## - dis      1    0.4337 42.329 -1103.8
## - tax      1    0.5901 42.485 -1102.1
## - age      1    1.1959 43.091 -1095.5
## - rad      1    1.5547 43.450 -1091.6
## - medv     1    2.1513 44.046 -1085.3
## - nox      1    6.6314 48.527 -1040.1
## 
## Step:  AIC=-1107.22
## target ~ nox + age + dis + rad + tax + medv
## 
##        Df Sum of Sq    RSS     AIC
## <none>              42.020 -1107.2
## - dis   1    0.3777 42.397 -1105.0
## - tax   1    0.5725 42.592 -1102.9
## - age   1    1.2425 43.262 -1095.6
## - rad   1    2.0578 44.078 -1086.9
## - medv  1    2.1277 44.147 -1086.2
## - nox   1    6.5482 48.568 -1041.7
# Backward stepwise selection by AIC, starting from the full unscaled-PCA model.
steppca <- stepAIC(pcamodel, direction="backward")
## Start:  AIC=-1100.05
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + 
##     PC10 + PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC7   1     0.028 41.614 -1101.73
## - PC6   1     0.029 41.616 -1101.72
## - PC10  1     0.053 41.640 -1101.45
## - PC9   1     0.062 41.648 -1101.36
## - PC8   1     0.171 41.757 -1100.14
## <none>              41.586 -1100.05
## - PC3   1     0.269 41.856 -1099.04
## - PC12  1     0.762 42.348 -1093.59
## - PC4   1     1.153 42.740 -1089.30
## - PC2   1     1.372 42.958 -1086.92
## - PC11  1     6.508 48.095 -1034.29
## - PC5   1    14.714 56.300  -960.88
## - PC1   1    49.757 91.344  -735.37
## 
## Step:  AIC=-1101.73
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC8 + PC9 + PC10 + 
##     PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC6   1     0.029 41.644 -1103.41
## - PC10  1     0.053 41.668 -1103.14
## - PC9   1     0.062 41.676 -1103.04
## - PC8   1     0.171 41.786 -1101.82
## <none>              41.614 -1101.73
## - PC3   1     0.269 41.884 -1100.73
## - PC12  1     0.762 42.376 -1095.28
## - PC4   1     1.153 42.768 -1090.99
## - PC2   1     1.372 42.987 -1088.62
## - PC11  1     6.508 48.123 -1036.02
## - PC5   1    14.714 56.329  -962.65
## - PC1   1    49.757 91.372  -737.23
## 
## Step:  AIC=-1103.41
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC8 + PC9 + PC10 + PC11 + 
##     PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC10  1     0.053 41.697 -1104.81
## - PC9   1     0.062 41.705 -1104.72
## - PC8   1     0.171 41.815 -1103.49
## <none>              41.644 -1103.41
## - PC3   1     0.269 41.913 -1102.40
## - PC12  1     0.762 42.405 -1096.96
## - PC4   1     1.153 42.797 -1092.67
## - PC2   1     1.372 43.016 -1090.30
## - PC11  1     6.508 48.152 -1037.74
## - PC5   1    14.714 56.358  -964.41
## - PC1   1    49.757 91.401  -739.08
## 
## Step:  AIC=-1104.81
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC8 + PC9 + PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC9   1     0.062 41.759 -1106.12
## - PC8   1     0.171 41.868 -1104.90
## <none>              41.697 -1104.81
## - PC3   1     0.269 41.966 -1103.81
## - PC12  1     0.762 42.459 -1098.38
## - PC4   1     1.153 42.850 -1094.10
## - PC2   1     1.372 43.069 -1091.72
## - PC11  1     6.508 48.205 -1039.22
## - PC5   1    14.714 56.411  -965.97
## - PC1   1    49.757 91.454  -740.81
## 
## Step:  AIC=-1106.12
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC8 + PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC8   1     0.171 41.930 -1106.22
## <none>              41.759 -1106.12
## - PC3   1     0.269 42.028 -1105.13
## - PC12  1     0.762 42.520 -1099.70
## - PC4   1     1.153 42.912 -1095.42
## - PC2   1     1.372 43.131 -1093.06
## - PC11  1     6.508 48.267 -1040.62
## - PC5   1    14.714 56.473  -967.46
## - PC1   1    49.757 91.516  -742.50
## 
## Step:  AIC=-1106.22
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## <none>              41.930 -1106.22
## - PC3   1     0.269 42.199 -1105.23
## - PC12  1     0.762 42.691 -1099.83
## - PC4   1     1.153 43.083 -1095.57
## - PC2   1     1.372 43.302 -1093.21
## - PC11  1     6.508 48.438 -1040.98
## - PC5   1    14.714 56.644  -968.05
## - PC1   1    49.757 91.687  -743.63
# Backward stepwise selection by AIC, starting from the full scaled-PCA model.
steppcascaled <- stepAIC(pcascaledmodel, direction="backward")
## Start:  AIC=-1100.05
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + 
##     PC10 + PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC4   1     0.011 41.598 -1101.92
## - PC10  1     0.088 41.675 -1101.06
## - PC6   1     0.153 41.739 -1100.34
## - PC8   1     0.164 41.751 -1100.21
## <none>              41.586 -1100.05
## - PC7   1     0.388 41.974 -1097.72
## - PC5   1     0.500 42.087 -1096.48
## - PC9   1     0.589 42.175 -1095.50
## - PC11  1     0.889 42.476 -1092.19
## - PC12  1     2.809 44.396 -1071.59
## - PC3   1     3.606 45.193 -1063.29
## - PC2   1     9.893 51.479 -1002.60
## - PC1   1    55.787 97.374  -705.58
## 
## Step:  AIC=-1101.92
## target ~ PC1 + PC2 + PC3 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
##     PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC10  1     0.088 41.686 -1102.94
## - PC6   1     0.153 41.750 -1102.21
## - PC8   1     0.164 41.762 -1102.08
## <none>              41.598 -1101.92
## - PC7   1     0.388 41.986 -1099.60
## - PC5   1     0.500 42.098 -1098.35
## - PC9   1     0.589 42.186 -1097.37
## - PC11  1     0.889 42.487 -1094.06
## - PC12  1     2.809 44.407 -1073.47
## - PC3   1     3.606 45.204 -1065.18
## - PC2   1     9.893 51.490 -1004.50
## - PC1   1    55.787 97.385  -707.53
## 
## Step:  AIC=-1102.94
## target ~ PC1 + PC2 + PC3 + PC5 + PC6 + PC7 + PC8 + PC9 + PC11 + 
##     PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC6   1     0.153 41.839 -1103.23
## - PC8   1     0.164 41.850 -1103.10
## <none>              41.686 -1102.94
## - PC7   1     0.388 42.074 -1100.62
## - PC5   1     0.500 42.186 -1099.38
## - PC9   1     0.589 42.275 -1098.40
## - PC11  1     0.889 42.575 -1095.10
## - PC12  1     2.809 44.495 -1074.54
## - PC3   1     3.606 45.292 -1066.27
## - PC2   1     9.893 51.579 -1005.70
## - PC1   1    55.787 97.473  -709.11
## 
## Step:  AIC=-1103.23
## target ~ PC1 + PC2 + PC3 + PC5 + PC7 + PC8 + PC9 + PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## - PC8   1     0.164 42.003 -1103.40
## <none>              41.839 -1103.23
## - PC7   1     0.388 42.227 -1100.93
## - PC5   1     0.500 42.339 -1099.69
## - PC9   1     0.589 42.427 -1098.72
## - PC11  1     0.889 42.728 -1095.43
## - PC12  1     2.809 44.648 -1074.94
## - PC3   1     3.606 45.445 -1066.70
## - PC2   1     9.893 51.732 -1006.32
## - PC1   1    55.787 97.626  -710.38
## 
## Step:  AIC=-1103.4
## target ~ PC1 + PC2 + PC3 + PC5 + PC7 + PC9 + PC11 + PC12
## 
##        Df Sum of Sq    RSS      AIC
## <none>              42.003 -1103.40
## - PC7   1     0.388 42.391 -1101.12
## - PC5   1     0.500 42.503 -1099.88
## - PC9   1     0.589 42.592 -1098.91
## - PC11  1     0.889 42.893 -1095.64
## - PC12  1     2.809 44.813 -1075.23
## - PC3   1     3.606 45.610 -1067.01
## - PC2   1     9.893 51.896 -1006.84
## - PC1   1    55.787 97.790  -711.59
# Selection path for the original-data model: term dropped at each step and
# the resulting AIC.
stepcrime$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv
## 
## Final Model:
## target ~ nox + age + rad + ptratio + medv
## 
## 
##      Step Df    Deviance Resid. Df Resid. Dev       AIC
## 1                              453   44.10938 -1072.601
## 2  - chas  1 0.001008849       454   44.11038 -1074.591
## 3 - indus  1 0.053960288       455   44.16434 -1076.021
## 4    - rm  1 0.050758594       456   44.21510 -1077.486
## 5   - dis  1 0.057377715       457   44.27248 -1078.881
## 6    - zn  1 0.063058926       458   44.33554 -1080.218
## 7 - lstat  1 0.080044628       459   44.41558 -1081.377
## 8   - tax  1 0.125630983       460   44.54122 -1082.061
# Selection path for the Box-Cox model.
stepboxcox$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv
## 
## Final Model:
## target ~ nox + age + dis + rad + tax + medv
## 
## 
##        Step Df   Deviance Resid. Df Resid. Dev       AIC
## 1                               453   41.58639 -1100.048
## 2      - zn  1 0.04265713       454   41.62904 -1101.571
## 3      - rm  1 0.03793969       455   41.66698 -1103.146
## 4    - chas  1 0.05443762       456   41.72142 -1104.538
## 5   - indus  1 0.08076011       457   41.80218 -1105.637
## 6   - lstat  1 0.09295273       458   41.89513 -1106.601
## 7 - ptratio  1 0.12457379       459   42.01971 -1107.218
# Selection path for the unscaled-PCA model.
steppca$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + 
##     PC10 + PC11 + PC12
## 
## Final Model:
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC11 + PC12
## 
## 
##     Step Df   Deviance Resid. Df Resid. Dev       AIC
## 1                            453   41.58639 -1100.048
## 2  - PC7  1 0.02806014       454   41.61445 -1101.734
## 3  - PC6  1 0.02937726       455   41.64382 -1103.405
## 4 - PC10  1 0.05325268       456   41.69708 -1104.810
## 5  - PC9  1 0.06165595       457   41.75873 -1106.121
## 6  - PC8  1 0.17105607       458   41.92979 -1106.216
# Selection path for the scaled-PCA model.
steppcascaled$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + 
##     PC10 + PC11 + PC12
## 
## Final Model:
## target ~ PC1 + PC2 + PC3 + PC5 + PC7 + PC9 + PC11 + PC12
## 
## 
##     Step Df   Deviance Resid. Df Resid. Dev       AIC
## 1                            453   41.58639 -1100.048
## 2  - PC4  1 0.01118069       454   41.59757 -1101.923
## 3 - PC10  1 0.08828778       455   41.68585 -1102.935
## 4  - PC6  1 0.15283252       456   41.83869 -1103.230
## 5  - PC8  1 0.16449922       457   42.00319 -1103.401
# Refit each variable set using only the terms kept by backward AIC selection,
# then pass the fit to glm() so an $aic component is available for comparison.
# NOTE(review): glm() receives the fitted lm object rather than a formula/data
# pair; presumably this refits the same model with glm's default gaussian
# family -- confirm.

# Original predictors.
aiccrime <- lm(target ~ nox + age + rad + ptratio + medv, crime)
#summary(aiccrime)
fitcrime <- glm(aiccrime)

# Box-Cox transformed predictors.
aicboxcox <- lm(target ~ nox + age + dis + rad + tax + medv, crimeboxcox)
#summary(aicboxcox)
fitbc <- glm(aicboxcox)

# Unscaled principal components.
aicpca <- lm(target ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC11 + PC12, pca)
#summary(aicpca)
fitpca <- glm(aicpca)

# Scaled principal components. This formula was selected on the scaled PCA
# scores, so it must be fitted on `pca1`; fitting it on `pca` would silently
# use the unscaled components instead.
aicpcascaled <- lm(
  target ~ PC1 + PC2 + PC3 + PC5 + PC7 + PC9 + PC11 + PC12,
  pca1
)
#summary(aicpcascaled)
fitpcas <- glm(aicpcascaled)

Crime with PCA after AIC has the highest R^2 value at 0.64. Crime with BoxCox after AIC has the lowest AIC value at 217.2. These are the two models we’ll be comparing.

Select Models

Model Metrics

# Score the training rows with the BoxCox model and label them at a 0.50 cut-off.
# NOTE(review): fitbc is built with glm() and no `family`, so predProb is a
# fitted value from a gaussian fit and is not guaranteed to lie in [0, 1].
predProb <- predict(
  fitbc,
  newdata = crimeboxcox[, c("nox", "age", "dis", "rad", "tax", "medv")],
  type = "response"
)

# Predicted class: 1 where the fitted value reaches the cut-off, 0 otherwise
# (a missing fitted value is labelled 0).
predResp <- as.numeric(!is.na(predProb) & predProb >= 0.50)

# Predictors, actual target, fitted value and predicted class side by side.
dfPred <- data.frame(
  cbind(
    crimeboxcox[, c("nox", "age", "dis", "rad", "tax", "medv", "target")],
    predProb,
    predResp
  )
)

# NOTE(review): in the rendered confusion table the Pred-/Act- labels look
# transposed relative to the reported precision and sensitivity -- verify
# inside ModelMetrics().
MMcrimebc <- ModelMetrics(
  dfPred, "predResp", "target", 1, 0,
  "Model Metrics for Backwards Selection of BoxCox tranformed variables",
  fitbc$aic
)

## <table>
##  <thead>
##   <tr>
##    <th style="text-align:left;">   </th>
##    <th style="text-align:right;"> Act-Pos </th>
##    <th style="text-align:right;"> Act-Neg </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;"> Pred-Pos </td>
##    <td style="text-align:right;"> 179 </td>
##    <td style="text-align:right;"> 50 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> Pred-Neg </td>
##    <td style="text-align:right;"> 17 </td>
##    <td style="text-align:right;"> 220 </td>
##   </tr>
## </tbody>
## </table>
# Display the BoxCox model's metrics computed at the 0.50 cut-off.
MMcrimebc
##                Model Metrics for Backwards Selection of BoxCox tranformed variables
## accuracy                                                                      0.856
## classif.error                                                                 0.144
## precision                                                                     0.913
## sensitivity                                                                   0.782
## specificity                                                                   0.928
## f1score                                                                       0.842
## auc                                                                           0.771
## best.threshold                                                                0.400
## aic                                                                         217.233
# Score the training rows with the PCA model and label them at a 0.50 cut-off.
# NOTE(review): fitpca is built with glm() and no `family`, so predProb is a
# fitted value from a gaussian fit and is not guaranteed to lie in [0, 1].
predProb <- predict(
  fitpca,
  newdata = pca[, c("PC1", "PC2", "PC3", "PC4", "PC5", "PC11", "PC12")],
  type = "response"
)

# Predicted class: 1 where the fitted value reaches the cut-off, 0 otherwise
# (a missing fitted value is labelled 0).
predResp <- as.numeric(!is.na(predProb) & predProb >= 0.5)

# Components, actual target, fitted value and predicted class side by side.
dfPred <- data.frame(
  cbind(
    pca[, c("PC1", "PC2", "PC3", "PC4", "PC5", "PC11", "PC12", "target")],
    predProb,
    predResp
  )
)

MMpca <- ModelMetrics(
  dfPred, "predResp", "target", 1, 0,
  "Model Metrics for Backwards Selection of PCA variables",
  fitpca$aic
)

## <table>
##  <thead>
##   <tr>
##    <th style="text-align:left;">   </th>
##    <th style="text-align:right;"> Act-Pos </th>
##    <th style="text-align:right;"> Act-Neg </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;"> Pred-Pos </td>
##    <td style="text-align:right;"> 185 </td>
##    <td style="text-align:right;"> 44 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> Pred-Neg </td>
##    <td style="text-align:right;"> 18 </td>
##    <td style="text-align:right;"> 219 </td>
##   </tr>
## </tbody>
## </table>
# Display the PCA model's metrics computed at the 0.50 cut-off.
MMpca
##                Model Metrics for Backwards Selection of PCA variables
## accuracy                                                        0.867
## classif.error                                                   0.133
## precision                                                       0.911
## sensitivity                                                     0.808
## specificity                                                     0.924
## f1score                                                         0.856
## auc                                                             0.754
## best.threshold                                                  0.340
## aic                                                           218.235

The best threshold is 0.40 for the BoxCox model and 0.34 for the PCA model, so both models are re-scored below at those cut-offs.

# Re-score the BoxCox model, this time labelling at the 0.40 cut-off reported
# as its best threshold.
predProb <- predict(
  fitbc,
  newdata = crimeboxcox[, c("nox", "age", "dis", "rad", "tax", "medv")],
  type = "response"
)

# Predicted class: 1 where the fitted value reaches the cut-off, 0 otherwise
# (a missing fitted value is labelled 0).
predResp <- as.numeric(!is.na(predProb) & predProb >= 0.40)

# Predictors, actual target, fitted value and predicted class side by side.
dfPred <- data.frame(
  cbind(
    crimeboxcox[, c("nox", "age", "dis", "rad", "tax", "medv", "target")],
    predProb,
    predResp
  )
)

MMcrimebc <- ModelMetrics(
  dfPred, "predResp", "target", 1, 0,
  "Model Metrics for Backwards Selection of BoxCox transformed variables",
  fitbc$aic
)

## <table>
##  <thead>
##   <tr>
##    <th style="text-align:left;">   </th>
##    <th style="text-align:right;"> Act-Pos </th>
##    <th style="text-align:right;"> Act-Neg </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;"> Pred-Pos </td>
##    <td style="text-align:right;"> 210 </td>
##    <td style="text-align:right;"> 19 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> Pred-Neg </td>
##    <td style="text-align:right;"> 37 </td>
##    <td style="text-align:right;"> 200 </td>
##   </tr>
## </tbody>
## </table>
# Re-score the PCA model, this time labelling at the 0.34 cut-off reported as
# its best threshold.
predProb <- predict(
  fitpca,
  newdata = pca[, c("PC1", "PC2", "PC3", "PC4", "PC5", "PC11", "PC12")],
  type = "response"
)

# Predicted class: 1 where the fitted value reaches the cut-off, 0 otherwise
# (a missing fitted value is labelled 0).
predResp <- as.numeric(!is.na(predProb) & predProb >= 0.34)

# Components, actual target, fitted value and predicted class side by side.
dfPred <- data.frame(
  cbind(
    pca[, c("PC1", "PC2", "PC3", "PC4", "PC5", "PC11", "PC12", "target")],
    predProb,
    predResp
  )
)

MMpca <- ModelMetrics(
  dfPred, "predResp", "target", 1, 0,
  "Model Metrics for Backwards Selection of PCA variables",
  fitpca$aic
)

## <table>
##  <thead>
##   <tr>
##    <th style="text-align:left;">   </th>
##    <th style="text-align:right;"> Act-Pos </th>
##    <th style="text-align:right;"> Act-Neg </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;"> Pred-Pos </td>
##    <td style="text-align:right;"> 221 </td>
##    <td style="text-align:right;"> 8 </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> Pred-Neg </td>
##    <td style="text-align:right;"> 47 </td>
##    <td style="text-align:right;"> 190 </td>
##   </tr>
## </tbody>
## </table>
# Display the BoxCox model's metrics recomputed at the 0.40 cut-off.
MMcrimebc
##                Model Metrics for Backwards Selection of BoxCox transformed variables
## accuracy                                                                       0.880
## classif.error                                                                  0.120
## precision                                                                      0.850
## sensitivity                                                                    0.917
## specificity                                                                    0.844
## f1score                                                                        0.882
## auc                                                                            0.771
## best.threshold                                                                 0.400
## aic                                                                          217.233
# Display the PCA model's metrics recomputed at the 0.34 cut-off.
MMpca
##                Model Metrics for Backwards Selection of PCA variables
## accuracy                                                        0.882
## classif.error                                                   0.118
## precision                                                       0.825
## sensitivity                                                     0.965
## specificity                                                     0.802
## f1score                                                         0.889
## auc                                                             0.754
## best.threshold                                                  0.340
## aic                                                           218.235

The BoxCox transformation model has an accuracy of 88.0% and the Principal Component Analysis (PCA) model has an accuracy of 88.2%. The PCA model has a higher F1 score, but the BoxCox transformation model has the lower AIC and is easier to explain, because its predictors can be traced directly back to the original variables. I would use the BoxCox model.