First, set up cathing for this notebook.

knitr::opts_chunk$set(cache = TRUE)
options(scipen = 9999)
rm(list = ls())

Then, load the library packages for this projects.

library(dplyr) # for data manipulation
library(MASS) # for assessing the dataset
library(leaps) # for visualizing all-possible-regression methods
library(car) # for assessing multicollinearity

Load the crime dataset and check the structure.

crime <- read.csv("data_input/crime.csv") %>%
  dplyr::select(-X)

names(crime) <- c("percent_m", "is_south", "mean_education", "police_exp60", "police_exp59", "labour_participation", "m_per1000f", "state_pop", "nonwhites_per1000", "unemploy_m24", "unemploy_m39", "gdp", "inequality", "prob_prison", "time_prison", "crime_rate") # renaming the columns

str(crime)
## 'data.frame':    47 obs. of  16 variables:
##  $ percent_m           : int  151 143 142 136 141 121 127 131 157 140 ...
##  $ is_south            : int  1 0 1 0 0 0 1 1 1 0 ...
##  $ mean_education      : int  91 113 89 121 121 110 111 109 90 118 ...
##  $ police_exp60        : int  58 103 45 149 109 118 82 115 65 71 ...
##  $ police_exp59        : int  56 95 44 141 101 115 79 109 62 68 ...
##  $ labour_participation: int  510 583 533 577 591 547 519 542 553 632 ...
##  $ m_per1000f          : int  950 1012 969 994 985 964 982 969 955 1029 ...
##  $ state_pop           : int  33 13 18 157 18 25 4 50 39 7 ...
##  $ nonwhites_per1000   : int  301 102 219 80 30 44 139 179 286 15 ...
##  $ unemploy_m24        : int  108 96 94 102 91 84 97 79 81 100 ...
##  $ unemploy_m39        : int  41 36 33 39 20 29 38 35 28 24 ...
##  $ gdp                 : int  394 557 318 673 578 689 620 472 421 526 ...
##  $ inequality          : int  261 194 250 167 174 126 168 206 239 174 ...
##  $ prob_prison         : num  0.0846 0.0296 0.0834 0.0158 0.0414 ...
##  $ time_prison         : num  26.2 25.3 24.3 29.9 21.3 ...
##  $ crime_rate          : int  791 1635 578 1969 1234 682 963 1555 856 705 ...

Use step-wise regression for feature selection with backward elimination method to find the best model.

lm.all <- lm(crime_rate ~ ., crime)
step(lm.all, direction = "backward")
## Start:  AIC=514.65
## crime_rate ~ percent_m + is_south + mean_education + police_exp60 + 
##     police_exp59 + labour_participation + m_per1000f + state_pop + 
##     nonwhites_per1000 + unemploy_m24 + unemploy_m39 + gdp + inequality + 
##     prob_prison + time_prison
## 
##                        Df Sum of Sq     RSS    AIC
## - is_south              1        29 1354974 512.65
## - labour_participation  1      8917 1363862 512.96
## - time_prison           1     10304 1365250 513.00
## - state_pop             1     14122 1369068 513.14
## - nonwhites_per1000     1     18395 1373341 513.28
## - m_per1000f            1     31967 1386913 513.74
## - gdp                   1     37613 1392558 513.94
## - police_exp59          1     37919 1392865 513.95
## <none>                              1354946 514.65
## - unemploy_m24          1     83722 1438668 515.47
## - police_exp60          1    144306 1499252 517.41
## - unemploy_m39          1    181536 1536482 518.56
## - percent_m             1    193770 1548716 518.93
## - prob_prison           1    199538 1554484 519.11
## - mean_education        1    402117 1757063 524.86
## - inequality            1    423031 1777977 525.42
## 
## Step:  AIC=512.65
## crime_rate ~ percent_m + mean_education + police_exp60 + police_exp59 + 
##     labour_participation + m_per1000f + state_pop + nonwhites_per1000 + 
##     unemploy_m24 + unemploy_m39 + gdp + inequality + prob_prison + 
##     time_prison
## 
##                        Df Sum of Sq     RSS    AIC
## - time_prison           1     10341 1365315 511.01
## - labour_participation  1     10878 1365852 511.03
## - state_pop             1     14127 1369101 511.14
## - nonwhites_per1000     1     21626 1376600 511.39
## - m_per1000f            1     32449 1387423 511.76
## - police_exp59          1     37954 1392929 511.95
## - gdp                   1     39223 1394197 511.99
## <none>                              1354974 512.65
## - unemploy_m24          1     96420 1451395 513.88
## - police_exp60          1    144302 1499277 515.41
## - unemploy_m39          1    189859 1544834 516.81
## - percent_m             1    195084 1550059 516.97
## - prob_prison           1    204463 1559437 517.26
## - mean_education        1    403140 1758114 522.89
## - inequality            1    488834 1843808 525.13
## 
## Step:  AIC=511.01
## crime_rate ~ percent_m + mean_education + police_exp60 + police_exp59 + 
##     labour_participation + m_per1000f + state_pop + nonwhites_per1000 + 
##     unemploy_m24 + unemploy_m39 + gdp + inequality + prob_prison
## 
##                        Df Sum of Sq     RSS    AIC
## - labour_participation  1     10533 1375848 509.37
## - nonwhites_per1000     1     15482 1380797 509.54
## - state_pop             1     21846 1387161 509.75
## - police_exp59          1     28932 1394247 509.99
## - gdp                   1     36070 1401385 510.23
## - m_per1000f            1     41784 1407099 510.42
## <none>                              1365315 511.01
## - unemploy_m24          1     91420 1456735 512.05
## - police_exp60          1    134137 1499452 513.41
## - unemploy_m39          1    184143 1549458 514.95
## - percent_m             1    186110 1551425 515.01
## - prob_prison           1    237493 1602808 516.54
## - mean_education        1    409448 1774763 521.33
## - inequality            1    502909 1868224 523.75
## 
## Step:  AIC=509.37
## crime_rate ~ percent_m + mean_education + police_exp60 + police_exp59 + 
##     m_per1000f + state_pop + nonwhites_per1000 + unemploy_m24 + 
##     unemploy_m39 + gdp + inequality + prob_prison
## 
##                     Df Sum of Sq     RSS    AIC
## - nonwhites_per1000  1     11675 1387523 507.77
## - police_exp59       1     21418 1397266 508.09
## - state_pop          1     27803 1403651 508.31
## - m_per1000f         1     31252 1407100 508.42
## - gdp                1     35035 1410883 508.55
## <none>                           1375848 509.37
## - unemploy_m24       1     80954 1456802 510.06
## - police_exp60       1    123896 1499744 511.42
## - unemploy_m39       1    190746 1566594 513.47
## - percent_m          1    217716 1593564 514.27
## - prob_prison        1    226971 1602819 514.54
## - mean_education     1    413254 1789103 519.71
## - inequality         1    500944 1876792 521.96
## 
## Step:  AIC=507.77
## crime_rate ~ percent_m + mean_education + police_exp60 + police_exp59 + 
##     m_per1000f + state_pop + unemploy_m24 + unemploy_m39 + gdp + 
##     inequality + prob_prison
## 
##                  Df Sum of Sq     RSS    AIC
## - police_exp59    1     16706 1404229 506.33
## - state_pop       1     25793 1413315 506.63
## - m_per1000f      1     26785 1414308 506.66
## - gdp             1     31551 1419073 506.82
## <none>                        1387523 507.77
## - unemploy_m24    1     83881 1471404 508.52
## - police_exp60    1    118348 1505871 509.61
## - unemploy_m39    1    201453 1588976 512.14
## - prob_prison     1    216760 1604282 512.59
## - percent_m       1    309214 1696737 515.22
## - mean_education  1    402754 1790276 517.74
## - inequality      1    589736 1977259 522.41
## 
## Step:  AIC=506.33
## crime_rate ~ percent_m + mean_education + police_exp60 + m_per1000f + 
##     state_pop + unemploy_m24 + unemploy_m39 + gdp + inequality + 
##     prob_prison
## 
##                  Df Sum of Sq     RSS    AIC
## - state_pop       1     22345 1426575 505.07
## - gdp             1     32142 1436371 505.39
## - m_per1000f      1     36808 1441037 505.54
## <none>                        1404229 506.33
## - unemploy_m24    1     86373 1490602 507.13
## - unemploy_m39    1    205814 1610043 510.76
## - prob_prison     1    218607 1622836 511.13
## - percent_m       1    307001 1711230 513.62
## - mean_education  1    389502 1793731 515.83
## - inequality      1    608627 2012856 521.25
## - police_exp60    1   1050202 2454432 530.57
## 
## Step:  AIC=505.07
## crime_rate ~ percent_m + mean_education + police_exp60 + m_per1000f + 
##     unemploy_m24 + unemploy_m39 + gdp + inequality + prob_prison
## 
##                  Df Sum of Sq     RSS    AIC
## - gdp             1     26493 1453068 503.93
## <none>                        1426575 505.07
## - m_per1000f      1     84491 1511065 505.77
## - unemploy_m24    1     99463 1526037 506.24
## - prob_prison     1    198571 1625145 509.20
## - unemploy_m39    1    208880 1635455 509.49
## - percent_m       1    320926 1747501 512.61
## - mean_education  1    386773 1813348 514.35
## - inequality      1    594779 2021354 519.45
## - police_exp60    1   1127277 2553852 530.44
## 
## Step:  AIC=503.93
## crime_rate ~ percent_m + mean_education + police_exp60 + m_per1000f + 
##     unemploy_m24 + unemploy_m39 + inequality + prob_prison
## 
##                  Df Sum of Sq     RSS    AIC
## <none>                        1453068 503.93
## - m_per1000f      1    103159 1556227 505.16
## - unemploy_m24    1    127044 1580112 505.87
## - prob_prison     1    247978 1701046 509.34
## - unemploy_m39    1    255443 1708511 509.55
## - percent_m       1    296790 1749858 510.67
## - mean_education  1    445788 1898855 514.51
## - inequality      1    738244 2191312 521.24
## - police_exp60    1   1672038 3125105 537.93
## 
## Call:
## lm(formula = crime_rate ~ percent_m + mean_education + police_exp60 + 
##     m_per1000f + unemploy_m24 + unemploy_m39 + inequality + prob_prison, 
##     data = crime)
## 
## Coefficients:
##    (Intercept)       percent_m  mean_education    police_exp60  
##      -6426.101           9.332          18.012          10.265  
##     m_per1000f    unemploy_m24    unemploy_m39      inequality  
##          2.234          -6.087          18.735           6.133  
##    prob_prison  
##      -3796.032

Take a look for the summary of the resulted model.

model.crime_rate <- lm(
  formula = crime_rate ~ percent_m + mean_education + police_exp60 +
    m_per1000f + unemploy_m24 + unemploy_m39 + inequality + prob_prison,
  data = crime
)
summary(model.crime_rate)
## 
## Call:
## lm(formula = crime_rate ~ percent_m + mean_education + police_exp60 + 
##     m_per1000f + unemploy_m24 + unemploy_m39 + inequality + prob_prison, 
##     data = crime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -444.70 -111.07    3.03  122.15  483.30 
## 
## Coefficients:
##                 Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)    -6426.101   1194.611  -5.379 0.0000040395 ***
## percent_m          9.332      3.350   2.786      0.00828 ** 
## mean_education    18.012      5.275   3.414      0.00153 ** 
## police_exp60      10.265      1.552   6.613 0.0000000826 ***
## m_per1000f         2.234      1.360   1.642      0.10874    
## unemploy_m24      -6.087      3.339  -1.823      0.07622 .  
## unemploy_m39      18.735      7.248   2.585      0.01371 *  
## inequality         6.133      1.396   4.394 0.0000863344 ***
## prob_prison    -3796.032   1490.646  -2.547      0.01505 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 195.5 on 38 degrees of freedom
## Multiple R-squared:  0.7888, Adjusted R-squared:  0.7444 
## F-statistic: 17.74 on 8 and 38 DF,  p-value: 0.0000000001159

It is showed that some variables give p-value > alpha (0.05). Let’s check adjusted r-squared value for each possible subsets of the current model with all-possible-regression methods, using regs.

regs <- regsubsets(crime_rate ~ percent_m + mean_education + police_exp60 +
  m_per1000f + unemploy_m24 + unemploy_m39 + inequality + prob_prison,
data = crime, nbest = 10
)
plot(regs, scale = "adjr", main = "All possible regression: ranked by Adjusted R-squared")

From the plot above, we find m_per1000f not affecting much the adjusted R-squared. So, let’s create a new model with removing the variable.

model.crime_rate2 <- lm(
  formula = crime_rate ~ percent_m + mean_education + police_exp60 +
    unemploy_m24 + unemploy_m39 + inequality + prob_prison,
  data = crime
)
summary(model.crime_rate2)
## 
## Call:
## lm(formula = crime_rate ~ percent_m + mean_education + police_exp60 + 
##     unemploy_m24 + unemploy_m39 + inequality + prob_prison, data = crime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -520.76 -105.67    9.53  136.28  519.37 
## 
## Coefficients:
##                 Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)    -5095.552    896.895  -5.681 0.0000014350 ***
## percent_m         10.678      3.318   3.218       0.0026 ** 
## mean_education    21.845      4.833   4.520 0.0000562260 ***
## police_exp60      10.596      1.572   6.738 0.0000000491 ***
## unemploy_m24      -3.542      3.022  -1.172       0.2482    
## unemploy_m39      15.882      7.189   2.209       0.0331 *  
## inequality         6.633      1.392   4.767 0.0000260818 ***
## prob_prison    -3730.849   1522.207  -2.451       0.0188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 199.8 on 39 degrees of freedom
## Multiple R-squared:  0.7738, Adjusted R-squared:  0.7332 
## F-statistic: 19.06 on 7 and 39 DF,  p-value: 0.00000000008805

A variables, unemploy_m24 still give p-value > alpha (0.05). Let’s check again adjusted r-squared value for each possible subsets of the current model.

regs2 <- regsubsets(crime_rate ~ percent_m + mean_education + police_exp60 +
  unemploy_m24 + unemploy_m39 + inequality + prob_prison,
data = crime, nbest = 10
)
plot(regs2, scale = "adjr", main = "All possible regression: ranked by Adjusted R-squared")

Again, we find unemploy_m24 not affecting much the adjusted R-squared, then let’s create a new model with removing the variable.

model.crime_rate3 <- lm(
  formula = crime_rate ~ percent_m + mean_education + police_exp60 +
    unemploy_m39 + inequality + prob_prison,
  data = crime
)
summary(model.crime_rate3)
## 
## Call:
## lm(formula = crime_rate ~ percent_m + mean_education + police_exp60 + 
##     unemploy_m39 + inequality + prob_prison, data = crime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470.68  -78.41  -19.68  133.12  556.23 
## 
## Coefficients:
##                 Estimate Std. Error t value       Pr(>|t|)    
## (Intercept)    -5040.505    899.843  -5.602 0.000001715273 ***
## percent_m         10.502      3.330   3.154        0.00305 ** 
## mean_education    19.647      4.475   4.390 0.000080720160 ***
## police_exp60      11.502      1.375   8.363 0.000000000256 ***
## unemploy_m39       8.937      4.091   2.185        0.03483 *  
## inequality         6.765      1.394   4.855 0.000018793765 ***
## prob_prison    -3801.836   1528.097  -2.488        0.01711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 200.7 on 40 degrees of freedom
## Multiple R-squared:  0.7659, Adjusted R-squared:  0.7307 
## F-statistic: 21.81 on 6 and 40 DF,  p-value: 0.00000000003418

Now, no variable gives p-value > alpha (0.05). Let’s check again with regs to check whether we can improve the model.

regs3 <- regsubsets(crime_rate ~ percent_m + mean_education + police_exp60 +
  unemploy_m39 + inequality + prob_prison,
data = crime, nbest = 10
)
plot(regs3, scale = "adjr", main = "All possible regression: ranked by Adjusted R-squared")

It looks like we can’t reduce the variable again, so let’s check the residual plot to look for any unwanted patterns in the model.

plot(crime$crime_rate, residuals(model.crime_rate3), ylim = c(-600, 600), ylab = "Residuals", xlab = "Crime Rate")

It showed the random pattern, it means our model meets the linearity assumption.

vif(model.crime_rate3)
##      percent_m mean_education   police_exp60   unemploy_m39     inequality 
##       2.000244       2.862893       1.908124       1.363074       3.530429 
##    prob_prison 
##       1.378714

Lastly, let’s predict the crime_rate. I use the values near the mean of all variables.

summary(crime)
##    percent_m        is_south      mean_education   police_exp60  
##  Min.   :119.0   Min.   :0.0000   Min.   : 87.0   Min.   : 45.0  
##  1st Qu.:130.0   1st Qu.:0.0000   1st Qu.: 97.5   1st Qu.: 62.5  
##  Median :136.0   Median :0.0000   Median :108.0   Median : 78.0  
##  Mean   :138.6   Mean   :0.3404   Mean   :105.6   Mean   : 85.0  
##  3rd Qu.:146.0   3rd Qu.:1.0000   3rd Qu.:114.5   3rd Qu.:104.5  
##  Max.   :177.0   Max.   :1.0000   Max.   :122.0   Max.   :166.0  
##   police_exp59    labour_participation   m_per1000f       state_pop     
##  Min.   : 41.00   Min.   :480.0        Min.   : 934.0   Min.   :  3.00  
##  1st Qu.: 58.50   1st Qu.:530.5        1st Qu.: 964.5   1st Qu.: 10.00  
##  Median : 73.00   Median :560.0        Median : 977.0   Median : 25.00  
##  Mean   : 80.23   Mean   :561.2        Mean   : 983.0   Mean   : 36.62  
##  3rd Qu.: 97.00   3rd Qu.:593.0        3rd Qu.: 992.0   3rd Qu.: 41.50  
##  Max.   :157.00   Max.   :641.0        Max.   :1071.0   Max.   :168.00  
##  nonwhites_per1000  unemploy_m24     unemploy_m39        gdp       
##  Min.   :  2.0     Min.   : 70.00   Min.   :20.00   Min.   :288.0  
##  1st Qu.: 24.0     1st Qu.: 80.50   1st Qu.:27.50   1st Qu.:459.5  
##  Median : 76.0     Median : 92.00   Median :34.00   Median :537.0  
##  Mean   :101.1     Mean   : 95.47   Mean   :33.98   Mean   :525.4  
##  3rd Qu.:132.5     3rd Qu.:104.00   3rd Qu.:38.50   3rd Qu.:591.5  
##  Max.   :423.0     Max.   :142.00   Max.   :58.00   Max.   :689.0  
##    inequality     prob_prison       time_prison      crime_rate    
##  Min.   :126.0   Min.   :0.00690   Min.   :12.20   Min.   : 342.0  
##  1st Qu.:165.5   1st Qu.:0.03270   1st Qu.:21.60   1st Qu.: 658.5  
##  Median :176.0   Median :0.04210   Median :25.80   Median : 831.0  
##  Mean   :194.0   Mean   :0.04709   Mean   :26.60   Mean   : 905.1  
##  3rd Qu.:227.5   3rd Qu.:0.05445   3rd Qu.:30.45   3rd Qu.:1057.5  
##  Max.   :276.0   Max.   :0.11980   Max.   :44.00   Max.   :1993.0
predict(model.crime_rate3, data.frame(percent_m = 138, mean_education = 105.6, police_exp60 = 85, unemploy_m39 = 33.98, inequality = 194, prob_prison = 0.04709))
##        1 
## 898.3163

The result shows the predicted value is near our median value for crime_rate.