Build a multiple linear regression model on the crime data, with GDP as the target variable.
library(dplyr)
library(tidyverse)
library(GGally)
library(car)
library(caret)
library(lmtest)
library(MLmetrics)
library(knitr)
# Load the crime data and drop the `X` column
# (presumably a row index written by write.csv() -- TODO confirm).
crm <- read.csv("crime.csv") %>%
  dplyr::select(-X)

# Replace the original headers with descriptive names. The assignment is
# positional, so it relies on the remaining columns being in this order.
names(crm) <- c(
  "percent_m", "is_south", "mean_education", "police_exp60", "police_exp59",
  "labour_participation", "m_per1000f", "state_pop", "nonwhites_per1000",
  "unemploy_m24", "unemploy_m39", "gdp", "inequality", "prob_prison",
  "time_prison", "crime_rate"
)

# Pairwise correlation plot with the coefficients printed on the tiles.
# `TRUE` is used instead of `T` because `T` is an ordinary, reassignable
# variable.
ggcorr(crm, label = TRUE, layout.exp = 10)
From the plot above we can conclude that the variables are linearly correlated with one another.
# Full model: gdp regressed on every other column of crm.
gdpA <- lm(gdp ~ ., data = crm)
# Intercept-only model, used as the lower bound of the stepwise searches.
gdpN <- lm(gdp ~ 1, data = crm)
# Backward elimination by AIC, starting from the full model.
back <- step(gdpA, direction = "backward") # AIC Result = 336.19
## Start: AIC=348.58
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## police_exp59 + labour_participation + m_per1000f + state_pop +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - police_exp60 1 10.4 39583 346.59
## - prob_prison 1 46.3 39619 346.64
## - unemploy_m24 1 65.7 39638 346.66
## - mean_education 1 95.2 39668 346.69
## - m_per1000f 1 264.4 39837 346.89
## - police_exp59 1 267.6 39840 346.90
## - unemploy_m39 1 325.5 39898 346.97
## - state_pop 1 417.9 39991 347.07
## - time_prison 1 644.1 40217 347.34
## - labour_participation 1 847.0 40420 347.58
## - crime_rate 1 1098.5 40671 347.87
## - percent_m 1 1686.7 41259 348.54
## <none> 39573 348.58
## - nonwhites_per1000 1 1949.9 41523 348.84
## - is_south 1 2175.5 41748 349.10
## - inequality 1 21413.3 60986 366.91
##
## Step: AIC=346.59
## gdp ~ percent_m + is_south + mean_education + police_exp59 +
## labour_participation + m_per1000f + state_pop + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - prob_prison 1 67.7 39651 344.67
## - unemploy_m24 1 70.9 39654 344.68
## - mean_education 1 124.0 39707 344.74
## - m_per1000f 1 259.2 39842 344.90
## - unemploy_m39 1 329.8 39913 344.98
## - state_pop 1 409.9 39993 345.08
## - time_prison 1 658.4 40242 345.37
## - labour_participation 1 848.9 40432 345.59
## - crime_rate 1 1144.1 40727 345.93
## - percent_m 1 1676.2 41259 346.54
## <none> 39583 346.59
## - nonwhites_per1000 1 1980.0 41563 346.89
## - is_south 1 2169.5 41753 347.10
## - police_exp59 1 2669.7 42253 347.66
## - inequality 1 21522.5 61106 365.00
##
## Step: AIC=344.67
## gdp ~ percent_m + is_south + mean_education + police_exp59 +
## labour_participation + m_per1000f + state_pop + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + time_prison +
## crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m24 1 58.9 39710 342.74
## - mean_education 1 104.5 39755 342.80
## - m_per1000f 1 253.0 39904 342.97
## - unemploy_m39 1 302.2 39953 343.03
## - state_pop 1 414.5 40065 343.16
## - labour_participation 1 877.5 40528 343.70
## - time_prison 1 1224.5 40875 344.10
## - crime_rate 1 1448.6 41099 344.36
## - percent_m 1 1707.7 41359 344.65
## <none> 39651 344.67
## - is_south 1 2104.2 41755 345.10
## - nonwhites_per1000 1 2425.2 42076 345.46
## - police_exp59 1 2700.8 42352 345.77
## - inequality 1 21779.9 61431 363.25
##
## Step: AIC=342.74
## gdp ~ percent_m + is_south + mean_education + police_exp59 +
## labour_participation + m_per1000f + state_pop + nonwhites_per1000 +
## unemploy_m39 + inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - mean_education 1 70.7 39780 340.83
## - m_per1000f 1 195.7 39905 340.97
## - unemploy_m39 1 336.0 40046 341.14
## - state_pop 1 376.7 40086 341.19
## - time_prison 1 1295.3 41005 342.25
## - labour_participation 1 1355.3 41065 342.32
## - crime_rate 1 1659.3 41369 342.67
## - percent_m 1 1714.6 41424 342.73
## <none> 39710 342.74
## - nonwhites_per1000 1 2676.8 42387 343.81
## - is_south 1 2881.5 42591 344.04
## - police_exp59 1 3048.3 42758 344.22
## - inequality 1 22171.8 61881 361.59
##
## Step: AIC=340.83
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## m_per1000f + state_pop + nonwhites_per1000 + unemploy_m39 +
## inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - m_per1000f 1 252 40032 339.12
## - unemploy_m39 1 272 40052 339.15
## - state_pop 1 400 40180 339.30
## - time_prison 1 1226 41006 340.25
## - labour_participation 1 1523 41304 340.59
## <none> 39780 340.83
## - percent_m 1 1989 41769 341.12
## - crime_rate 1 2300 42080 341.47
## - nonwhites_per1000 1 2765 42545 341.98
## - is_south 1 2852 42632 342.08
## - police_exp59 1 2982 42762 342.22
## - inequality 1 31943 71723 366.53
##
## Step: AIC=339.12
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## state_pop + nonwhites_per1000 + unemploy_m39 + inequality +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - state_pop 1 246 40278 337.41
## - unemploy_m39 1 603 40635 337.83
## - time_prison 1 994 41026 338.28
## <none> 40032 339.12
## - percent_m 1 1764 41796 339.15
## - crime_rate 1 2691 42723 340.18
## - is_south 1 2761 42793 340.26
## - labour_participation 1 2767 42799 340.26
## - police_exp59 1 3040 43072 340.56
## - nonwhites_per1000 1 3223 43255 340.76
## - inequality 1 31754 71786 364.57
##
## Step: AIC=337.41
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## nonwhites_per1000 + unemploy_m39 + inequality + time_prison +
## crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m39 1 671 40950 336.19
## <none> 40278 337.41
## - time_prison 1 2013 42291 337.70
## - percent_m 1 2079 42357 337.78
## - crime_rate 1 2506 42784 338.25
## - is_south 1 2724 43003 338.49
## - labour_participation 1 2738 43016 338.50
## - nonwhites_per1000 1 3198 43476 339.00
## - police_exp59 1 4687 44965 340.58
## - inequality 1 32733 73012 363.37
##
## Step: AIC=336.19
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## nonwhites_per1000 + inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## <none> 40950 336.19
## - time_prison 1 2021 42971 336.45
## - labour_participation 1 2067 43016 336.50
## - is_south 1 2399 43349 336.86
## - nonwhites_per1000 1 2838 43788 337.34
## - percent_m 1 3660 44610 338.21
## - crime_rate 1 3696 44646 338.25
## - police_exp59 1 4262 45211 338.84
## - inequality 1 32326 73276 361.54
# Forward selection by AIC: start from the intercept-only model and add
# terms, never going beyond the full model.
forward <- step(
  gdpN,
  scope = list(lower = gdpN, upper = gdpA),
  direction = "forward"
) # AIC Result = 334.66
## Start: AIC=430.52
## gdp ~ 1
##
## Df Sum of Sq RSS AIC
## + inequality 1 334682 93601 361.04
## + police_exp59 1 270183 158100 385.68
## + police_exp60 1 265417 162866 387.08
## + mean_education 1 231997 196286 395.85
## + percent_m 1 192288 235995 404.51
## + is_south 1 173754 254529 408.06
## + nonwhites_per1000 1 149139 279144 412.40
## + prob_prison 1 132081 296202 415.19
## + crime_rate 1 83414 344869 422.34
## + state_pop 1 40698 387585 427.82
## + labour_participation 1 37178 391105 428.25
## <none> 428283 430.52
## + m_per1000f 1 13816 414467 430.98
## + unemploy_m39 1 3631 424652 432.12
## + unemploy_m24 1 862 427421 432.42
## + time_prison 1 0 428283 432.52
##
## Step: AIC=361.04
## gdp ~ inequality
##
## Df Sum of Sq RSS AIC
## + police_exp60 1 37561 56039 338.93
## + police_exp59 1 36169 57432 340.09
## + crime_rate 1 35452 58148 340.67
## + state_pop 1 16825 76775 353.73
## + prob_prison 1 11334 82267 356.98
## + percent_m 1 7983 85618 358.85
## + unemploy_m39 1 4807 88794 360.56
## <none> 93601 361.04
## + time_prison 1 3557 90044 361.22
## + mean_education 1 3342 90259 361.33
## + labour_participation 1 1451 92149 362.31
## + m_per1000f 1 448 93152 362.82
## + is_south 1 203 93397 362.94
## + nonwhites_per1000 1 59 93542 363.01
## + unemploy_m24 1 58 93543 363.01
##
## Step: AIC=338.93
## gdp ~ inequality + police_exp60
##
## Df Sum of Sq RSS AIC
## + nonwhites_per1000 1 4868.7 51171 336.66
## + crime_rate 1 4151.6 51888 337.31
## + mean_education 1 3418.7 52621 337.97
## + prob_prison 1 3333.6 52706 338.05
## + percent_m 1 3231.5 52808 338.14
## + labour_participation 1 2583.2 53456 338.71
## <none> 56039 338.93
## + m_per1000f 1 1558.6 54481 339.61
## + unemploy_m39 1 454.8 55585 340.55
## + state_pop 1 450.4 55589 340.55
## + is_south 1 404.4 55635 340.59
## + time_prison 1 324.5 55715 340.66
## + unemploy_m24 1 182.1 55857 340.78
## + police_exp59 1 125.3 55914 340.83
##
## Step: AIC=336.66
## gdp ~ inequality + police_exp60 + nonwhites_per1000
##
## Df Sum of Sq RSS AIC
## + crime_rate 1 3243.9 47927 335.58
## <none> 51171 336.66
## + prob_prison 1 1475.0 49696 337.29
## + mean_education 1 1413.8 49757 337.34
## + labour_participation 1 1344.1 49827 337.41
## + percent_m 1 1074.0 50097 337.66
## + time_prison 1 847.5 50323 337.88
## + state_pop 1 580.5 50590 338.12
## + unemploy_m39 1 460.5 50710 338.24
## + m_per1000f 1 426.5 50744 338.27
## + is_south 1 351.6 50819 338.34
## + unemploy_m24 1 24.3 51146 338.64
## + police_exp59 1 1.1 51170 338.66
##
## Step: AIC=335.58
## gdp ~ inequality + police_exp60 + nonwhites_per1000 + crime_rate
##
## Df Sum of Sq RSS AIC
## + percent_m 1 2889.60 45037 334.66
## <none> 47927 335.58
## + state_pop 1 1356.90 46570 336.23
## + time_prison 1 748.56 47178 336.84
## + prob_prison 1 515.30 47412 337.07
## + is_south 1 489.83 47437 337.10
## + unemploy_m39 1 483.32 47444 337.11
## + labour_participation 1 455.09 47472 337.13
## + mean_education 1 193.26 47734 337.39
## + police_exp59 1 35.18 47892 337.55
## + unemploy_m24 1 16.77 47910 337.57
## + m_per1000f 1 1.57 47925 337.58
##
## Step: AIC=334.66
## gdp ~ inequality + police_exp60 + nonwhites_per1000 + crime_rate +
## percent_m
##
## Df Sum of Sq RSS AIC
## <none> 45037 334.66
## + time_prison 1 886.41 44151 335.73
## + is_south 1 758.62 44279 335.86
## + state_pop 1 756.54 44281 335.86
## + labour_participation 1 467.35 44570 336.17
## + prob_prison 1 461.15 44576 336.18
## + unemploy_m24 1 97.38 44940 336.56
## + mean_education 1 68.19 44969 336.59
## + m_per1000f 1 23.28 45014 336.64
## + police_exp59 1 15.66 45022 336.64
## + unemploy_m39 1 14.42 45023 336.64
# Stepwise selection in both directions by AIC, starting from the full model
# and bounded by the intercept-only and full models.
both <- step(
  gdpA,
  scope = list(lower = gdpN, upper = gdpA),
  direction = "both"
) # AIC Result = 336.19
## Start: AIC=348.58
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## police_exp59 + labour_participation + m_per1000f + state_pop +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - police_exp60 1 10.4 39583 346.59
## - prob_prison 1 46.3 39619 346.64
## - unemploy_m24 1 65.7 39638 346.66
## - mean_education 1 95.2 39668 346.69
## - m_per1000f 1 264.4 39837 346.89
## - police_exp59 1 267.6 39840 346.90
## - unemploy_m39 1 325.5 39898 346.97
## - state_pop 1 417.9 39991 347.07
## - time_prison 1 644.1 40217 347.34
## - labour_participation 1 847.0 40420 347.58
## - crime_rate 1 1098.5 40671 347.87
## - percent_m 1 1686.7 41259 348.54
## <none> 39573 348.58
## - nonwhites_per1000 1 1949.9 41523 348.84
## - is_south 1 2175.5 41748 349.10
## - inequality 1 21413.3 60986 366.91
##
## Step: AIC=346.59
## gdp ~ percent_m + is_south + mean_education + police_exp59 +
## labour_participation + m_per1000f + state_pop + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - prob_prison 1 67.7 39651 344.67
## - unemploy_m24 1 70.9 39654 344.68
## - mean_education 1 124.0 39707 344.74
## - m_per1000f 1 259.2 39842 344.90
## - unemploy_m39 1 329.8 39913 344.98
## - state_pop 1 409.9 39993 345.08
## - time_prison 1 658.4 40242 345.37
## - labour_participation 1 848.9 40432 345.59
## - crime_rate 1 1144.1 40727 345.93
## - percent_m 1 1676.2 41259 346.54
## <none> 39583 346.59
## - nonwhites_per1000 1 1980.0 41563 346.89
## - is_south 1 2169.5 41753 347.10
## - police_exp59 1 2669.7 42253 347.66
## + police_exp60 1 10.4 39573 348.58
## - inequality 1 21522.5 61106 365.00
##
## Step: AIC=344.67
## gdp ~ percent_m + is_south + mean_education + police_exp59 +
## labour_participation + m_per1000f + state_pop + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + time_prison +
## crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m24 1 58.9 39710 342.74
## - mean_education 1 104.5 39755 342.80
## - m_per1000f 1 253.0 39904 342.97
## - unemploy_m39 1 302.2 39953 343.03
## - state_pop 1 414.5 40065 343.16
## - labour_participation 1 877.5 40528 343.70
## - time_prison 1 1224.5 40875 344.10
## - crime_rate 1 1448.6 41099 344.36
## - percent_m 1 1707.7 41359 344.65
## <none> 39651 344.67
## - is_south 1 2104.2 41755 345.10
## - nonwhites_per1000 1 2425.2 42076 345.46
## - police_exp59 1 2700.8 42352 345.77
## + prob_prison 1 67.7 39583 346.59
## + police_exp60 1 31.9 39619 346.64
## - inequality 1 21779.9 61431 363.25
##
## Step: AIC=342.74
## gdp ~ percent_m + is_south + mean_education + police_exp59 +
## labour_participation + m_per1000f + state_pop + nonwhites_per1000 +
## unemploy_m39 + inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - mean_education 1 70.7 39780 340.83
## - m_per1000f 1 195.7 39905 340.97
## - unemploy_m39 1 336.0 40046 341.14
## - state_pop 1 376.7 40086 341.19
## - time_prison 1 1295.3 41005 342.25
## - labour_participation 1 1355.3 41065 342.32
## - crime_rate 1 1659.3 41369 342.67
## - percent_m 1 1714.6 41424 342.73
## <none> 39710 342.74
## - nonwhites_per1000 1 2676.8 42387 343.81
## - is_south 1 2881.5 42591 344.04
## - police_exp59 1 3048.3 42758 344.22
## + unemploy_m24 1 58.9 39651 344.67
## + prob_prison 1 55.7 39654 344.68
## + police_exp60 1 36.6 39673 344.70
## - inequality 1 22171.8 61881 361.59
##
## Step: AIC=340.83
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## m_per1000f + state_pop + nonwhites_per1000 + unemploy_m39 +
## inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - m_per1000f 1 252 40032 339.12
## - unemploy_m39 1 272 40052 339.15
## - state_pop 1 400 40180 339.30
## - time_prison 1 1226 41006 340.25
## - labour_participation 1 1523 41304 340.59
## <none> 39780 340.83
## - percent_m 1 1989 41769 341.12
## - crime_rate 1 2300 42080 341.47
## - nonwhites_per1000 1 2765 42545 341.98
## - is_south 1 2852 42632 342.08
## - police_exp59 1 2982 42762 342.22
## + mean_education 1 71 39710 342.74
## + police_exp60 1 62 39719 342.75
## + prob_prison 1 44 39737 342.77
## + unemploy_m24 1 25 39755 342.80
## - inequality 1 31943 71723 366.53
##
## Step: AIC=339.12
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## state_pop + nonwhites_per1000 + unemploy_m39 + inequality +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - state_pop 1 246 40278 337.41
## - unemploy_m39 1 603 40635 337.83
## - time_prison 1 994 41026 338.28
## <none> 40032 339.12
## - percent_m 1 1764 41796 339.15
## - crime_rate 1 2691 42723 340.18
## - is_south 1 2761 42793 340.26
## - labour_participation 1 2767 42799 340.26
## - police_exp59 1 3040 43072 340.56
## - nonwhites_per1000 1 3223 43255 340.76
## + m_per1000f 1 252 39780 340.83
## + mean_education 1 127 39905 340.97
## + prob_prison 1 47 39985 341.07
## + police_exp60 1 46 39986 341.07
## + unemploy_m24 1 25 40008 341.09
## - inequality 1 31754 71786 364.57
##
## Step: AIC=337.41
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## nonwhites_per1000 + unemploy_m39 + inequality + time_prison +
## crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m39 1 671 40950 336.19
## <none> 40278 337.41
## - time_prison 1 2013 42291 337.70
## - percent_m 1 2079 42357 337.78
## - crime_rate 1 2506 42784 338.25
## - is_south 1 2724 43003 338.49
## - labour_participation 1 2738 43016 338.50
## - nonwhites_per1000 1 3198 43476 339.00
## + state_pop 1 246 40032 339.12
## + mean_education 1 127 40151 339.26
## + m_per1000f 1 98 40180 339.30
## + prob_prison 1 51 40227 339.35
## + police_exp60 1 37 40241 339.37
## + unemploy_m24 1 17 40261 339.39
## - police_exp59 1 4687 44965 340.58
## - inequality 1 32733 73012 363.37
##
## Step: AIC=336.19
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## nonwhites_per1000 + inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## <none> 40950 336.19
## - time_prison 1 2021 42971 336.45
## - labour_participation 1 2067 43016 336.50
## - is_south 1 2399 43349 336.86
## - nonwhites_per1000 1 2838 43788 337.34
## + unemploy_m39 1 671 40278 337.41
## + unemploy_m24 1 532 40418 337.57
## + m_per1000f 1 337 40613 337.80
## + state_pop 1 314 40635 337.83
## + prob_prison 1 46 40904 338.14
## + mean_education 1 13 40937 338.17
## + police_exp60 1 1 40949 338.19
## - percent_m 1 3660 44610 338.21
## - crime_rate 1 3696 44646 338.25
## - police_exp59 1 4262 45211 338.84
## - inequality 1 32326 73276 361.54
# Refit the formulas chosen by each stepwise direction as explicit models.
# Backward selection.
gdpB <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crm
)
# Forward selection.
gdpF <- lm(
  gdp ~ inequality + police_exp60 + nonwhites_per1000 + crime_rate + percent_m,
  data = crm
)
# Both directions (same formula as the backward model).
gdpBt <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crm
)
# Adjusted R-squared of the full, backward and forward models.
summary(gdpA)[["adj.r.squared"]]
## [1] 0.8628926
summary(gdpB)[["adj.r.squared"]]
## [1] 0.884257
summary(gdpF)[["adj.r.squared"]]
## [1] 0.8820182
Multicollinearity check (pass if VIF < 10)
# Variance inflation factors for the full model.
car::vif(gdpA)
## percent_m is_south mean_education
## 3.170944 5.064479 6.568520
## police_exp60 police_exp59 labour_participation
## 115.774639 115.953281 3.658815
## m_per1000f state_pop nonwhites_per1000
## 3.849536 2.536364 4.515073
## unemploy_m24 unemploy_m39 inequality
## 6.427948 5.723608 7.360566
## prob_prison time_prison crime_rate
## 3.219434 2.690628 4.941213
# Variance inflation factors for the backward model.
car::vif(gdpB)
## percent_m is_south police_exp59
## 2.293891 4.023964 5.531691
## labour_participation nonwhites_per1000 inequality
## 1.575476 3.521800 4.862590
## time_prison crime_rate
## 1.139803 2.785642
# Variance inflation factors for the forward model.
car::vif(gdpF)
## inequality police_exp60 nonwhites_per1000 crime_rate
## 4.051937 5.303696 2.653074 2.686759
## percent_m
## 2.268081
Heteroscedasticity check with the Breusch-Pagan test (pass if p-value > 0.05)
# Breusch-Pagan test on the full model.
lmtest::bptest(gdpA)
##
## studentized Breusch-Pagan test
##
## data: gdpA
## BP = 11.421, df = 15, p-value = 0.7222
# Breusch-Pagan test on the backward model.
lmtest::bptest(gdpB)
##
## studentized Breusch-Pagan test
##
## data: gdpB
## BP = 8.1288, df = 8, p-value = 0.421
# Breusch-Pagan test on the forward model.
lmtest::bptest(gdpF)
##
## studentized Breusch-Pagan test
##
## data: gdpF
## BP = 1.7128, df = 5, p-value = 0.8873
Residual normality check with the Shapiro-Wilk test (pass if p-value > 0.05)
# Shapiro-Wilk test on the residuals of the full model.
stats::shapiro.test(gdpA$residuals)
##
## Shapiro-Wilk normality test
##
## data: gdpA$residuals
## W = 0.96012, p-value = 0.1087
# Shapiro-Wilk test on the residuals of the backward model.
stats::shapiro.test(gdpB$residuals)
##
## Shapiro-Wilk normality test
##
## data: gdpB$residuals
## W = 0.95329, p-value = 0.05833
# Shapiro-Wilk test on the residuals of the forward model.
stats::shapiro.test(gdpF$residuals)
##
## Shapiro-Wilk normality test
##
## data: gdpF$residuals
## W = 0.97169, p-value = 0.3068
The Backward and Both directions select the same variables, so only one of them (Backward) is kept.
The AIC from the Forward direction (334.66) is smaller, and therefore better, than the AIC from the Backward direction (336.19), but the Adj R Squared from Forward (0.8820182) is smaller than the one from Backward (0.884257).
Models F and B pass the assumption checks for multicollinearity, error pattern (heteroscedasticity), and residual normality. Model A does not pass the multicollinearity check (police_exp60 and police_exp59 have VIF values above 100), but I continue to investigate it to see whether it gives a better stepwise result after outlier removal.
Continue to investigate model A, B, and F.
# Reshape every column of crm into long form (one row per variable/value
# pair) so each variable can be drawn in its own boxplot facet.
# pivot_longer() replaces the superseded gather(); the rows come out in a
# different order, which does not affect the boxplots.
crmA <- crm %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable_name",
    values_to = "variable_value"
  )

# One boxplot per variable, each facet with its own axis scale.
ggplot(crmA, aes(y = variable_value)) +
  geom_boxplot() +
  facet_wrap(~variable_name, scales = "free")
Observe the boxplot of each dependent and independent variable, find the outlier threshold, then try removing the outliers of each variable in turn to see whether the model gets a better Adj R Square.
Initial Adj R Square for Model A (all predictors) : 0.8628926
# Outlier candidate 1: keep only rows with crime_rate <= 1800, refit the
# full model and report its adjusted R-squared.
crmA1 <- crm %>%
  filter(crime_rate <= 1800)
gdpA1 <- lm(gdp ~ ., data = crmA1)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA1)$adj.r.squared
## [1] 0.8460174
Decrease in Adj R Square, keep the outlier.
# Outlier candidate 2: keep only rows with nonwhites_per1000 <= 300, refit
# the full model and report its adjusted R-squared.
crmA2 <- crm %>%
  filter(nonwhites_per1000 <= 300)
gdpA2 <- lm(gdp ~ ., data = crmA2)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA2)$adj.r.squared
## [1] 0.8131418
Decrease in Adj R Square, keep the outlier.
# Outlier candidate 3: keep only rows with percent_m <= 170, refit the full
# model and report its adjusted R-squared.
crmA3 <- crm %>%
  filter(percent_m <= 170)
gdpA3 <- lm(gdp ~ ., data = crmA3)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA3)$adj.r.squared
## [1] 0.8542544
Decrease in Adj R Square, keep the outlier.
# Outlier candidate 4: keep only rows with police_exp59 <= 150, refit the
# full model and report its adjusted R-squared.
crmA4 <- crm %>%
  filter(police_exp59 <= 150)
gdpA4 <- lm(gdp ~ ., data = crmA4)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA4)$adj.r.squared
## [1] 0.8637712
Increase in Adj R Square, remove the outlier.
# Outlier candidate 5: keep only rows with time_prison <= 42, refit the full
# model and report its adjusted R-squared.
crmA5 <- crm %>%
  filter(time_prison <= 42)
gdpA5 <- lm(gdp ~ ., data = crmA5)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA5)$adj.r.squared
## [1] 0.8779584
Increase in Adj R Square, remove the outlier.
# Outlier candidate 6: keep only rows with m_per1000f <= 1060, refit the
# full model and report its adjusted R-squared.
crmA6 <- crm %>%
  filter(m_per1000f <= 1060)
gdpA6 <- lm(gdp ~ ., data = crmA6)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA6)$adj.r.squared
## [1] 0.8563324
Decrease in Adj R Square, keep the outlier.
# Outlier candidate 7: keep only rows with prob_prison <= 0.1, refit the
# full model and report its adjusted R-squared.
crmA7 <- crm %>%
  filter(prob_prison <= 0.1)
gdpA7 <- lm(gdp ~ ., data = crmA7)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA7)$adj.r.squared
## [1] 0.8730323
Increase in Adj R Square, remove the outlier.
8. state_pop <= 100
# Outlier candidate 8: keep only rows with state_pop <= 100, refit the full
# model and report its adjusted R-squared.
crmA8 <- crm %>%
  filter(state_pop <= 100)
gdpA8 <- lm(gdp ~ ., data = crmA8)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA8)$adj.r.squared
## [1] 0.8504931
Decrease in Adj R Square, keep the outlier.
# Outlier candidate 9: keep only rows with unemploy_m24 <= 140, refit the
# full model and report its adjusted R-squared.
crmA9 <- crm %>%
  filter(unemploy_m24 <= 140)
gdpA9 <- lm(gdp ~ ., data = crmA9)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA9)$adj.r.squared
## [1] 0.8622883
Decrease in Adj R Square, keep the outlier.
# Outlier candidate 10: keep only rows with unemploy_m39 <= 55, refit the
# full model and report its adjusted R-squared.
crmA10 <- crm %>%
  filter(unemploy_m39 <= 55)
gdpA10 <- lm(gdp ~ ., data = crmA10)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpA10)$adj.r.squared
## [1] 0.8584968
Decrease in Adj R Square, keep the outlier.
Result for Model A after eliminating the outliers
# Apply together the three filters that raised Model A's adjusted R-squared.
crmAF <- crm %>%
  filter(police_exp59 <= 150 & time_prison <= 42 & prob_prison <= 0.1)

# Full and intercept-only models on the filtered data.
# Note: this reassigns gdpA1, which was used earlier for the crime_rate
# outlier trial.
gdpA1 <- lm(gdp ~ ., data = crmAF)
gdpN1 <- lm(gdp ~ 1, data = crmAF)
Model B (Backward): GDP ~ percent_m + is_south + police_exp59 + labour_participation + nonwhites_per1000 + inequality + time_prison + crime_rate
# Reshape the response and the Model B predictors into long form so each
# variable gets its own boxplot facet; the other columns are left as they are.
# pivot_longer() replaces the superseded gather(); the rows come out in a
# different order, which does not affect the boxplots.
crmB <- crm %>%
  pivot_longer(
    cols = c(
      gdp, percent_m, is_south, police_exp59, labour_participation,
      nonwhites_per1000, inequality, time_prison, crime_rate
    ),
    names_to = "variable_name",
    values_to = "variable_value"
  )

# One boxplot per variable, each facet with its own axis scale.
ggplot(crmB, aes(y = variable_value)) +
  geom_boxplot() +
  facet_wrap(~variable_name, scales = "free")
Observe the boxplot of each dependent and independent variable, find the outlier threshold, then try removing the outliers of each variable in turn to see whether the model gets a better Adj R Square.
Initial Adj R Square for Backward model : 0.884257
# Model B, outlier candidate 1: keep only rows with crime_rate <= 1800,
# refit the backward formula and report its adjusted R-squared.
crmB1 <- crm %>%
  filter(crime_rate <= 1800)
gdpB1 <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crmB1
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpB1)$adj.r.squared
## [1] 0.8709542
Decrease in Adj R Square, keep the outlier.
# Model B, outlier candidate 2: keep only rows with nonwhites_per1000 <= 300,
# refit the backward formula and report its adjusted R-squared.
crmB2 <- crm %>%
  filter(nonwhites_per1000 <= 300)
gdpB2 <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crmB2
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpB2)$adj.r.squared
## [1] 0.8450358
Decrease in Adj R Square, keep the outlier.
# Model B, outlier candidate 3: keep only rows with percent_m <= 170, refit
# the backward formula and report its adjusted R-squared.
crmB3 <- crm %>%
  filter(percent_m <= 170)
gdpB3 <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crmB3
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpB3)$adj.r.squared
## [1] 0.8777601
Decrease in Adj R Square, keep the outlier.
# Model B, outlier candidate 4: keep only rows with police_exp59 <= 150,
# refit the backward formula and report its adjusted R-squared.
crmB4 <- crm %>%
  filter(police_exp59 <= 150)
gdpB4 <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crmB4
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpB4)$adj.r.squared
## [1] 0.8843402
Increase in Adj R Square, remove the outlier.
# Model B, outlier candidate 5: keep only rows with time_prison <= 42, refit
# the backward formula and report its adjusted R-squared.
crmB5 <- crm %>%
  filter(time_prison <= 42)
gdpB5 <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crmB5
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpB5)$adj.r.squared
## [1] 0.8963129
Increase in Adj R Square, remove the outlier.
Result for Model B after eliminating the outliers
# Apply together the two filters that raised Model B's adjusted R-squared.
crmB6 <- crm %>%
  filter(police_exp59 <= 150 & time_prison <= 42)

# Backward-formula model and intercept-only model on the filtered data.
gdpB6 <- lm(
  gdp ~ percent_m + is_south + police_exp59 + labour_participation +
    nonwhites_per1000 + inequality + time_prison + crime_rate,
  data = crmB6
)
gdpB6n <- lm(gdp ~ 1, data = crmB6)
Model F (Forward): GDP ~ inequality + police_exp60 + nonwhites_per1000 + crime_rate + percent_m
# Reshape the response and the Model F predictors into long form so each
# variable gets its own boxplot facet; the other columns are left as they are.
# pivot_longer() replaces the superseded gather(); the rows come out in a
# different order, which does not affect the boxplots.
crmF <- crm %>%
  pivot_longer(
    cols = c(
      gdp, percent_m, nonwhites_per1000, inequality, crime_rate, police_exp60
    ),
    names_to = "variable_name",
    values_to = "variable_value"
  )

# One boxplot per variable, each facet with its own axis scale.
ggplot(crmF, aes(y = variable_value)) +
  geom_boxplot() +
  facet_wrap(~variable_name, scales = "free")
Observe the boxplot of each dependent and independent variable, find the outlier threshold, then try removing the outliers of each variable in turn to see whether the model gets a better Adj R Square.
Initial Adj R Square for Forward model : 0.8820182
# Model F, outlier candidate 1: keep only rows with crime_rate <= 1800,
# refit the forward formula and report its adjusted R-squared.
crmF1 <- crm %>%
  filter(crime_rate <= 1800)
gdpF1 <- lm(
  gdp ~ inequality + police_exp60 + nonwhites_per1000 + crime_rate + percent_m,
  data = crmF1
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpF1)$adj.r.squared
## [1] 0.8690229
Decrease in Adj R Square, keep the outlier.
# Model F, outlier candidate 2: keep only rows with nonwhites_per1000 <= 300.
crmF2 <- crm %>%
  filter(nonwhites_per1000 <= 300)
# Fit the Model F (forward-selection) formula. The previous code fitted the
# Model B formula here by mistake, so the recorded output after this chunk
# (0.8450358, identical to Model B's value) is stale and must be regenerated
# before the keep/remove decision is trusted.
gdpF2 <- lm(
  gdp ~ inequality + police_exp60 + nonwhites_per1000 + crime_rate + percent_m,
  data = crmF2
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpF2)$adj.r.squared
## [1] 0.8450358
Decrease in Adj R Square, keep the outlier.
# Model F, outlier candidate 3: keep only rows with percent_m <= 170.
crmF3 <- crm %>%
  filter(percent_m <= 170)
# Fit the Model F (forward-selection) formula. The previous code fitted the
# Model B formula here by mistake, so the recorded output after this chunk
# (0.8777601, identical to Model B's value) is stale and must be regenerated
# before the keep/remove decision is trusted.
gdpF3 <- lm(
  gdp ~ inequality + police_exp60 + nonwhites_per1000 + crime_rate + percent_m,
  data = crmF3
)
# Component name spelled out in full; `$adj.r.square` only resolved through
# partial matching of list names.
summary(gdpF3)$adj.r.squared
## [1] 0.8777601
Decrease in Adj R Square, keep the outlier.
None of the outlier removals improves the model, so no outliers are removed for Model F.
Since the data changed for Models A and B, run the stepwise selection again on the new A and B models.
# Backward elimination by AIC on the full model refitted to the
# outlier-filtered data.
back3 <- step(
  gdpA1,
  direction = "backward"
) #AIC 303.83
## Start: AIC=318.87
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## police_exp59 + labour_participation + m_per1000f + state_pop +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - state_pop 1 0.1 29849 316.87
## - police_exp59 1 0.7 29850 316.87
## - labour_participation 1 9.9 29859 316.88
## - percent_m 1 152.5 30002 317.09
## - is_south 1 153.2 30002 317.09
## - police_exp60 1 196.3 30046 317.16
## - m_per1000f 1 212.7 30062 317.18
## - mean_education 1 214.8 30064 317.18
## - crime_rate 1 452.1 30301 317.53
## - unemploy_m24 1 687.0 30536 317.87
## - time_prison 1 804.1 30653 318.04
## - unemploy_m39 1 1205.7 31055 318.61
## <none> 29849 318.87
## - nonwhites_per1000 1 1609.6 31459 319.18
## - prob_prison 1 1901.8 31751 319.59
## - inequality 1 14356.8 44206 334.15
##
## Step: AIC=316.87
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## police_exp59 + labour_participation + m_per1000f + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - police_exp59 1 0.7 29850 314.87
## - labour_participation 1 9.7 29859 314.88
## - is_south 1 153.2 30003 315.09
## - percent_m 1 154.2 30004 315.09
## - police_exp60 1 199.2 30049 315.16
## - mean_education 1 216.1 30066 315.19
## - m_per1000f 1 250.2 30100 315.24
## - crime_rate 1 453.1 30303 315.53
## - unemploy_m24 1 688.6 30538 315.87
## - time_prison 1 872.3 30722 316.14
## - unemploy_m39 1 1221.0 31070 316.63
## <none> 29849 316.87
## - nonwhites_per1000 1 1627.2 31477 317.20
## - prob_prison 1 1902.4 31752 317.59
## - inequality 1 14394.8 44244 332.18
##
## Step: AIC=314.87
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## labour_participation + m_per1000f + nonwhites_per1000 + unemploy_m24 +
## unemploy_m39 + inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - labour_participation 1 12.6 29863 312.89
## - percent_m 1 155.9 30006 313.10
## - is_south 1 157.4 30008 313.10
## - mean_education 1 223.0 30073 313.20
## - m_per1000f 1 250.1 30100 313.24
## - crime_rate 1 461.5 30312 313.54
## - unemploy_m24 1 711.1 30561 313.91
## - time_prison 1 1008.9 30859 314.33
## - unemploy_m39 1 1232.7 31083 314.65
## <none> 29850 314.87
## - nonwhites_per1000 1 1793.7 31644 315.44
## - police_exp60 1 2075.0 31925 315.83
## - prob_prison 1 2095.3 31945 315.85
## - inequality 1 14481.8 44332 330.27
##
## Step: AIC=312.89
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## m_per1000f + nonwhites_per1000 + unemploy_m24 + unemploy_m39 +
## inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - is_south 1 155.2 30018 311.12
## - percent_m 1 165.8 30029 311.13
## - mean_education 1 282.1 30145 311.30
## - m_per1000f 1 437.0 30300 311.53
## - crime_rate 1 453.7 30316 311.55
## - unemploy_m24 1 979.9 30843 312.31
## - time_prison 1 1022.4 30885 312.37
## - unemploy_m39 1 1283.1 31146 312.74
## <none> 29863 312.89
## - nonwhites_per1000 1 1839.9 31703 313.52
## - police_exp60 1 2064.8 31927 313.83
## - prob_prison 1 2212.1 32075 314.03
## - inequality 1 15246.2 45109 329.04
##
## Step: AIC=311.12
## gdp ~ percent_m + mean_education + police_exp60 + m_per1000f +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - percent_m 1 117.8 30136 309.29
## - mean_education 1 271.3 30289 309.51
## - m_per1000f 1 388.4 30406 309.68
## - crime_rate 1 478.6 30497 309.81
## - time_prison 1 902.9 30921 310.42
## - unemploy_m24 1 1318.9 31337 311.01
## <none> 30018 311.12
## - nonwhites_per1000 1 1689.1 31707 311.52
## - unemploy_m39 1 1692.7 31711 311.53
## - police_exp60 1 1951.1 31969 311.89
## - prob_prison 1 2289.2 32307 312.35
## - inequality 1 15306.1 45324 327.25
##
## Step: AIC=309.29
## gdp ~ mean_education + police_exp60 + m_per1000f + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - m_per1000f 1 321.4 30457 307.76
## - mean_education 1 382.6 30518 307.84
## - crime_rate 1 383.0 30519 307.84
## - time_prison 1 873.5 31009 308.55
## - unemploy_m24 1 1394.5 31530 309.28
## <none> 30136 309.29
## - unemploy_m39 1 2181.6 32317 310.36
## - nonwhites_per1000 1 2380.7 32516 310.63
## - police_exp60 1 2413.1 32549 310.68
## - prob_prison 1 2437.3 32573 310.71
## - inequality 1 15189.0 45325 325.25
##
## Step: AIC=307.76
## gdp ~ mean_education + police_exp60 + nonwhites_per1000 + unemploy_m24 +
## unemploy_m39 + inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - mean_education 1 511.9 30969 306.49
## - crime_rate 1 600.8 31058 306.61
## - time_prison 1 687.4 31145 306.74
## - unemploy_m24 1 1073.1 31530 307.28
## <none> 30457 307.76
## - unemploy_m39 1 1864.0 32321 308.37
## - police_exp60 1 2391.4 32849 309.08
## - prob_prison 1 2454.9 32912 309.17
## - nonwhites_per1000 1 2567.0 33024 309.32
## - inequality 1 14873.8 45331 323.25
##
## Step: AIC=306.49
## gdp ~ police_exp60 + nonwhites_per1000 + unemploy_m24 + unemploy_m39 +
## inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m24 1 732.3 31701 305.52
## - time_prison 1 785.9 31755 305.59
## - unemploy_m39 1 1359.3 32328 306.38
## <none> 30969 306.49
## - crime_rate 1 1690.7 32660 306.83
## - prob_prison 1 2064.8 33034 307.33
## - police_exp60 1 2327.9 33297 307.68
## - nonwhites_per1000 1 4013.0 34982 309.85
## - inequality 1 27740.9 58710 332.63
##
## Step: AIC=305.52
## gdp ~ police_exp60 + nonwhites_per1000 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m39 1 680.8 32382 304.45
## - time_prison 1 1389.3 33091 305.40
## - crime_rate 1 1444.8 33146 305.48
## <none> 31701 305.52
## - prob_prison 1 1767.1 33469 305.90
## - nonwhites_per1000 1 3778.3 35480 308.47
## - police_exp60 1 3945.1 35647 308.68
## - inequality 1 27104.2 58806 330.70
##
## Step: AIC=304.45
## gdp ~ police_exp60 + nonwhites_per1000 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - crime_rate 1 1335.0 33717 304.23
## <none> 32382 304.45
## - time_prison 1 1520.3 33902 304.47
## - prob_prison 1 1637.4 34020 304.62
## - nonwhites_per1000 1 3885.6 36268 307.44
## - police_exp60 1 4857.6 37240 308.60
## - inequality 1 26438.0 58820 328.71
##
## Step: AIC=304.23
## gdp ~ police_exp60 + nonwhites_per1000 + inequality + prob_prison +
## time_prison
##
## Df Sum of Sq RSS AIC
## - time_prison 1 1248.2 34965 303.83
## <none> 33717 304.23
## - prob_prison 1 2393.2 36110 305.25
## - nonwhites_per1000 1 3902.5 37620 307.05
## - police_exp60 1 20774.5 54492 323.35
## - inequality 1 27802.7 61520 328.69
##
## Step: AIC=303.83
## gdp ~ police_exp60 + nonwhites_per1000 + inequality + prob_prison
##
## Df Sum of Sq RSS AIC
## <none> 34965 303.83
## - nonwhites_per1000 1 2739.0 37704 305.15
## - prob_prison 1 7753.4 42719 310.64
## - police_exp60 1 19554.4 54520 321.37
## - inequality 1 27348.3 62314 327.25
# Forward selection: start from the intercept-only model (gdpN1) and add one
# term per step, never going beyond the terms of the full model (gdpA1).
forward3 <- step(
  gdpN1,
  scope = list(lower = gdpN1, upper = gdpA1),
  direction = "forward"
) # final AIC: 303.83
## Start: AIC=403.29
## gdp ~ 1
##
## Df Sum of Sq RSS AIC
## + inequality 1 314018 88091 338.49
## + police_exp59 1 256505 145604 360.60
## + police_exp60 1 251745 150365 362.01
## + mean_education 1 236636 165474 366.22
## + prob_prison 1 219031 183078 370.67
## + is_south 1 191021 211089 376.94
## + percent_m 1 183238 218871 378.53
## + nonwhites_per1000 1 156201 245908 383.66
## + crime_rate 1 78071 324039 395.79
## + labour_participation 1 52576 349534 399.13
## + state_pop 1 30449 371661 401.83
## + m_per1000f 1 24180 377930 402.56
## <none> 402110 403.29
## + unemploy_m39 1 2861 399249 404.98
## + unemploy_m24 1 1721 400389 405.10
## + time_prison 1 458 401652 405.24
##
## Step: AIC=338.49
## gdp ~ inequality
##
## Df Sum of Sq RSS AIC
## + police_exp60 1 39557 48534 314.26
## + crime_rate 1 38272 49819 315.41
## + police_exp59 1 37234 50857 316.31
## + prob_prison 1 33556 54535 319.39
## + state_pop 1 16918 71173 331.10
## + time_prison 1 8603 79488 335.96
## + percent_m 1 5608 82483 337.59
## + unemploy_m39 1 5242 82849 337.79
## <none> 88091 338.49
## + mean_education 1 3318 84774 338.80
## + labour_participation 1 1769 86322 339.59
## + m_per1000f 1 595 87496 340.19
## + unemploy_m24 1 68 88023 340.45
## + nonwhites_per1000 1 41 88051 340.47
## + is_south 1 17 88074 340.48
##
## Step: AIC=314.26
## gdp ~ inequality + police_exp60
##
## Df Sum of Sq RSS AIC
## + prob_prison 1 10830.1 37704 305.15
## + nonwhites_per1000 1 5815.6 42719 310.64
## + time_prison 1 3725.6 44809 312.74
## + crime_rate 1 3024.0 45510 313.43
## + percent_m 1 2176.3 46358 314.24
## <none> 48534 314.26
## + state_pop 1 1238.3 47296 315.12
## + mean_education 1 1028.0 47506 315.31
## + labour_participation 1 954.3 47580 315.38
## + is_south 1 799.8 47735 315.53
## + police_exp59 1 517.8 48017 315.79
## + unemploy_m39 1 470.9 48064 315.83
## + m_per1000f 1 309.8 48225 315.98
## + unemploy_m24 1 80.9 48454 316.18
##
## Step: AIC=305.15
## gdp ~ inequality + police_exp60 + prob_prison
##
## Df Sum of Sq RSS AIC
## + nonwhites_per1000 1 2738.95 34965 303.83
## <none> 37704 305.15
## + crime_rate 1 1254.38 36450 305.66
## + percent_m 1 1234.23 36470 305.68
## + mean_education 1 1114.82 36590 305.83
## + unemploy_m39 1 696.06 37008 306.33
## + police_exp59 1 501.20 37203 306.56
## + m_per1000f 1 441.35 37263 306.63
## + labour_participation 1 215.23 37489 306.89
## + unemploy_m24 1 114.91 37589 307.01
## + is_south 1 113.55 37591 307.01
## + time_prison 1 84.58 37620 307.05
## + state_pop 1 46.38 37658 307.09
##
## Step: AIC=303.83
## gdp ~ inequality + police_exp60 + prob_prison + nonwhites_per1000
##
## Df Sum of Sq RSS AIC
## <none> 34965 303.83
## + time_prison 1 1248.17 33717 304.23
## + crime_rate 1 1062.94 33902 304.47
## + unemploy_m39 1 690.97 34274 304.95
## + percent_m 1 257.61 34708 305.50
## + is_south 1 248.45 34717 305.51
## + mean_education 1 248.04 34717 305.51
## + state_pop 1 243.89 34722 305.52
## + police_exp59 1 154.10 34811 305.63
## + labour_participation 1 26.85 34939 305.80
## + m_per1000f 1 20.18 34945 305.80
## + unemploy_m24 1 16.94 34948 305.81
# Bidirectional stepwise search: start from the full model (gdpA1); at each
# step a term may be dropped or re-added, bounded by gdpN1 and gdpA1.
both3 <- step(
  gdpA1,
  scope = list(lower = gdpN1, upper = gdpA1),
  direction = "both"
) # final AIC: 303.83
## Start: AIC=318.87
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## police_exp59 + labour_participation + m_per1000f + state_pop +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - state_pop 1 0.1 29849 316.87
## - police_exp59 1 0.7 29850 316.87
## - labour_participation 1 9.9 29859 316.88
## - percent_m 1 152.5 30002 317.09
## - is_south 1 153.2 30002 317.09
## - police_exp60 1 196.3 30046 317.16
## - m_per1000f 1 212.7 30062 317.18
## - mean_education 1 214.8 30064 317.18
## - crime_rate 1 452.1 30301 317.53
## - unemploy_m24 1 687.0 30536 317.87
## - time_prison 1 804.1 30653 318.04
## - unemploy_m39 1 1205.7 31055 318.61
## <none> 29849 318.87
## - nonwhites_per1000 1 1609.6 31459 319.18
## - prob_prison 1 1901.8 31751 319.59
## - inequality 1 14356.8 44206 334.15
##
## Step: AIC=316.87
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## police_exp59 + labour_participation + m_per1000f + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - police_exp59 1 0.7 29850 314.87
## - labour_participation 1 9.7 29859 314.88
## - is_south 1 153.2 30003 315.09
## - percent_m 1 154.2 30004 315.09
## - police_exp60 1 199.2 30049 315.16
## - mean_education 1 216.1 30066 315.19
## - m_per1000f 1 250.2 30100 315.24
## - crime_rate 1 453.1 30303 315.53
## - unemploy_m24 1 688.6 30538 315.87
## - time_prison 1 872.3 30722 316.14
## - unemploy_m39 1 1221.0 31070 316.63
## <none> 29849 316.87
## - nonwhites_per1000 1 1627.2 31477 317.20
## - prob_prison 1 1902.4 31752 317.59
## + state_pop 1 0.1 29849 318.87
## - inequality 1 14394.8 44244 332.18
##
## Step: AIC=314.87
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## labour_participation + m_per1000f + nonwhites_per1000 + unemploy_m24 +
## unemploy_m39 + inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - labour_participation 1 12.6 29863 312.89
## - percent_m 1 155.9 30006 313.10
## - is_south 1 157.4 30008 313.10
## - mean_education 1 223.0 30073 313.20
## - m_per1000f 1 250.1 30100 313.24
## - crime_rate 1 461.5 30312 313.54
## - unemploy_m24 1 711.1 30561 313.91
## - time_prison 1 1008.9 30859 314.33
## - unemploy_m39 1 1232.7 31083 314.65
## <none> 29850 314.87
## - nonwhites_per1000 1 1793.7 31644 315.44
## - police_exp60 1 2075.0 31925 315.83
## - prob_prison 1 2095.3 31945 315.85
## + police_exp59 1 0.7 29849 316.87
## + state_pop 1 0.1 29850 316.87
## - inequality 1 14481.8 44332 330.27
##
## Step: AIC=312.89
## gdp ~ percent_m + is_south + mean_education + police_exp60 +
## m_per1000f + nonwhites_per1000 + unemploy_m24 + unemploy_m39 +
## inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - is_south 1 155.2 30018 311.12
## - percent_m 1 165.8 30029 311.13
## - mean_education 1 282.1 30145 311.30
## - m_per1000f 1 437.0 30300 311.53
## - crime_rate 1 453.7 30316 311.55
## - unemploy_m24 1 979.9 30843 312.31
## - time_prison 1 1022.4 30885 312.37
## - unemploy_m39 1 1283.1 31146 312.74
## <none> 29863 312.89
## - nonwhites_per1000 1 1839.9 31703 313.52
## - police_exp60 1 2064.8 31927 313.83
## - prob_prison 1 2212.1 32075 314.03
## + labour_participation 1 12.6 29850 314.87
## + police_exp59 1 3.5 29859 314.88
## + state_pop 1 0.1 29863 314.89
## - inequality 1 15246.2 45109 329.04
##
## Step: AIC=311.12
## gdp ~ percent_m + mean_education + police_exp60 + m_per1000f +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - percent_m 1 117.8 30136 309.29
## - mean_education 1 271.3 30289 309.51
## - m_per1000f 1 388.4 30406 309.68
## - crime_rate 1 478.6 30497 309.81
## - time_prison 1 902.9 30921 310.42
## - unemploy_m24 1 1318.9 31337 311.01
## <none> 30018 311.12
## - nonwhites_per1000 1 1689.1 31707 311.52
## - unemploy_m39 1 1692.7 31711 311.53
## - police_exp60 1 1951.1 31969 311.89
## - prob_prison 1 2289.2 32307 312.35
## + is_south 1 155.2 29863 312.89
## + labour_participation 1 10.4 30008 313.10
## + police_exp59 1 1.3 30017 313.11
## + state_pop 1 0.7 30017 313.11
## - inequality 1 15306.1 45324 327.25
##
## Step: AIC=309.29
## gdp ~ mean_education + police_exp60 + m_per1000f + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - m_per1000f 1 321.4 30457 307.76
## - mean_education 1 382.6 30518 307.84
## - crime_rate 1 383.0 30519 307.84
## - time_prison 1 873.5 31009 308.55
## - unemploy_m24 1 1394.5 31530 309.28
## <none> 30136 309.29
## - unemploy_m39 1 2181.6 32317 310.36
## - nonwhites_per1000 1 2380.7 32516 310.63
## - police_exp60 1 2413.1 32549 310.68
## - prob_prison 1 2437.3 32573 310.71
## + percent_m 1 117.8 30018 311.12
## + is_south 1 107.2 30029 311.13
## + police_exp59 1 4.8 30131 311.28
## + labour_participation 1 1.8 30134 311.29
## + state_pop 1 1.7 30134 311.29
## - inequality 1 15189.0 45325 325.25
##
## Step: AIC=307.76
## gdp ~ mean_education + police_exp60 + nonwhites_per1000 + unemploy_m24 +
## unemploy_m39 + inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - mean_education 1 511.9 30969 306.49
## - crime_rate 1 600.8 31058 306.61
## - time_prison 1 687.4 31145 306.74
## - unemploy_m24 1 1073.1 31530 307.28
## <none> 30457 307.76
## - unemploy_m39 1 1864.0 32321 308.37
## - police_exp60 1 2391.4 32849 309.08
## - prob_prison 1 2454.9 32912 309.17
## + m_per1000f 1 321.4 30136 309.29
## - nonwhites_per1000 1 2567.0 33024 309.32
## + is_south 1 82.2 30375 309.64
## + labour_participation 1 59.2 30398 309.67
## + state_pop 1 51.0 30406 309.68
## + percent_m 1 50.8 30406 309.68
## + police_exp59 1 28.8 30428 309.71
## - inequality 1 14873.8 45331 323.25
##
## Step: AIC=306.49
## gdp ~ police_exp60 + nonwhites_per1000 + unemploy_m24 + unemploy_m39 +
## inequality + prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m24 1 732.3 31701 305.52
## - time_prison 1 785.9 31755 305.59
## - unemploy_m39 1 1359.3 32328 306.38
## <none> 30969 306.49
## - crime_rate 1 1690.7 32660 306.83
## - prob_prison 1 2064.8 33034 307.33
## - police_exp60 1 2327.9 33297 307.68
## + mean_education 1 511.9 30457 307.76
## + m_per1000f 1 450.8 30518 307.84
## + labour_participation 1 227.3 30742 308.16
## + percent_m 1 134.5 30835 308.30
## + state_pop 1 83.5 30886 308.37
## + is_south 1 52.0 30917 308.41
## + police_exp59 1 10.3 30959 308.47
## - nonwhites_per1000 1 4013.0 34982 309.85
## - inequality 1 27740.9 58710 332.63
##
## Step: AIC=305.52
## gdp ~ police_exp60 + nonwhites_per1000 + unemploy_m39 + inequality +
## prob_prison + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - unemploy_m39 1 680.8 32382 304.45
## - time_prison 1 1389.3 33091 305.40
## - crime_rate 1 1444.8 33146 305.48
## <none> 31701 305.52
## - prob_prison 1 1767.1 33469 305.90
## + unemploy_m24 1 732.3 30969 306.49
## + is_south 1 278.3 31423 307.13
## + percent_m 1 231.2 31470 307.19
## + mean_education 1 171.2 31530 307.28
## + labour_participation 1 158.2 31543 307.30
## + m_per1000f 1 14.8 31687 307.50
## + state_pop 1 12.3 31689 307.50
## + police_exp59 1 2.4 31699 307.51
## - nonwhites_per1000 1 3778.3 35480 308.47
## - police_exp60 1 3945.1 35647 308.68
## - inequality 1 27104.2 58806 330.70
##
## Step: AIC=304.45
## gdp ~ police_exp60 + nonwhites_per1000 + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - crime_rate 1 1335.0 33717 304.23
## <none> 32382 304.45
## - time_prison 1 1520.3 33902 304.47
## - prob_prison 1 1637.4 34020 304.62
## + unemploy_m39 1 680.8 31701 305.52
## + percent_m 1 578.9 31803 305.66
## + is_south 1 344.3 32038 305.98
## + unemploy_m24 1 53.9 32328 306.38
## + m_per1000f 1 24.3 32358 306.42
## + state_pop 1 6.8 32375 306.44
## + mean_education 1 1.5 32381 306.45
## + labour_participation 1 1.2 32381 306.45
## + police_exp59 1 0.0 32382 306.45
## - nonwhites_per1000 1 3885.6 36268 307.44
## - police_exp60 1 4857.6 37240 308.60
## - inequality 1 26438.0 58820 328.71
##
## Step: AIC=304.23
## gdp ~ police_exp60 + nonwhites_per1000 + inequality + prob_prison +
## time_prison
##
## Df Sum of Sq RSS AIC
## - time_prison 1 1248.2 34965 303.83
## <none> 33717 304.23
## + crime_rate 1 1335.0 32382 304.45
## - prob_prison 1 2393.2 36110 305.25
## + unemploy_m39 1 571.0 33146 305.48
## + mean_education 1 357.2 33360 305.76
## + is_south 1 340.0 33377 305.78
## + m_per1000f 1 307.7 33410 305.83
## + percent_m 1 157.9 33559 306.02
## + unemploy_m24 1 64.6 33653 306.14
## + labour_participation 1 43.8 33673 306.17
## + police_exp59 1 7.6 33710 306.22
## + state_pop 1 0.6 33717 306.23
## - nonwhites_per1000 1 3902.5 37620 307.05
## - police_exp60 1 20774.5 54492 323.35
## - inequality 1 27802.7 61520 328.69
##
## Step: AIC=303.83
## gdp ~ police_exp60 + nonwhites_per1000 + inequality + prob_prison
##
## Df Sum of Sq RSS AIC
## <none> 34965 303.83
## + time_prison 1 1248.2 33717 304.23
## + crime_rate 1 1062.9 33902 304.47
## + unemploy_m39 1 691.0 34274 304.95
## - nonwhites_per1000 1 2739.0 37704 305.15
## + percent_m 1 257.6 34708 305.50
## + is_south 1 248.4 34717 305.51
## + mean_education 1 248.0 34717 305.52
## + state_pop 1 243.9 34722 305.52
## + police_exp59 1 154.1 34811 305.63
## + labour_participation 1 26.8 34939 305.79
## + m_per1000f 1 20.2 34945 305.80
## + unemploy_m24 1 16.9 34948 305.81
## - prob_prison 1 7753.4 42719 310.64
## - police_exp60 1 19554.4 54520 321.37
## - inequality 1 27348.3 62314 327.25
# Refit the model chosen by the stepwise searches above (New A Model) on crmAF.
gdpAF <- lm(
  gdp ~ police_exp60 + nonwhites_per1000 + inequality + prob_prison,
  data = crmAF
)
All 3 directions (backward, forward, and both) give the same AIC (303.83) and select the same variables.
# Multicollinearity check: variance inflation factor of each predictor.
car::vif(gdpAF)
## police_exp60 nonwhites_per1000 inequality prob_prison
## 2.417687 2.322902 3.311513 1.973156
# Heteroscedasticity check: studentized Breusch-Pagan test on the fitted model.
lmtest::bptest(gdpAF)
##
## studentized Breusch-Pagan test
##
## data: gdpAF
## BP = 1.8475, df = 4, p-value = 0.7638
# Normality check: Shapiro-Wilk test on the model residuals.
stats::shapiro.test(gdpAF$residuals)
##
## Shapiro-Wilk normality test
##
## data: gdpAF$residuals
## W = 0.99044, p-value = 0.9715
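The New A Model passes all 3 assumption checks: no multicollinearity (all VIF values are below 4), homoscedasticity (Breusch-Pagan p-value = 0.7638 > 0.05), and normally distributed residuals (Shapiro-Wilk p-value = 0.9715 > 0.05).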
# Backward elimination: start from gdpB6 and drop one term per step while the
# AIC keeps decreasing.
back2 <- step(gdpB6, direction = "backward") # final AIC: 313.74
## Start: AIC=316.78
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## nonwhites_per1000 + inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - labour_participation 1 574.2 34989 315.53
## - percent_m 1 956.9 35372 316.02
## <none> 34415 316.78
## - is_south 1 1675.3 36090 316.92
## - crime_rate 1 1842.5 36257 317.13
## - nonwhites_per1000 1 4983.6 39398 320.87
## - police_exp59 1 5787.7 40202 321.78
## - time_prison 1 5841.4 40256 321.84
## - inequality 1 25988.1 60403 340.10
##
## Step: AIC=315.53
## gdp ~ percent_m + is_south + police_exp59 + nonwhites_per1000 +
## inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - percent_m 1 789.9 35779 314.53
## - is_south 1 1139.2 36128 314.97
## <none> 34989 315.53
## - crime_rate 1 2475.5 37464 316.60
## - nonwhites_per1000 1 4919.0 39908 319.45
## - police_exp59 1 5470.0 40459 320.06
## - time_prison 1 5735.6 40725 320.36
## - inequality 1 26032.2 61021 338.55
##
## Step: AIC=314.53
## gdp ~ is_south + police_exp59 + nonwhites_per1000 + inequality +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - is_south 1 987.9 36767 313.76
## <none> 35779 314.53
## - crime_rate 1 1913.3 37692 314.87
## - time_prison 1 6313.9 42093 319.84
## - nonwhites_per1000 1 7531.2 43310 321.13
## - police_exp59 1 8326.0 44105 321.94
## - inequality 1 25732.1 61511 336.91
##
## Step: AIC=313.76
## gdp ~ police_exp59 + nonwhites_per1000 + inequality + time_prison +
## crime_rate
##
## Df Sum of Sq RSS AIC
## - crime_rate 1 1659.1 38426 313.74
## <none> 36767 313.76
## - time_prison 1 5685.1 42452 318.23
## - nonwhites_per1000 1 6773.6 43540 319.37
## - police_exp59 1 8921.5 45688 321.53
## - inequality 1 25480.5 62247 335.45
##
## Step: AIC=313.74
## gdp ~ police_exp59 + nonwhites_per1000 + inequality + time_prison
##
## Df Sum of Sq RSS AIC
## <none> 38426 313.74
## - time_prison 1 6535 44961 318.81
## - nonwhites_per1000 1 8385 46811 320.62
## - inequality 1 26011 64437 335.01
## - police_exp59 1 44154 82580 346.17
# Forward selection: start from the intercept-only model (gdpB6n) and add one
# term per step, never going beyond the terms of gdpB6.
forward2 <- step(
  gdpB6n,
  scope = list(lower = gdpB6n, upper = gdpB6),
  direction = "forward"
) # final AIC: 313.74
## Start: AIC=412.74
## gdp ~ 1
##
## Df Sum of Sq RSS AIC
## + inequality 1 324265 89961 346.02
## + police_exp59 1 268616 145610 367.69
## + percent_m 1 187224 227001 387.67
## + is_south 1 164529 249697 391.96
## + nonwhites_per1000 1 146374 267851 395.12
## + crime_rate 1 78901 335325 405.23
## + labour_participation 1 47336 366890 409.28
## <none> 414226 412.74
## + time_prison 1 1680 412546 414.55
##
## Step: AIC=346.02
## gdp ~ inequality
##
## Df Sum of Sq RSS AIC
## + police_exp59 1 39058 50902 322.39
## + crime_rate 1 38194 51766 323.15
## + time_prison 1 7159 82802 344.29
## + percent_m 1 5322 84639 345.28
## <none> 89961 346.02
## + labour_participation 1 1255 88706 347.39
## + is_south 1 215 89745 347.91
## + nonwhites_per1000 1 19 89942 348.01
##
## Step: AIC=322.39
## gdp ~ inequality + police_exp59
##
## Df Sum of Sq RSS AIC
## + nonwhites_per1000 1 5940.8 44961 318.81
## + time_prison 1 4091.3 46811 320.62
## + crime_rate 1 3876.2 47026 320.83
## + percent_m 1 2553.7 48348 322.08
## <none> 50902 322.39
## + labour_participation 1 1405.9 49496 323.13
## + is_south 1 800.6 50101 323.68
##
## Step: AIC=318.81
## gdp ~ inequality + police_exp59 + nonwhites_per1000
##
## Df Sum of Sq RSS AIC
## + time_prison 1 6535.5 38426 313.74
## + crime_rate 1 2509.5 42452 318.23
## <none> 44961 318.81
## + percent_m 1 444.4 44517 320.36
## + labour_participation 1 432.5 44529 320.38
## + is_south 1 162.6 44799 320.65
##
## Step: AIC=313.74
## gdp ~ inequality + police_exp59 + nonwhites_per1000 + time_prison
##
## Df Sum of Sq RSS AIC
## <none> 38426 313.74
## + crime_rate 1 1659.09 36767 313.76
## + is_south 1 733.71 37692 314.87
## + labour_participation 1 245.79 38180 315.45
## + percent_m 1 183.60 38242 315.53
# Bidirectional stepwise search: start from gdpB6; at each step a term may be
# dropped or re-added, bounded by gdpB6n and gdpB6.
both2 <- step(
  gdpB6,
  scope = list(lower = gdpB6n, upper = gdpB6),
  direction = "both"
) # final AIC: 313.74
## Start: AIC=316.78
## gdp ~ percent_m + is_south + police_exp59 + labour_participation +
## nonwhites_per1000 + inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - labour_participation 1 574.2 34989 315.53
## - percent_m 1 956.9 35372 316.02
## <none> 34415 316.78
## - is_south 1 1675.3 36090 316.92
## - crime_rate 1 1842.5 36257 317.13
## - nonwhites_per1000 1 4983.6 39398 320.87
## - police_exp59 1 5787.7 40202 321.78
## - time_prison 1 5841.4 40256 321.84
## - inequality 1 25988.1 60403 340.10
##
## Step: AIC=315.53
## gdp ~ percent_m + is_south + police_exp59 + nonwhites_per1000 +
## inequality + time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - percent_m 1 789.9 35779 314.53
## - is_south 1 1139.2 36128 314.97
## <none> 34989 315.53
## - crime_rate 1 2475.5 37464 316.60
## + labour_participation 1 574.2 34415 316.78
## - nonwhites_per1000 1 4919.0 39908 319.45
## - police_exp59 1 5470.0 40459 320.06
## - time_prison 1 5735.6 40725 320.36
## - inequality 1 26032.2 61021 338.55
##
## Step: AIC=314.53
## gdp ~ is_south + police_exp59 + nonwhites_per1000 + inequality +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - is_south 1 987.9 36767 313.76
## <none> 35779 314.53
## - crime_rate 1 1913.3 37692 314.87
## + percent_m 1 789.9 34989 315.53
## + labour_participation 1 407.2 35372 316.02
## - time_prison 1 6313.9 42093 319.84
## - nonwhites_per1000 1 7531.2 43310 321.13
## - police_exp59 1 8326.0 44105 321.94
## - inequality 1 25732.1 61511 336.91
##
## Step: AIC=313.76
## gdp ~ police_exp59 + nonwhites_per1000 + inequality + time_prison +
## crime_rate
##
## Df Sum of Sq RSS AIC
## - crime_rate 1 1659.1 38426 313.74
## <none> 36767 313.76
## + is_south 1 987.9 35779 314.53
## + percent_m 1 638.6 36128 314.97
## + labour_participation 1 18.0 36749 315.73
## - time_prison 1 5685.1 42452 318.23
## - nonwhites_per1000 1 6773.6 43540 319.37
## - police_exp59 1 8921.5 45688 321.53
## - inequality 1 25480.5 62247 335.45
##
## Step: AIC=313.74
## gdp ~ police_exp59 + nonwhites_per1000 + inequality + time_prison
##
## Df Sum of Sq RSS AIC
## <none> 38426 313.74
## + crime_rate 1 1659 36767 313.76
## + is_south 1 734 37692 314.87
## + labour_participation 1 246 38180 315.45
## + percent_m 1 184 38242 315.53
## - time_prison 1 6535 44961 318.81
## - nonwhites_per1000 1 8385 46811 320.62
## - inequality 1 26011 64437 335.01
## - police_exp59 1 44154 82580 346.17
# Refit the model chosen by the stepwise searches above (New B Model) on crmB6.
gdpBB <- lm(
  gdp ~ police_exp59 + nonwhites_per1000 + inequality + time_prison,
  data = crmB6
)
All 3 directions (backward, forward, and both) give the same AIC (313.74) and select the same variables.
Removing the outliers from the New B Model changes the result of the stepwise regression. The stepwise result now gives a better AIC (313.74) and a better Adj R Square (0.8979581).
# Multicollinearity check: variance inflation factor of each predictor.
car::vif(gdpBB)
## police_exp59 nonwhites_per1000 inequality time_prison
## 2.006182 2.253062 3.525548 1.104807
# Heteroscedasticity check: studentized Breusch-Pagan test on the fitted model.
lmtest::bptest(gdpBB)
##
## studentized Breusch-Pagan test
##
## data: gdpBB
## BP = 3.9826, df = 4, p-value = 0.4084
# Normality check: Shapiro-Wilk test on the model residuals.
stats::shapiro.test(gdpBB$residuals)
##
## Shapiro-Wilk normality test
##
## data: gdpBB$residuals
## W = 0.95694, p-value = 0.09334
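The New B Model passes all 3 assumption checks: no multicollinearity (all VIF values are below 4), homoscedasticity (Breusch-Pagan p-value = 0.4084 > 0.05), and normally distributed residuals (Shapiro-Wilk p-value = 0.09334 > 0.05).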
# Coefficient table and fit statistics (R-squared, F-statistic) for the New A Model.
summary(gdpAF)
##
## Call:
## lm(formula = gdp ~ police_exp60 + nonwhites_per1000 + inequality +
## prob_prison, data = crmAF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.088 -16.432 0.997 20.392 67.007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 702.23274 55.90397 12.561 2.77e-15 ***
## police_exp60 1.22349 0.26198 4.670 3.52e-05 ***
## nonwhites_per1000 -0.11567 0.06618 -1.748 0.08836 .
## inequality -1.14714 0.20770 -5.523 2.38e-06 ***
## prob_prison -968.35171 329.28581 -2.941 0.00548 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.94 on 39 degrees of freedom
## Multiple R-squared: 0.913, Adjusted R-squared: 0.9041
## F-statistic: 102.4 on 4 and 39 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics (R-squared, F-statistic) for the New B Model.
summary(gdpBB)
##
## Call:
## lm(formula = gdp ~ police_exp59 + nonwhites_per1000 + inequality +
## time_prison, data = crmB6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.13 -11.74 4.94 17.52 81.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 580.97746 57.10407 10.174 1.17e-12 ***
## police_exp59 1.73117 0.25535 6.780 3.81e-08 ***
## nonwhites_per1000 -0.19836 0.06714 -2.954 0.00523 **
## inequality -1.14629 0.22029 -5.204 6.18e-06 ***
## time_prison 1.95530 0.74965 2.608 0.01273 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.99 on 40 degrees of freedom
## Multiple R-squared: 0.9072, Adjusted R-squared: 0.898
## F-statistic: 97.8 on 4 and 40 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics (R-squared, F-statistic) for Model F.
summary(gdpF)
##
## Call:
## lm(formula = gdp ~ inequality + police_exp60 + nonwhites_per1000 +
## crime_rate + percent_m, data = crm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.011 -18.410 6.833 19.612 54.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 834.38597 104.92045 7.953 7.71e-10 ***
## inequality -1.39397 0.24656 -5.654 1.34e-06 ***
## police_exp60 0.71623 0.37868 1.891 0.0657 .
## nonwhites_per1000 -0.07977 0.07741 -1.031 0.3088
## crime_rate 0.04445 0.02071 2.146 0.0378 *
## percent_m -0.94976 0.58559 -1.622 0.1125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.14 on 41 degrees of freedom
## Multiple R-squared: 0.8948, Adjusted R-squared: 0.882
## F-statistic: 69.78 on 5 and 41 DF, p-value: < 2.2e-16
New A Model = GDP ~ inequality + nonwhites_per1000 + prob_prison + police_exp60
AIC = 303.83*
Adj R Square = 0.9041266*
RMSE = 34.9665153
New B Model = GDP ~ inequality + nonwhites_per1000 + time_prison + police_exp59
AIC = 313.74
Adj R Square = 0.8979581
RMSE = 34.7180657
Model F = GDP ~ inequality + nonwhites_per1000 + crime_rate + police_exp60 + percent_m
AIC = 334.66
Adj R Square = 0.8820182
RMSE = 30.9554492*
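From the summary above, we can conclude that the New A Model gives the least information loss (smallest AIC) and the biggest Adj R Square. However, Model F gives the smallest error (RMSE). Note that the three models are fitted on different numbers of observations (44, 45 and 47 rows respectively, per the residual degrees of freedom above), so their AIC values are not strictly comparable.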
All 3 models include the inequality variable (inequality) and the number of non-white residents per 1000 people (nonwhites_per1000).
The differences between the 3 models are:
Prison-related variable: each model uses a different prison/crime-related variable. All 3 variables give a similar insight: how severe crime is in the population.
Model A: Probability of Imprisonment(prob_prison)
Model B: Average time served in prison (time_prison)
Model F: Crime Rate (crime_rate)
Police Expenditures: The only difference between the 2 variables is the year of the expenditures
Model A: Police Expenditures in 1960 (police_exp60)
Model B: Police Expenditures in 1959 (police_exp59)
Model F: Police Expenditures in 1960 (police_exp60)
Only Model F uses the percentage of males aged 14-24 (percent_m).
New A Model = GDP ~ inequality + nonwhites_per1000 + prob_prison + police_exp60
# Draw the standard diagnostic plots for the fitted New A Model (an lm object).
plot(gdpAF)