library(dplyr) #for data cleaning
library(GGally) #For checking corrlation
library(MLmetrics) #For model evaluation

Data Preparation

# read data, and deselect x column because its just number column for index
crime <- read.csv("crime.csv") %>% select(-X)

# rename columns to make it more descriptve
names(crime) <- c("percent_m", "is_south", "mean_education", "police_exp60", "police_exp59", "labour_participation", "m_per1000f", "state_pop", "nonwhites_per1000", "unemploy_m24", "unemploy_m39", "gdp", "inequality", "prob_prison", "time_prison", "crime_rate")
head(crime)

glimpse(crime)

#> Rows: 47
#> Columns: 16
#> $ percent_m            <int> 151, 143, 142, 136, 141, 121, 127, 131, 157, 140,…
#> $ is_south             <int> 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0…
#> $ mean_education       <int> 91, 113, 89, 121, 121, 110, 111, 109, 90, 118, 10…
#> $ police_exp60         <int> 58, 103, 45, 149, 109, 118, 82, 115, 65, 71, 121,…
#> $ police_exp59         <int> 56, 95, 44, 141, 101, 115, 79, 109, 62, 68, 116, …
#> $ labour_participation <int> 510, 583, 533, 577, 591, 547, 519, 542, 553, 632,…
#> $ m_per1000f           <int> 950, 1012, 969, 994, 985, 964, 982, 969, 955, 102…
#> $ state_pop            <int> 33, 13, 18, 157, 18, 25, 4, 50, 39, 7, 101, 47, 2…
#> $ nonwhites_per1000    <int> 301, 102, 219, 80, 30, 44, 139, 179, 286, 15, 106…
#> $ unemploy_m24         <int> 108, 96, 94, 102, 91, 84, 97, 79, 81, 100, 77, 83…
#> $ unemploy_m39         <int> 41, 36, 33, 39, 20, 29, 38, 35, 28, 24, 35, 31, 2…
#> $ gdp                  <int> 394, 557, 318, 673, 578, 689, 620, 472, 421, 526,…
#> $ inequality           <int> 261, 194, 250, 167, 174, 126, 168, 206, 239, 174,…
#> $ prob_prison          <dbl> 0.084602, 0.029599, 0.083401, 0.015801, 0.041399,…
#> $ time_prison          <dbl> 26.2011, 25.2999, 24.3006, 29.9012, 21.2998, 20.9…
#> $ crime_rate           <int> 791, 1635, 578, 1969, 1234, 682, 963, 1555, 856, …

Column description: - percent_m: percentage of males aged 14-24 - is_south: whether it is in a Southern state. 1 for Yes, 0 for No. - mean_education: mean years of schooling - police_exp60: police expenditure in 1960 - police_exp59: police expenditure in 1959 - labour_participation: labour force participation rate - m_per1000f: number of males per 1000 females - state_pop: state population - nonwhites_per1000: number of non-whites resident per 1000 people - unemploy_m24: unemployment rate of urban males aged 14-24 - unemploy_m39: unemployment rate of urban males aged 35-39 - gdp: gross domestic product per head - inequality: income inequality - prob_prison: probability of imprisonment - time_prison: average time served in prisons - crime_rate: crime rate in an unspecified category

Data Cleansing

# change data type
crime <- crime %>% 
  mutate(is_south = as.factor(is_south))

EDA

Suppose we want to create a regression model to predict inequality based on other variables. Let’s check the correlation between the variables.

Tips to make ggcorr() neater:

Use hjust = 1 (horizontal justify) to prevent text from touching the boxes.
Use layout.exp = 3 to widen the horizontal axis by 3 times, to prevent text from being cut off.

# cek korelasi
ggcorr(crime, hjust = 1, layout.exp = 3, label = TRUE)

Insight: Based on correlation, ‘gdp’ is the most potential to predict ‘inequality’

Simple Linear Regression

Let’s make a simple linear regression model to predict ‘inequality’ value based on one most potential variable (gdp)

model_ineq <- lm(formula = inequality ~ gdp, data = crime)
summary(model_ineq)

#> 
#> Call:
#> lm(formula = inequality ~ gdp, data = crime)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -42.029 -11.754   1.714  12.438  30.006 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 386.03058   15.38651   25.09   <2e-16 ***
#> gdp          -0.36551    0.02881  -12.69   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 18.86 on 45 degrees of freedom
#> Multiple R-squared:  0.7815, Adjusted R-squared:  0.7766 
#> F-statistic: 160.9 on 1 and 45 DF,  p-value: < 2.2e-16

Model interpretation:

From the coefficients: Based on the intercept, if GDP is 0, inequality is already at 386. For every increase in GDP by 1 unit, inequality will decrease by 0.36.

From the significance: GDP is a significant predictor for predicting the target.

From the R-squared: 0.78, which is close to 1. However, additional variables are needed to better explain the information/variation of the target.

Multiple Linear Regression

Let’s make a multiple linear regression to predict ‘inequality’ using all variable

model_ineq_all <- lm(inequality~., data = crime)
summary(model_ineq_all)

#> 
#> Call:
#> lm(formula = inequality ~ ., data = crime)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -22.2069  -8.3637   0.5959   9.5326  28.0975 
#> 
#> Coefficients:
#>                        Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)          330.041730 120.913748   2.730  0.01036 *  
#> percent_m             -0.383670   0.299996  -1.279  0.21042    
#> is_south1             18.794908   9.696399   1.938  0.06173 .  
#> mean_education        -1.067204   0.448779  -2.378  0.02375 *  
#> police_exp60          -0.374454   0.767441  -0.488  0.62904    
#> police_exp59          -0.010789   0.822096  -0.013  0.98961    
#> labour_participation   0.114750   0.099665   1.151  0.25839    
#> m_per1000f             0.079852   0.141404   0.565  0.57634    
#> state_pop              0.132739   0.086233   1.539  0.13388    
#> nonwhites_per1000      0.002937   0.045031   0.065  0.94842    
#> unemploy_m24           0.100185   0.298897   0.335  0.73975    
#> unemploy_m39          -0.303733   0.602692  -0.504  0.61785    
#> gdp                   -0.239312   0.058431  -4.096  0.00028 ***
#> prob_prison           51.196445 167.739663   0.305  0.76224    
#> time_prison            0.064390   0.496290   0.130  0.89761    
#> crime_rate             0.033667   0.010822   3.111  0.00398 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 14.43 on 31 degrees of freedom
#> Multiple R-squared:  0.9118, Adjusted R-squared:  0.8692 
#> F-statistic: 21.38 on 15 and 31 DF,  p-value: 2.959e-12

Interpretation:

Significant predictors are GDP, crime_rate, and mean_education.
The R-squared value of the model with all predictors explains the information/variation of the target better.
Interpretation of coefficient: For example, for the predictor “is_south1”

Model Evaluation

Let’s compare R-squared model_ineq with model_ineq_all

summary(model_ineq)$r.squared

#> [1] 0.7814512

summary(model_ineq_all)$adj.r.squared

#> [1] 0.8691871

Let’s compare them using RMSE

# make new objects for the prediction result
pred_ineq <-  predict(object = model_ineq, newdata = crime) #object for the result of model prediction using one predictor
pred_ineq_all <- predict(object = model_ineq_all, newdata = crime) #object for the result of model prediction using all predictors

# calculate RMSE (using object pred_ineq and pred_ineq_all as y_pred)
RMSE(y_pred = pred_ineq, y_true = crime$inequality) # 1 predictor

#> [1] 18.45161

RMSE(y_pred = pred_ineq_all, y_true = crime$inequality) # all predictors

#> [1] 11.71891

range(crime$inequality)

#> [1] 126 276

Conclusion: model_ineq_all is a better model. It’s because it uses more feature to predict values

Step-wise Regression Model

Step-wise regression helps us select good predictors by searching for the best model combination based on the Akaike Information Criterion (AIC) value. The AIC represents the amount of information lost in the model, or information loss. Therefore, a good regression model is one that has a small AIC value.

There are three approaches to step-wise regression:

Backward Elimination: The model starts with all predictors included and iteratively removes one predictor at a time based on the highest p-value until a desired criterion (such as a specific significance level) is met.
Forward Selection: The model starts with no predictors and iteratively adds one predictor at a time based on the lowest p-value until a desired criterion is met.
Both: This approach combines backward elimination and forward selection. It starts with no predictors, then iteratively adds or removes predictors based on specific criteria.

The goal of step-wise regression is to find the best combination of predictors that results in the model with the lowest AIC value, indicating a good balance between model fit and complexity.

Backward Elimination

Process of backward elimination

Build an initial model using all available predictors.
Calculate the AIC for the model.
For each predictor, remove the predictor from the model and recalculate the AIC for the modified model.
Select the predictor that, when removed, results in the lowest AIC. Remove that predictor from the model.
Repeat steps 3 and 4, considering the modified model without the previously selected predictor.
Continue steps 3 and 4 until removing a predictor leads to an increase in AIC (higher than before).
The final model will be the one with the lowest AIC.

# first step: create model with all predictors
# we have made it before
summary(model_ineq_all)$call

#> lm(formula = inequality ~ ., data = crime)

# stepwise regression: backward elimination
model_backward <- step(object=model_ineq_all,
                       direction="backward", 
                       trace = F)

# summary model
summary(model_backward)

#> 
#> Call:
#> lm(formula = inequality ~ percent_m + is_south + mean_education + 
#>     police_exp60 + labour_participation + state_pop + gdp + crime_rate, 
#>     data = crime)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -23.7645  -7.3648  -0.2304   9.5643  30.5817 
#> 
#> Coefficients:
#>                        Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)          383.751874  55.466625   6.919 3.17e-08 ***
#> percent_m             -0.322926   0.236604  -1.365 0.180332    
#> is_south1             18.885387   6.700763   2.818 0.007621 ** 
#> mean_education        -0.931265   0.338854  -2.748 0.009114 ** 
#> police_exp60          -0.378395   0.155792  -2.429 0.019987 *  
#> labour_participation   0.137360   0.064315   2.136 0.039206 *  
#> state_pop              0.101107   0.066067   1.530 0.134207    
#> gdp                   -0.250171   0.048611  -5.146 8.40e-06 ***
#> crime_rate             0.032878   0.008102   4.058 0.000238 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 13.27 on 38 degrees of freedom
#> Multiple R-squared:  0.9086, Adjusted R-squared:  0.8894 
#> F-statistic: 47.23 on 8 and 38 DF,  p-value: < 2.2e-16

Forward Selection

Forward selection process:

Start with a model without any predictors.
For each predictor, add the predictor to the model and calculate the AIC.
Select the predictor that, when added, results in the lowest AIC. Add that predictor to the model.
Repeat steps 2 and 3, considering the model with the added predictor.
Continue steps 2 and 3 until adding a predictor leads to an increase in AIC (higher than before).

The process stops when adding another predictor would result in a higher AIC value.

This forward selection process aims to iteratively add predictors to the model based on their individual AIC values, ultimately selecting the best combination of predictors that results in the lowest AIC value, indicating a good fit and predictive power of the model.

# first step
# model without predictor
model_ineq_none <- lm(inequality ~ 1, data = crime)

For forward selection, we need to define a scope parameter to indicate the maximum upper limit of predictor combinations.

# stepwise regression: forward selection

model_forward <- step(object=model_ineq_none,
                      direction="forward",
                      scope=list(upper= model_ineq_all), 
                      trace=F)

# summary model
summary(model_forward)

#> 
#> Call:
#> lm(formula = inequality ~ gdp + crime_rate + mean_education + 
#>     police_exp59 + is_south + labour_participation + state_pop + 
#>     percent_m, data = crime)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -25.4435  -7.4800  -0.0233   9.0979  30.1273 
#> 
#> Coefficients:
#>                        Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)          382.103157  55.503952   6.884 3.53e-08 ***
#> gdp                   -0.252009   0.048453  -5.201 7.07e-06 ***
#> crime_rate             0.031544   0.007828   4.029 0.000259 ***
#> mean_education        -0.874630   0.338545  -2.583 0.013752 *  
#> police_exp59          -0.386989   0.161516  -2.396 0.021605 *  
#> is_south1             19.208354   6.714136   2.861 0.006830 ** 
#> labour_participation   0.129018   0.065030   1.984 0.054516 .  
#> state_pop              0.098028   0.065852   1.489 0.144845    
#> percent_m             -0.312760   0.236543  -1.322 0.194003    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 13.29 on 38 degrees of freedom
#> Multiple R-squared:  0.9083, Adjusted R-squared:  0.889 
#> F-statistic: 47.05 on 8 and 38 DF,  p-value: < 2.2e-16

summary(model_backward)$call

#> lm(formula = inequality ~ percent_m + is_south + mean_education + 
#>     police_exp60 + labour_participation + state_pop + gdp + crime_rate, 
#>     data = crime)

summary(model_forward)$call

#> lm(formula = inequality ~ gdp + crime_rate + mean_education + 
#>     police_exp59 + is_south + labour_participation + state_pop + 
#>     percent_m, data = crime)

summary(model_backward)$adj.r.squared

#> [1] 0.8893899

summary(model_forward)$adj.r.squared

#> [1] 0.888989

Notes: The results of forward and backward selection can differ in terms of predictor combinations, but generally, their performance is somewhat similar.

Both

Both is a combination between backward and forward stepwise regression.

Create a baseline model (can be all predictors or none).
For each predictor, try adding or removing it from the model, and calculate the AIC.
Add or remove one variable that results in the smallest AIC.
Repeat steps 2 and 3.
The process stops if adding or removing a variable leads to a higher AIC.

# stepwise regression: both
# it's started with a none, the upper is all predictor

model_both <- step(object= model_ineq_none,
                   direction= "both",
                   scope =list(upper= model_ineq_all),
                   trace=F)

# summary model
summary(model_both)

#> 
#> Call:
#> lm(formula = inequality ~ gdp + crime_rate + mean_education + 
#>     police_exp59 + is_south + labour_participation + state_pop + 
#>     percent_m, data = crime)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -25.4435  -7.4800  -0.0233   9.0979  30.1273 
#> 
#> Coefficients:
#>                        Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)          382.103157  55.503952   6.884 3.53e-08 ***
#> gdp                   -0.252009   0.048453  -5.201 7.07e-06 ***
#> crime_rate             0.031544   0.007828   4.029 0.000259 ***
#> mean_education        -0.874630   0.338545  -2.583 0.013752 *  
#> police_exp59          -0.386989   0.161516  -2.396 0.021605 *  
#> is_south1             19.208354   6.714136   2.861 0.006830 ** 
#> labour_participation   0.129018   0.065030   1.984 0.054516 .  
#> state_pop              0.098028   0.065852   1.489 0.144845    
#> percent_m             -0.312760   0.236543  -1.322 0.194003    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 13.29 on 38 degrees of freedom
#> Multiple R-squared:  0.9083, Adjusted R-squared:  0.889 
#> F-statistic: 47.05 on 8 and 38 DF,  p-value: < 2.2e-16

Comparison

summary(model_backward)$adj.r.squared

#> [1] 0.8893899

summary(model_forward)$adj.r.squared

#> [1] 0.888989

summary(model_both)$adj.r.squared

#> [1] 0.888989

library(performance)
comparison <- compare_performance(model_ineq_none, model_ineq_all, model_backward, model_forward, model_both)

as.data.frame(comparison)

Step-wise Linear Regression on Crime Data

Faisal Amir Maz

June 25, 2023