library(dplyr) #for data cleaning
library(GGally) #For checking corrlation
library(MLmetrics) #For model evaluation# read data, and deselect x column because its just number column for index
crime <- read.csv("crime.csv") %>% select(-X)
# rename columns to make it more descriptve
names(crime) <- c("percent_m", "is_south", "mean_education", "police_exp60", "police_exp59", "labour_participation", "m_per1000f", "state_pop", "nonwhites_per1000", "unemploy_m24", "unemploy_m39", "gdp", "inequality", "prob_prison", "time_prison", "crime_rate")
head(crime)glimpse(crime)#> Rows: 47
#> Columns: 16
#> $ percent_m <int> 151, 143, 142, 136, 141, 121, 127, 131, 157, 140,…
#> $ is_south <int> 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0…
#> $ mean_education <int> 91, 113, 89, 121, 121, 110, 111, 109, 90, 118, 10…
#> $ police_exp60 <int> 58, 103, 45, 149, 109, 118, 82, 115, 65, 71, 121,…
#> $ police_exp59 <int> 56, 95, 44, 141, 101, 115, 79, 109, 62, 68, 116, …
#> $ labour_participation <int> 510, 583, 533, 577, 591, 547, 519, 542, 553, 632,…
#> $ m_per1000f <int> 950, 1012, 969, 994, 985, 964, 982, 969, 955, 102…
#> $ state_pop <int> 33, 13, 18, 157, 18, 25, 4, 50, 39, 7, 101, 47, 2…
#> $ nonwhites_per1000 <int> 301, 102, 219, 80, 30, 44, 139, 179, 286, 15, 106…
#> $ unemploy_m24 <int> 108, 96, 94, 102, 91, 84, 97, 79, 81, 100, 77, 83…
#> $ unemploy_m39 <int> 41, 36, 33, 39, 20, 29, 38, 35, 28, 24, 35, 31, 2…
#> $ gdp <int> 394, 557, 318, 673, 578, 689, 620, 472, 421, 526,…
#> $ inequality <int> 261, 194, 250, 167, 174, 126, 168, 206, 239, 174,…
#> $ prob_prison <dbl> 0.084602, 0.029599, 0.083401, 0.015801, 0.041399,…
#> $ time_prison <dbl> 26.2011, 25.2999, 24.3006, 29.9012, 21.2998, 20.9…
#> $ crime_rate <int> 791, 1635, 578, 1969, 1234, 682, 963, 1555, 856, …
Column description: - percent_m: percentage of males aged 14-24 - is_south: whether it is in a Southern state. 1 for Yes, 0 for No. - mean_education: mean years of schooling - police_exp60: police expenditure in 1960 - police_exp59: police expenditure in 1959 - labour_participation: labour force participation rate - m_per1000f: number of males per 1000 females - state_pop: state population - nonwhites_per1000: number of non-whites resident per 1000 people - unemploy_m24: unemployment rate of urban males aged 14-24 - unemploy_m39: unemployment rate of urban males aged 35-39 - gdp: gross domestic product per head - inequality: income inequality - prob_prison: probability of imprisonment - time_prison: average time served in prisons - crime_rate: crime rate in an unspecified category
# change data type
crime <- crime %>%
mutate(is_south = as.factor(is_south))Suppose we want to create a regression model to predict inequality based on other variables. Let’s check the correlation between the variables.
Tips to make ggcorr() neater:
# cek korelasi
ggcorr(crime, hjust = 1, layout.exp = 3, label = TRUE)
Insight: Based on correlation, ‘gdp’ is the most potential to predict
‘inequality’
Let’s make a simple linear regression model to predict ‘inequality’ value based on one most potential variable (gdp)
model_ineq <- lm(formula = inequality ~ gdp, data = crime)
summary(model_ineq)#>
#> Call:
#> lm(formula = inequality ~ gdp, data = crime)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -42.029 -11.754 1.714 12.438 30.006
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 386.03058 15.38651 25.09 <2e-16 ***
#> gdp -0.36551 0.02881 -12.69 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 18.86 on 45 degrees of freedom
#> Multiple R-squared: 0.7815, Adjusted R-squared: 0.7766
#> F-statistic: 160.9 on 1 and 45 DF, p-value: < 2.2e-16
Model interpretation:
From the coefficients: Based on the intercept, if GDP is 0, inequality is already at 386. For every increase in GDP by 1 unit, inequality will decrease by 0.36.
From the significance: GDP is a significant predictor for predicting the target.
From the R-squared: 0.78, which is close to 1. However, additional variables are needed to better explain the information/variation of the target.
Let’s make a multiple linear regression to predict ‘inequality’ using all variable
model_ineq_all <- lm(inequality~., data = crime)
summary(model_ineq_all)#>
#> Call:
#> lm(formula = inequality ~ ., data = crime)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -22.2069 -8.3637 0.5959 9.5326 28.0975
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 330.041730 120.913748 2.730 0.01036 *
#> percent_m -0.383670 0.299996 -1.279 0.21042
#> is_south1 18.794908 9.696399 1.938 0.06173 .
#> mean_education -1.067204 0.448779 -2.378 0.02375 *
#> police_exp60 -0.374454 0.767441 -0.488 0.62904
#> police_exp59 -0.010789 0.822096 -0.013 0.98961
#> labour_participation 0.114750 0.099665 1.151 0.25839
#> m_per1000f 0.079852 0.141404 0.565 0.57634
#> state_pop 0.132739 0.086233 1.539 0.13388
#> nonwhites_per1000 0.002937 0.045031 0.065 0.94842
#> unemploy_m24 0.100185 0.298897 0.335 0.73975
#> unemploy_m39 -0.303733 0.602692 -0.504 0.61785
#> gdp -0.239312 0.058431 -4.096 0.00028 ***
#> prob_prison 51.196445 167.739663 0.305 0.76224
#> time_prison 0.064390 0.496290 0.130 0.89761
#> crime_rate 0.033667 0.010822 3.111 0.00398 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 14.43 on 31 degrees of freedom
#> Multiple R-squared: 0.9118, Adjusted R-squared: 0.8692
#> F-statistic: 21.38 on 15 and 31 DF, p-value: 2.959e-12
Interpretation:
Let’s compare R-squared model_ineq with model_ineq_all
summary(model_ineq)$r.squared#> [1] 0.7814512
summary(model_ineq_all)$adj.r.squared#> [1] 0.8691871
Let’s compare them using RMSE
# make new objects for the prediction result
pred_ineq <- predict(object = model_ineq, newdata = crime) #object for the result of model prediction using one predictor
pred_ineq_all <- predict(object = model_ineq_all, newdata = crime) #object for the result of model prediction using all predictors# calculate RMSE (using object pred_ineq and pred_ineq_all as y_pred)
RMSE(y_pred = pred_ineq, y_true = crime$inequality) # 1 predictor#> [1] 18.45161
RMSE(y_pred = pred_ineq_all, y_true = crime$inequality) # all predictors#> [1] 11.71891
range(crime$inequality)#> [1] 126 276
Conclusion: model_ineq_all is a better model. It’s because it uses more feature to predict values
Step-wise regression helps us select good predictors by searching for the best model combination based on the Akaike Information Criterion (AIC) value. The AIC represents the amount of information lost in the model, or information loss. Therefore, a good regression model is one that has a small AIC value.
There are three approaches to step-wise regression:
Backward Elimination: The model starts with all predictors included and iteratively removes one predictor at a time based on the highest p-value until a desired criterion (such as a specific significance level) is met.
Forward Selection: The model starts with no predictors and iteratively adds one predictor at a time based on the lowest p-value until a desired criterion is met.
Both: This approach combines backward elimination and forward selection. It starts with no predictors, then iteratively adds or removes predictors based on specific criteria.
The goal of step-wise regression is to find the best combination of predictors that results in the model with the lowest AIC value, indicating a good balance between model fit and complexity.
Process of backward elimination
Build an initial model using all available predictors.
Calculate the AIC for the model.
For each predictor, remove the predictor from the model and recalculate the AIC for the modified model.
Select the predictor that, when removed, results in the lowest AIC. Remove that predictor from the model.
Repeat steps 3 and 4, considering the modified model without the previously selected predictor.
Continue steps 3 and 4 until removing a predictor leads to an increase in AIC (higher than before).
The final model will be the one with the lowest AIC.
# first step: create model with all predictors
# we have made it before
summary(model_ineq_all)$call#> lm(formula = inequality ~ ., data = crime)
# stepwise regression: backward elimination
model_backward <- step(object=model_ineq_all,
direction="backward",
trace = F)# summary model
summary(model_backward)#>
#> Call:
#> lm(formula = inequality ~ percent_m + is_south + mean_education +
#> police_exp60 + labour_participation + state_pop + gdp + crime_rate,
#> data = crime)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -23.7645 -7.3648 -0.2304 9.5643 30.5817
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 383.751874 55.466625 6.919 3.17e-08 ***
#> percent_m -0.322926 0.236604 -1.365 0.180332
#> is_south1 18.885387 6.700763 2.818 0.007621 **
#> mean_education -0.931265 0.338854 -2.748 0.009114 **
#> police_exp60 -0.378395 0.155792 -2.429 0.019987 *
#> labour_participation 0.137360 0.064315 2.136 0.039206 *
#> state_pop 0.101107 0.066067 1.530 0.134207
#> gdp -0.250171 0.048611 -5.146 8.40e-06 ***
#> crime_rate 0.032878 0.008102 4.058 0.000238 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 13.27 on 38 degrees of freedom
#> Multiple R-squared: 0.9086, Adjusted R-squared: 0.8894
#> F-statistic: 47.23 on 8 and 38 DF, p-value: < 2.2e-16
Forward selection process:
Start with a model without any predictors.
For each predictor, add the predictor to the model and calculate the AIC.
Select the predictor that, when added, results in the lowest AIC. Add that predictor to the model.
Repeat steps 2 and 3, considering the model with the added predictor.
Continue steps 2 and 3 until adding a predictor leads to an increase in AIC (higher than before).
The process stops when adding another predictor would result in a higher AIC value.
This forward selection process aims to iteratively add predictors to the model based on their individual AIC values, ultimately selecting the best combination of predictors that results in the lowest AIC value, indicating a good fit and predictive power of the model.
# first step
# model without predictor
model_ineq_none <- lm(inequality ~ 1, data = crime)For forward selection, we need to define a scope parameter to indicate the maximum upper limit of predictor combinations.
# stepwise regression: forward selection
model_forward <- step(object=model_ineq_none,
direction="forward",
scope=list(upper= model_ineq_all),
trace=F)# summary model
summary(model_forward)#>
#> Call:
#> lm(formula = inequality ~ gdp + crime_rate + mean_education +
#> police_exp59 + is_south + labour_participation + state_pop +
#> percent_m, data = crime)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -25.4435 -7.4800 -0.0233 9.0979 30.1273
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 382.103157 55.503952 6.884 3.53e-08 ***
#> gdp -0.252009 0.048453 -5.201 7.07e-06 ***
#> crime_rate 0.031544 0.007828 4.029 0.000259 ***
#> mean_education -0.874630 0.338545 -2.583 0.013752 *
#> police_exp59 -0.386989 0.161516 -2.396 0.021605 *
#> is_south1 19.208354 6.714136 2.861 0.006830 **
#> labour_participation 0.129018 0.065030 1.984 0.054516 .
#> state_pop 0.098028 0.065852 1.489 0.144845
#> percent_m -0.312760 0.236543 -1.322 0.194003
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 13.29 on 38 degrees of freedom
#> Multiple R-squared: 0.9083, Adjusted R-squared: 0.889
#> F-statistic: 47.05 on 8 and 38 DF, p-value: < 2.2e-16
summary(model_backward)$call#> lm(formula = inequality ~ percent_m + is_south + mean_education +
#> police_exp60 + labour_participation + state_pop + gdp + crime_rate,
#> data = crime)
summary(model_forward)$call#> lm(formula = inequality ~ gdp + crime_rate + mean_education +
#> police_exp59 + is_south + labour_participation + state_pop +
#> percent_m, data = crime)
summary(model_backward)$adj.r.squared#> [1] 0.8893899
summary(model_forward)$adj.r.squared#> [1] 0.888989
Notes: The results of forward and backward selection can differ in terms of predictor combinations, but generally, their performance is somewhat similar.
Both is a combination between backward and forward stepwise regression.
# stepwise regression: both
# it's started with a none, the upper is all predictor
model_both <- step(object= model_ineq_none,
direction= "both",
scope =list(upper= model_ineq_all),
trace=F)# summary model
summary(model_both)#>
#> Call:
#> lm(formula = inequality ~ gdp + crime_rate + mean_education +
#> police_exp59 + is_south + labour_participation + state_pop +
#> percent_m, data = crime)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -25.4435 -7.4800 -0.0233 9.0979 30.1273
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 382.103157 55.503952 6.884 3.53e-08 ***
#> gdp -0.252009 0.048453 -5.201 7.07e-06 ***
#> crime_rate 0.031544 0.007828 4.029 0.000259 ***
#> mean_education -0.874630 0.338545 -2.583 0.013752 *
#> police_exp59 -0.386989 0.161516 -2.396 0.021605 *
#> is_south1 19.208354 6.714136 2.861 0.006830 **
#> labour_participation 0.129018 0.065030 1.984 0.054516 .
#> state_pop 0.098028 0.065852 1.489 0.144845
#> percent_m -0.312760 0.236543 -1.322 0.194003
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 13.29 on 38 degrees of freedom
#> Multiple R-squared: 0.9083, Adjusted R-squared: 0.889
#> F-statistic: 47.05 on 8 and 38 DF, p-value: < 2.2e-16
summary(model_backward)$adj.r.squared#> [1] 0.8893899
summary(model_forward)$adj.r.squared#> [1] 0.888989
summary(model_both)$adj.r.squared#> [1] 0.888989
library(performance)
comparison <- compare_performance(model_ineq_none, model_ineq_all, model_backward, model_forward, model_both)
as.data.frame(comparison)