crime <- read.csv("data_input/crime.csv")
head(crime)
## percent_m is_south mean_education police_exp60 police_exp59
## 1 151 1 91 58 56
## 2 143 0 113 103 95
## 3 142 1 89 45 44
## 4 136 0 121 149 141
## 5 141 0 121 109 101
## 6 121 0 110 118 115
## labour_participation m_per1000f state_pop nonwhites_per1000 unemploy_m24
## 1 510 950 33 301 108
## 2 583 1012 13 102 96
## 3 533 969 18 219 94
## 4 577 994 157 80 102
## 5 591 985 18 30 91
## 6 547 964 25 44 84
## unemploy_m39 gdp inequality prob_prison time_prison crime_rate
## 1 41 394 261 0.084602 26.2011 791
## 2 36 557 194 0.029599 25.2999 1635
## 3 33 318 250 0.083401 24.3006 578
## 4 39 673 167 0.015801 29.9012 1969
## 5 20 578 174 0.041399 21.2998 1234
## 6 29 689 126 0.034201 20.9995 682
names(crime) <- c("percent_m", "is_south", "mean_education", "police_exp60", "police_exp59", "labour_participation", "m_per1000f", "state_pop", "nonwhites_per1000", "unemploy_m24", "unemploy_m39", "gdp", "inequality", "prob_prison", "time_prison", "crime_rate")
head(crime)
## percent_m is_south mean_education police_exp60 police_exp59
## 1 151 1 91 58 56
## 2 143 0 113 103 95
## 3 142 1 89 45 44
## 4 136 0 121 149 141
## 5 141 0 121 109 101
## 6 121 0 110 118 115
## labour_participation m_per1000f state_pop nonwhites_per1000 unemploy_m24
## 1 510 950 33 301 108
## 2 583 1012 13 102 96
## 3 533 969 18 219 94
## 4 577 994 157 80 102
## 5 591 985 18 30 91
## 6 547 964 25 44 84
## unemploy_m39 gdp inequality prob_prison time_prison crime_rate
## 1 41 394 261 0.084602 26.2011 791
## 2 36 557 194 0.029599 25.2999 1635
## 3 33 318 250 0.083401 24.3006 578
## 4 39 673 167 0.015801 29.9012 1969
## 5 20 578 174 0.041399 21.2998 1234
## 6 29 689 126 0.034201 20.9995 682
percent_m: percentage of males aged 14-24is_south: whether it is in a Southern state. 1 for Yes, 0 for No.mean_education: mean years of schoolingpolice_exp60: police expenditure in 1960police_exp59: police expenditure in 1959labour_participation: labour force participation ratem_per1000f: number of males per 1000 femalesstate_pop: state populationnonwhites_per1000: number of non-whites resident per 1000 peopleunemploy_m24: unemployment rate of urban males aged 14-24unemploy_m39: unemployment rate of urban males aged 35-39gdp: gross domestic product per headinequality: income inequalityprob_prison: probability of imprisonmenttime_prison: avg time served in prisonscrime_rate: crime rate in an unspecified category# plot correlation matrix
library(GGally)
ggcorr(crime,label = T, label_size = 3)
Dive Deeper:
Katakanlah kita ingin memprediksi tingkat pendidikan dari kota tersebut, berdasarkan korelasinya, variabel yang paling tepat digunakan adalah vriabel “inequality”
plot(crime$mean_education, crime$inequality)
Berdasarkan tabel di atas dapat dilihat semakin tinggi Inequality suatu kota maka semakin kurang baik pula tingkat pendidikan orang-orang di daerah tersebut
model_edu <- lm(mean_education ~ inequality, crime)
summary(model_edu)
##
## Call:
## lm(formula = mean_education ~ inequality, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.475 -5.798 -1.673 6.404 11.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 147.45198 5.29341 27.856 < 2e-16 ***
## inequality -0.21553 0.02674 -8.061 2.81e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.235 on 45 degrees of freedom
## Multiple R-squared: 0.5908, Adjusted R-squared: 0.5817
## F-statistic: 64.98 on 1 and 45 DF, p-value: 2.81e-10
Berdasarkan model tersebut dapat disimpulkan bahwa memang inequlity sangat berpengaruh terhadap tingkat pendidikan di buktika dengan P-value yang jauh lebih kecil dari than 0.05 yaitu sebesar: 0.000000000281
Jika inequality suatu daerah 600 maka dapat diperkirakan tingkat pendidikan sebesar: 112.0054 Didpat dar menggunakan metode prediksi berdasarkan model_edu di atas
new_data <- data.frame("inequality" = 600)
predict(model_edu, new_data)
## 1
## 18.13132
Namun berdasarkan summary dari model_edu dapat dilihat bahwa model tidak lah begitu fit melihat multiple R-Squared yang di bawah 0.78. Oleh karena itu diperlukan tambahan prediktor. Untuk mendapatkan prediktor terbaik kita pakai metode feature selection
Prediksi inequality dengan seluruh variabel yang ada:
model_all <- lm(mean_education~.,crime)
model_backward <- step(
object = model_all,
direction ="backward"
)
## Start: AIC=169.4
## mean_education ~ percent_m + is_south + police_exp60 + police_exp59 +
## labour_participation + m_per1000f + state_pop + nonwhites_per1000 +
## unemploy_m24 + unemploy_m39 + gdp + inequality + prob_prison +
## time_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - m_per1000f 1 0.048 874.38 167.40
## - time_prison 1 0.111 874.44 167.40
## - state_pop 1 1.136 875.46 167.46
## - gdp 1 2.103 876.43 167.51
## - is_south 1 2.597 876.92 167.53
## - nonwhites_per1000 1 34.329 908.66 169.21
## - prob_prison 1 36.365 910.69 169.31
## <none> 874.33 169.40
## - percent_m 1 47.554 921.88 169.88
## - police_exp59 1 55.337 929.66 170.28
## - police_exp60 1 76.596 950.92 171.34
## - unemploy_m24 1 87.980 962.31 171.90
## - labour_participation 1 100.875 975.20 172.53
## - inequality 1 159.493 1033.82 175.27
## - unemploy_m39 1 183.797 1058.12 176.36
## - crime_rate 1 259.481 1133.81 179.61
##
## Step: AIC=167.4
## mean_education ~ percent_m + is_south + police_exp60 + police_exp59 +
## labour_participation + state_pop + nonwhites_per1000 + unemploy_m24 +
## unemploy_m39 + gdp + inequality + prob_prison + time_prison +
## crime_rate
##
## Df Sum of Sq RSS AIC
## - time_prison 1 0.092 874.47 165.40
## - state_pop 1 1.121 875.50 165.46
## - gdp 1 2.170 876.55 165.51
## - is_south 1 2.773 877.15 165.55
## - prob_prison 1 36.393 910.77 167.31
## - nonwhites_per1000 1 36.480 910.86 167.32
## <none> 874.38 167.40
## - percent_m 1 49.767 924.14 168.00
## - police_exp59 1 55.300 929.68 168.28
## - police_exp60 1 76.640 951.02 169.35
## - unemploy_m24 1 133.885 1008.26 172.09
## - labour_participation 1 145.597 1019.97 172.64
## - inequality 1 160.728 1035.10 173.33
## - unemploy_m39 1 196.724 1071.10 174.94
## - crime_rate 1 269.159 1143.53 178.01
##
## Step: AIC=165.4
## mean_education ~ percent_m + is_south + police_exp60 + police_exp59 +
## labour_participation + state_pop + nonwhites_per1000 + unemploy_m24 +
## unemploy_m39 + gdp + inequality + prob_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - state_pop 1 1.475 875.94 163.48
## - gdp 1 2.306 876.77 163.53
## - is_south 1 2.683 877.15 163.55
## <none> 874.47 165.40
## - nonwhites_per1000 1 40.167 914.63 165.51
## - percent_m 1 50.236 924.70 166.03
## - prob_prison 1 51.221 925.69 166.08
## - police_exp59 1 62.216 936.68 166.63
## - police_exp60 1 83.394 957.86 167.68
## - unemploy_m24 1 145.112 1019.58 170.62
## - labour_participation 1 150.344 1024.81 170.86
## - inequality 1 160.702 1035.17 171.33
## - unemploy_m39 1 201.794 1076.26 173.16
## - crime_rate 1 272.130 1146.60 176.14
##
## Step: AIC=163.48
## mean_education ~ percent_m + is_south + police_exp60 + police_exp59 +
## labour_participation + nonwhites_per1000 + unemploy_m24 +
## unemploy_m39 + gdp + inequality + prob_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - is_south 1 2.283 878.23 161.60
## - gdp 1 2.826 878.77 161.63
## <none> 875.94 163.48
## - nonwhites_per1000 1 38.697 914.64 163.51
## - prob_prison 1 50.347 926.29 164.11
## - percent_m 1 52.926 928.87 164.24
## - police_exp59 1 61.058 937.00 164.65
## - police_exp60 1 82.067 958.01 165.69
## - unemploy_m24 1 144.263 1020.21 168.65
## - labour_participation 1 148.978 1024.92 168.86
## - inequality 1 164.438 1040.38 169.57
## - unemploy_m39 1 201.063 1077.01 171.19
## - crime_rate 1 277.424 1153.37 174.41
##
## Step: AIC=161.6
## mean_education ~ percent_m + police_exp60 + police_exp59 + labour_participation +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + gdp + inequality +
## prob_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## - gdp 1 4.288 882.51 159.83
## - nonwhites_per1000 1 37.001 915.23 161.54
## <none> 878.23 161.60
## - percent_m 1 51.092 929.32 162.26
## - police_exp59 1 61.268 939.49 162.77
## - prob_prison 1 65.849 944.07 163.00
## - police_exp60 1 83.700 961.93 163.88
## - unemploy_m24 1 147.166 1025.39 166.89
## - labour_participation 1 166.954 1045.18 167.78
## - inequality 1 171.893 1050.12 168.01
## - unemploy_m39 1 198.978 1077.20 169.20
## - crime_rate 1 285.711 1163.94 172.84
##
## Step: AIC=159.83
## mean_education ~ percent_m + police_exp60 + police_exp59 + labour_participation +
## nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality +
## prob_prison + crime_rate
##
## Df Sum of Sq RSS AIC
## <none> 882.51 159.83
## - nonwhites_per1000 1 41.66 924.17 160.00
## - percent_m 1 58.98 941.50 160.87
## - police_exp59 1 63.09 945.61 161.08
## - prob_prison 1 63.54 946.05 161.10
## - police_exp60 1 82.90 965.41 162.05
## - unemploy_m24 1 143.44 1025.95 164.91
## - labour_participation 1 178.39 1060.90 166.49
## - unemploy_m39 1 194.71 1077.22 167.20
## - crime_rate 1 319.06 1201.57 172.34
## - inequality 1 353.95 1236.46 173.68
Jika dibandingkan Adj. R-squared nya;
summary(model_backward)$adj.r.squared
## [1] 0.8041194
ladj.r.squared sudah melebihi 0.78 sehingga model yang akan digunakan adalah
mean_education ~ percent_m + police_exp60 + police_exp59 + labour_participation + nonwhites_per1000 + unemploy_m24 + unemploy_m39 + inequality + prob_prison + crime_rate
Cara evaluasi model di kasus regresi dapat menggunakan beberapa metrics penilaian:
library(MLmetrics)
#MAE
MAE(
y_pred = model_backward$fitted.values,
y_true = crime$mean_education
)
## [1] 3.421272
#MAPE
MAPE(
y_pred = model_backward$fitted.values,
y_true = crime$mean_education
)
## [1] 0.03304032
#RMSE
RMSE(
y_pred = model_backward$fitted.values,
y_true = crime$mean_education
)
## [1] 4.333229
Membaca Error: - MAE: 3.421272, artinya jik nilai tersebut memiliki kemungkinan error +- 3.421272 - Interpretasi error lebih baik cukup baik karena sangat kecil dapat dilihat dari MAE dan MAPE nya yang cenderung kecil
Berikutnya kita akan memprediksi tingkat edukasi
new_data <- read.csv("crime_new_data.csv")
predict(
model_backward,
new_data,
interval="confidence",
level=0.95
)
## fit lwr upr
## 1 100.25105 97.15865 103.34346
## 2 109.40379 102.38043 116.42715
## 3 90.78473 83.00530 98.56415
## 4 101.03516 95.26200 106.80833
## 5 85.84581 77.49685 94.19477
## 6 109.66609 103.77866 115.55353
## 7 106.96437 101.83923 112.08952
## 8 98.69447 93.59531 103.79364
## 9 117.79715 113.57092 122.02338
## 10 120.75632 115.90174 125.61090
# shapiro test
shapiro.test(model_backward$residuals)
##
## Shapiro-Wilk normality test
##
## data: model_backward$residuals
## W = 0.96997, p-value = 0.2643
Keputusan:
- Karena p-value (0.2643) > 0.05 maka keputusannya adalah terima H0, dengan kesimpulan residual dari model sudah berdistribusi normal.
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.6.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(model_backward)
##
## studentized Breusch-Pagan test
##
## data: model_backward
## BP = 6.9191, df = 10, p-value = 0.7331
P value (0.7331) lebih dari alpha maka terima H0: Data residual Homogen (tidak membentuk sebuah pola)
library(car)
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
vif(model_backward)
## percent_m police_exp60 police_exp59
## 2.707920 92.182019 93.735792
## labour_participation nonwhites_per1000 unemploy_m24
## 1.745926 3.153472 3.121315
## unemploy_m39 inequality prob_prison
## 4.091744 4.560387 1.678308
## crime_rate
## 3.317822
Terdapat 1 multi collinerarity yaitu
##Interpretasi model dan rekomendasi terkait case awal Berdasarkan uji validasi dan hasil perdiksi maka dapat dismpulkan model yang dipakai cukup akurat dan fit untuk digunakan. Hanya saja berdasarkan adanya multicollinearity maka salah satu kolom police_exp60 atau police_exp59 perlu di hapus dari data