#import library
library(dplyr) # Data Cleansing
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally) # Visualisasi (korelasi)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(MLmetrics) # Metrics Error
## Warning: package 'MLmetrics' was built under R version 4.2.2
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
#library(performance) # Compare performance
library(lmtest) # Uji Asumsi
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # Uji Asumsi
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(rsample)
#load data
crime <- read.csv("crime.csv")
head(crime)
## percent_m is_south mean_education police_exp60 police_exp59
## 1 151 1 91 58 56
## 2 143 0 113 103 95
## 3 142 1 89 45 44
## 4 136 0 121 149 141
## 5 141 0 121 109 101
## 6 121 0 110 118 115
## labour_participation m_per1000f state_pop nonwhites_per1000 unemploy_m24
## 1 510 950 33 301 108
## 2 583 1012 13 102 96
## 3 533 969 18 219 94
## 4 577 994 157 80 102
## 5 591 985 18 30 91
## 6 547 964 25 44 84
## unemploy_m39 gdp inequality prob_prison time_prison crime_rate
## 1 41 394 261 0.084602 26.2011 791
## 2 36 557 194 0.029599 25.2999 1635
## 3 33 318 250 0.083401 24.3006 578
## 4 39 673 167 0.015801 29.9012 1969
## 5 20 578 174 0.041399 21.2998 1234
## 6 29 689 126 0.034201 20.9995 682
#preprocessing
str(crime)
## 'data.frame': 47 obs. of 16 variables:
## $ percent_m : int 151 143 142 136 141 121 127 131 157 140 ...
## $ is_south : int 1 0 1 0 0 0 1 1 1 0 ...
## $ mean_education : int 91 113 89 121 121 110 111 109 90 118 ...
## $ police_exp60 : int 58 103 45 149 109 118 82 115 65 71 ...
## $ police_exp59 : int 56 95 44 141 101 115 79 109 62 68 ...
## $ labour_participation: int 510 583 533 577 591 547 519 542 553 632 ...
## $ m_per1000f : int 950 1012 969 994 985 964 982 969 955 1029 ...
## $ state_pop : int 33 13 18 157 18 25 4 50 39 7 ...
## $ nonwhites_per1000 : int 301 102 219 80 30 44 139 179 286 15 ...
## $ unemploy_m24 : int 108 96 94 102 91 84 97 79 81 100 ...
## $ unemploy_m39 : int 41 36 33 39 20 29 38 35 28 24 ...
## $ gdp : int 394 557 318 673 578 689 620 472 421 526 ...
## $ inequality : int 261 194 250 167 174 126 168 206 239 174 ...
## $ prob_prison : num 0.0846 0.0296 0.0834 0.0158 0.0414 ...
## $ time_prison : num 26.2 25.3 24.3 29.9 21.3 ...
## $ crime_rate : int 791 1635 578 1969 1234 682 963 1555 856 705 ...
Deskripsi Data - percent_m: percentage of males aged 14-24 - is_south: whether it is in a Southern state. 1 for Yes, 0 for No. - mean_education: mean years of schooling - police_exp60: police expenditure in 1960 - police_exp59: police expenditure in 1959 - labour_participation: labour force participation rate - m_per1000f: number of males per 1000 females - state_pop: state population - nonwhites_per1000: number of non-whites resident per 1000 people - unemploy_m24: unemployment rate of urban males aged 14-24 - unemploy_m39: unemployment rate of urban males aged 35-39 - gdp: gross domestic product per head - inequality: income inequality - prob_prison: probability of imprisonment - time_prison: avg time served in prisons - crime_rate: crime rate in an unspecified category Produce a linear model
##check missing value
colSums(is.na(crime))
## percent_m is_south mean_education
## 0 0 0
## police_exp60 police_exp59 labour_participation
## 0 0 0
## m_per1000f state_pop nonwhites_per1000
## 0 0 0
## unemploy_m24 unemploy_m39 gdp
## 0 0 0
## inequality prob_prison time_prison
## 0 0 0
## crime_rate
## 0
#exploratory data analysis ##melihat korelasi antar variabel
ggcorr(crime, label = T, label_size = 2.5, hjust = 1, layout.exp = 2)
Berdasarkan hasil chunk sebelumnya, kita dapat melihat bahwa tidak semua variabel memiliki korelasi dengan variabel crime_rate. Variabel yang memiliki korelasi tertinggi dengan crime_rate adalah police_exp60 dan police_exp59 dengan korelasi 0,7.
#Modelling
model_crime <- lm(crime_rate ~ police_exp59, crime)
summary(model_crime)
##
## Call:
## lm(formula = crime_rate ~ police_exp59, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -595.58 -156.76 12.29 146.74 593.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 165.164 130.427 1.266 0.212
## police_exp59 9.222 1.537 6.001 3.11e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 291.4 on 45 degrees of freedom
## Multiple R-squared: 0.4445, Adjusted R-squared: 0.4322
## F-statistic: 36.01 on 1 and 45 DF, p-value: 3.114e-07
model_full <- lm(crime_rate ~ ., crime)
model_step <- step(model_full, direction = "backward", trace = 0)
summary(model_step)
##
## Call:
## lm(formula = crime_rate ~ percent_m + mean_education + police_exp60 +
## m_per1000f + unemploy_m24 + unemploy_m39 + inequality + prob_prison,
## data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -444.70 -111.07 3.03 122.15 483.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6426.101 1194.611 -5.379 4.04e-06 ***
## percent_m 9.332 3.350 2.786 0.00828 **
## mean_education 18.012 5.275 3.414 0.00153 **
## police_exp60 10.265 1.552 6.613 8.26e-08 ***
## m_per1000f 2.234 1.360 1.642 0.10874
## unemploy_m24 -6.087 3.339 -1.823 0.07622 .
## unemploy_m39 18.735 7.248 2.585 0.01371 *
## inequality 6.133 1.396 4.394 8.63e-05 ***
## prob_prison -3796.032 1490.646 -2.547 0.01505 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 195.5 on 38 degrees of freedom
## Multiple R-squared: 0.7888, Adjusted R-squared: 0.7444
## F-statistic: 17.74 on 8 and 38 DF, p-value: 1.159e-10
crime_test <- read.csv("crime_test.csv")
pred_crime1 <- predict(model_crime, crime_test)
pred_crime2 <- predict(model_step, crime_test)
MAPE(pred_crime1, crime_test$crime_rate)
## [1] 0.3046554
MAPE(pred_crime2, crime_test$crime_rate)
## [1] 0.2339692
#Uji Asumsi ##Shapiro Test
shapiro.test(model_crime$residuals)
##
## Shapiro-Wilk normality test
##
## data: model_crime$residuals
## W = 0.977, p-value = 0.4748
shapiro.test(model_step$residuals)
##
## Shapiro-Wilk normality test
##
## data: model_step$residuals
## W = 0.98511, p-value = 0.8051
##Breusch-Pagan Test
bptest(model_crime)
##
## studentized Breusch-Pagan test
##
## data: model_crime
## BP = 22.918, df = 1, p-value = 1.69e-06
bptest(model_step)
##
## studentized Breusch-Pagan test
##
## data: model_step
## BP = 13.51, df = 8, p-value = 0.09546
#Variance Inflation Factor
vif(model_step)
## percent_m mean_education police_exp60 m_per1000f unemploy_m24
## 2.131963 4.189684 2.560496 1.932367 4.360038
## unemploy_m39 inequality prob_prison
## 4.508106 3.731074 1.381879
#Kesimpulan Dapat disimpulkan bahwa model kedua atau model_step yang menggunakan metode stepwise-regression lebih baik dari model pertama dengan hanya satu variabel yang memiliki korelasi positif tertinggi dengan variabel crime_rate. Alasannya adalah: