#import library

library(dplyr) # Data Cleansing
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(GGally) # Visualisasi (korelasi)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(MLmetrics) # Metrics Error
## Warning: package 'MLmetrics' was built under R version 4.2.2
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
#library(performance) # Compare performance
library(lmtest) # Uji Asumsi
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car) # Uji Asumsi
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(rsample)
#load data
crime <- read.csv("crime.csv")
  
head(crime)
##   percent_m is_south mean_education police_exp60 police_exp59
## 1       151        1             91           58           56
## 2       143        0            113          103           95
## 3       142        1             89           45           44
## 4       136        0            121          149          141
## 5       141        0            121          109          101
## 6       121        0            110          118          115
##   labour_participation m_per1000f state_pop nonwhites_per1000 unemploy_m24
## 1                  510        950        33               301          108
## 2                  583       1012        13               102           96
## 3                  533        969        18               219           94
## 4                  577        994       157                80          102
## 5                  591        985        18                30           91
## 6                  547        964        25                44           84
##   unemploy_m39 gdp inequality prob_prison time_prison crime_rate
## 1           41 394        261    0.084602     26.2011        791
## 2           36 557        194    0.029599     25.2999       1635
## 3           33 318        250    0.083401     24.3006        578
## 4           39 673        167    0.015801     29.9012       1969
## 5           20 578        174    0.041399     21.2998       1234
## 6           29 689        126    0.034201     20.9995        682

#preprocessing

str(crime)
## 'data.frame':    47 obs. of  16 variables:
##  $ percent_m           : int  151 143 142 136 141 121 127 131 157 140 ...
##  $ is_south            : int  1 0 1 0 0 0 1 1 1 0 ...
##  $ mean_education      : int  91 113 89 121 121 110 111 109 90 118 ...
##  $ police_exp60        : int  58 103 45 149 109 118 82 115 65 71 ...
##  $ police_exp59        : int  56 95 44 141 101 115 79 109 62 68 ...
##  $ labour_participation: int  510 583 533 577 591 547 519 542 553 632 ...
##  $ m_per1000f          : int  950 1012 969 994 985 964 982 969 955 1029 ...
##  $ state_pop           : int  33 13 18 157 18 25 4 50 39 7 ...
##  $ nonwhites_per1000   : int  301 102 219 80 30 44 139 179 286 15 ...
##  $ unemploy_m24        : int  108 96 94 102 91 84 97 79 81 100 ...
##  $ unemploy_m39        : int  41 36 33 39 20 29 38 35 28 24 ...
##  $ gdp                 : int  394 557 318 673 578 689 620 472 421 526 ...
##  $ inequality          : int  261 194 250 167 174 126 168 206 239 174 ...
##  $ prob_prison         : num  0.0846 0.0296 0.0834 0.0158 0.0414 ...
##  $ time_prison         : num  26.2 25.3 24.3 29.9 21.3 ...
##  $ crime_rate          : int  791 1635 578 1969 1234 682 963 1555 856 705 ...

Deskripsi Data - percent_m: percentage of males aged 14-24 - is_south: whether it is in a Southern state. 1 for Yes, 0 for No. - mean_education: mean years of schooling - police_exp60: police expenditure in 1960 - police_exp59: police expenditure in 1959 - labour_participation: labour force participation rate - m_per1000f: number of males per 1000 females - state_pop: state population - nonwhites_per1000: number of non-whites resident per 1000 people - unemploy_m24: unemployment rate of urban males aged 14-24 - unemploy_m39: unemployment rate of urban males aged 35-39 - gdp: gross domestic product per head - inequality: income inequality - prob_prison: probability of imprisonment - time_prison: avg time served in prisons - crime_rate: crime rate in an unspecified category Produce a linear model

##check missing value

colSums(is.na(crime))
##            percent_m             is_south       mean_education 
##                    0                    0                    0 
##         police_exp60         police_exp59 labour_participation 
##                    0                    0                    0 
##           m_per1000f            state_pop    nonwhites_per1000 
##                    0                    0                    0 
##         unemploy_m24         unemploy_m39                  gdp 
##                    0                    0                    0 
##           inequality          prob_prison          time_prison 
##                    0                    0                    0 
##           crime_rate 
##                    0

#exploratory data analysis ##melihat korelasi antar variabel

ggcorr(crime, label = T, label_size = 2.5, hjust = 1, layout.exp = 2)

Berdasarkan hasil chunk sebelumnya, kita dapat melihat bahwa tidak semua variabel memiliki korelasi dengan variabel crime_rate. Variabel yang memiliki korelasi tertinggi dengan crime_rate adalah police_exp60 dan police_exp59 dengan korelasi 0,7.

#Modelling

model_crime <- lm(crime_rate ~ police_exp59, crime)
summary(model_crime)
## 
## Call:
## lm(formula = crime_rate ~ police_exp59, data = crime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -595.58 -156.76   12.29  146.74  593.74 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   165.164    130.427   1.266    0.212    
## police_exp59    9.222      1.537   6.001 3.11e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 291.4 on 45 degrees of freedom
## Multiple R-squared:  0.4445, Adjusted R-squared:  0.4322 
## F-statistic: 36.01 on 1 and 45 DF,  p-value: 3.114e-07
model_full <- lm(crime_rate ~ ., crime)
model_step <- step(model_full, direction = "backward", trace = 0) 
summary(model_step)
## 
## Call:
## lm(formula = crime_rate ~ percent_m + mean_education + police_exp60 + 
##     m_per1000f + unemploy_m24 + unemploy_m39 + inequality + prob_prison, 
##     data = crime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -444.70 -111.07    3.03  122.15  483.30 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -6426.101   1194.611  -5.379 4.04e-06 ***
## percent_m          9.332      3.350   2.786  0.00828 ** 
## mean_education    18.012      5.275   3.414  0.00153 ** 
## police_exp60      10.265      1.552   6.613 8.26e-08 ***
## m_per1000f         2.234      1.360   1.642  0.10874    
## unemploy_m24      -6.087      3.339  -1.823  0.07622 .  
## unemploy_m39      18.735      7.248   2.585  0.01371 *  
## inequality         6.133      1.396   4.394 8.63e-05 ***
## prob_prison    -3796.032   1490.646  -2.547  0.01505 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 195.5 on 38 degrees of freedom
## Multiple R-squared:  0.7888, Adjusted R-squared:  0.7444 
## F-statistic: 17.74 on 8 and 38 DF,  p-value: 1.159e-10
crime_test <- read.csv("crime_test.csv")
pred_crime1 <- predict(model_crime, crime_test)
pred_crime2 <- predict(model_step, crime_test)
MAPE(pred_crime1, crime_test$crime_rate)
## [1] 0.3046554
MAPE(pred_crime2, crime_test$crime_rate)
## [1] 0.2339692

#Uji Asumsi ##Shapiro Test

shapiro.test(model_crime$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_crime$residuals
## W = 0.977, p-value = 0.4748
shapiro.test(model_step$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_step$residuals
## W = 0.98511, p-value = 0.8051

##Breusch-Pagan Test

bptest(model_crime)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_crime
## BP = 22.918, df = 1, p-value = 1.69e-06
bptest(model_step)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_step
## BP = 13.51, df = 8, p-value = 0.09546

#Variance Inflation Factor

vif(model_step)
##      percent_m mean_education   police_exp60     m_per1000f   unemploy_m24 
##       2.131963       4.189684       2.560496       1.932367       4.360038 
##   unemploy_m39     inequality    prob_prison 
##       4.508106       3.731074       1.381879

#Kesimpulan Dapat disimpulkan bahwa model kedua atau model_step yang menggunakan metode stepwise-regression lebih baik dari model pertama dengan hanya satu variabel yang memiliki korelasi positif tertinggi dengan variabel crime_rate. Alasannya adalah: