The data taken in this report are real data that sourced from data.jakarta.go.id with some modifications.
After import all the data, I need to see what is the data look like. What are the columns name inside the dataframe. Data Based on The Perpetrators/Suspect
Data Based on The Type of Violence
Data Based on The Education of The Perpetrators
Because data type column tahun is factor, I need to change it to become date type, so I use lubridate library
data$tahun <- as.character(data$tahun)
data$tahun <- mdy(data$tahun) # change into date type
data2$tahun <- as.character(data2$tahun)
data2$tahun <- mdy(data2$tahun) # change into date type
data3$tahun <- as.character(data3$tahun)
data3$tahun <- mdy(data3$tahun) # change into date typeTo make sure, call again all dataframes that have been modified
# Aggregation
suspect <- data %>%
mutate(tahun = year(tahun)) %>%
group_by(pelaku_kekerasan, tahun) %>%
summarise(total = sum(jumlah_korban))
suspect# Pre-visualization
suspect.viz <- suspect %>%
mutate(text = glue(
"Suspect: {pelaku_kekerasan}
Year: {tahun}
Number of victim: {total}"
))
# Visualization
suspect.plot <- ggplot(suspect.viz, aes(x = pelaku_kekerasan, y = total, text = text)) +
geom_col(aes(fill = tahun)) +
facet_wrap(~ tahun, nrow = 1) +
theme_minimal() +
labs(title = "How Many Victims Based on The Perpetrators?",
subtitle = "Comparing to 2018 and 2019",
x = NULL,
y = NULL) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5))
ggplotly(suspect.plot, tooltip = "text") %>%
config(displayModeBar = FALSE) %>%
layout(legend = list(x = 0, y = 100, orientation = "h"),
annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id",
showarrow = F, xref='paper', yref='paper',
xanchor='right', yanchor='auto', xshift=0, yshift=0,
font=list(size=12,color="green")),
title = list(text = paste0('How Many Victims Based on The Perpetrators?',
'<br>',
'<sup>',
'Comparing to 2018 and 2019',
'</sup>')))# Aggregation
suspect.2 <- data %>%
mutate(tahun. = year(tahun),
bulan = month(tahun)) %>%
group_by(pelaku_kekerasan, tahun., bulan) %>%
summarise(total = sum(jumlah_korban)) %>%
ungroup()
suspect.2# Visualization
suspect.plot.2 <- ggplot(suspect.2, aes(x = bulan, y = total)) +
geom_line(aes(col = pelaku_kekerasan)) +
facet_wrap(~ tahun., nrow = 2) +
theme_minimal() +
scale_x_continuous(breaks = seq(1,9,1)) +
labs(title = "Number of Violence Cases by Perpetrators for Every Month",
x = "Month",
y = NULL) +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplotly(suspect.plot.2) %>%
config(displayModeBar = FALSE) %>%
layout(legend = list(x = 0, y = 100, orientation = "h"),
annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id",
showarrow = F, xref='paper', yref='paper',
xanchor='right', yanchor='auto', xshift=0, yshift=0,
font=list(size=12,color="green")))# Aggregation
type <- data2 %>%
mutate(tahun = year(tahun)) %>%
group_by(bentuk_kekerasan, tahun) %>%
summarise(total = sum(jumlah_korban))
type# Pre-visualization
type.viz <- type %>%
mutate(text2 = glue(
"Type: {bentuk_kekerasan}
Year: {tahun}
Number of victim: {total}"
))
# Visualization
type.plot <- ggplot(type.viz, aes(x = bentuk_kekerasan, y = total, text = text2)) +
geom_col(aes(fill = tahun)) +
facet_wrap(~ tahun, nrow = 1) +
theme_minimal() +
labs(title = "What Kind Of Violence?",
x = NULL,
y = NULL) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5))
ggplotly(type.plot, tooltip = "text") %>%
config(displayModeBar = FALSE) %>%
layout(legend = list(x = 0, y = 100, orientation = "h"),
annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id",
showarrow = F, xref='paper', yref='paper',
xanchor='right', yanchor='auto', xshift=0, yshift=0,
font=list(size=12,color="green")),
title = list(text = paste0('How Many Victims Based on The Violence Type?',
'<br>',
'<sup>',
'Comparing to 2018 and 2019',
'</sup>')))# Aggregation
type.2 <- data2 %>%
mutate(tahun. = year(tahun),
bulan = month(tahun)) %>%
group_by(bentuk_kekerasan, tahun., bulan) %>%
summarise(total = sum(jumlah_korban))
type.2# Visualization
type.plot.2 <- ggplot(type.2, aes(x = bulan, y = total)) +
geom_line(aes(col = bentuk_kekerasan)) +
facet_wrap(~ tahun., nrow = 2) +
theme_minimal() +
scale_x_continuous(breaks = seq(1,9,1)) +
labs(title = "Number of Violence Cases by Type for Every Month",
x = "Month",
y = NULL) +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplotly(type.plot.2) %>%
config(displayModeBar = FALSE) %>%
layout(legend = list(x = 0, y = 100, orientation = "h"),
annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id",
showarrow = F, xref='paper', yref='paper',
xanchor='right', yanchor='auto', xshift=0, yshift=0,
font=list(size=12,color="green")))# Aggregation
edu <- data3 %>%
mutate(tahun = year(tahun)) %>%
group_by(education, tahun) %>%
summarise(total = sum(jumlah_korban)) %>%
arrange(desc(total))
edu# Pre-visualization
edu.viz <- edu %>%
mutate(text3 = glue(
"Education: {education}
Year: {tahun}
Number of victim: {total}"
))
# Visualization
edu.plot <- ggplot(edu.viz, aes(x = reorder(education,total), y = total, text = text3)) +
geom_col(aes(fill = tahun)) +
facet_wrap(~ tahun, nrow = 1) +
theme_minimal() +
labs(title = "Does Education Affect?",
x = NULL,
y = NULL) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5))
ggplotly(edu.plot, tooltip = "text") %>%
config(displayModeBar = FALSE) %>%
layout(legend = list(x = 0, y = 100, orientation = "h"),
annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id",
showarrow = F, xref='paper', yref='paper',
xanchor='right', yanchor='auto', xshift=0, yshift=0,
font=list(size=12,color="green")),
title = list(text = paste0('Does Education Affect?',
'<br>',
'<sup>',
'Based on The Perpetrators',
'</sup>')))I want to predict the total of victim based on the suspect. So in this case I make it become independent from time (year), so I hope this prediction can predict in different year without considering the year. In this case I use linear regression method.
lm.none <- lm(formula = total~1, data = suspect.reg)
lm.all <- lm(formula = total~., data = suspect.reg)With feature selection, I want to know the better model. In this case, I use the Step-wise Regression and compare the R-squared, Adj. R-squared, and p-value from both forward selection and backward selection
#> Start: AIC=464.89
#> total ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + pelaku_kekerasan 3 26400.2 18208 406.37
#> + tahun. 1 5425.3 39183 457.55
#> <none> 44608 464.89
#> + bulan 8 3642.7 40965 474.75
#>
#> Step: AIC=406.37
#> total ~ pelaku_kekerasan
#>
#> Df Sum of Sq RSS AIC
#> + tahun. 1 5425.3 12782 382.90
#> + bulan 8 3642.8 14565 406.30
#> <none> 18208 406.37
#>
#> Step: AIC=382.9
#> total ~ pelaku_kekerasan + tahun.
#>
#> Df Sum of Sq RSS AIC
#> + bulan 8 3642.8 9139.6 374.75
#> <none> 12782.4 382.90
#>
#> Step: AIC=374.75
#> total ~ pelaku_kekerasan + tahun. + bulan
#>
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + tahun. + bulan, data = suspect.reg)
#>
#> Coefficients:
#> (Intercept) pelaku_kekerasanHusband pelaku_kekerasanOthers
#> 35048.833 40.111 42.056
#> pelaku_kekerasanParents tahun. bulan2
#> 6.111 -17.361 7.250
#> bulan3 bulan4 bulan5
#> 10.750 0.250 -5.375
#> bulan6 bulan7 bulan8
#> -14.625 5.875 0.250
#> bulan9
#> 4.250
forward_model <- lm(formula = total ~ pelaku_kekerasan + bulan , data = suspect.reg)
summary(forward_model)#>
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.reg)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -35.292 -7.944 -2.042 10.434 36.208
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 5.431 6.361 0.854 0.3966
#> pelaku_kekerasanHusband 40.111 5.194 7.723 1.45e-10 ***
#> pelaku_kekerasanOthers 42.056 5.194 8.098 3.34e-11 ***
#> pelaku_kekerasanParents 6.111 5.194 1.177 0.2440
#> bulan2 7.250 7.790 0.931 0.3558
#> bulan3 10.750 7.790 1.380 0.1727
#> bulan4 0.250 7.790 0.032 0.9745
#> bulan5 -5.375 7.790 -0.690 0.4929
#> bulan6 -14.625 7.790 -1.877 0.0653 .
#> bulan7 5.875 7.790 0.754 0.4537
#> bulan8 0.250 7.790 0.032 0.9745
#> bulan9 4.250 7.790 0.546 0.5874
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 15.58 on 60 degrees of freedom
#> Multiple R-squared: 0.6735, Adjusted R-squared: 0.6136
#> F-statistic: 11.25 on 11 and 60 DF, p-value: 5.938e-11
#> Start: AIC=374.75
#> total ~ pelaku_kekerasan + tahun. + bulan
#>
#> Df Sum of Sq RSS AIC
#> <none> 9140 374.75
#> - bulan 8 3642.8 12782 382.90
#> - tahun. 1 5425.3 14565 406.30
#> - pelaku_kekerasan 3 26400.2 35540 466.53
#>
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + tahun. + bulan, data = suspect.reg)
#>
#> Coefficients:
#> (Intercept) pelaku_kekerasanHusband pelaku_kekerasanOthers
#> 35048.833 40.111 42.056
#> pelaku_kekerasanParents tahun. bulan2
#> 6.111 -17.361 7.250
#> bulan3 bulan4 bulan5
#> 10.750 0.250 -5.375
#> bulan6 bulan7 bulan8
#> -14.625 5.875 0.250
#> bulan9
#> 4.250
backward_model <- lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.reg)
summary(backward_model)#>
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.reg)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -35.292 -7.944 -2.042 10.434 36.208
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 5.431 6.361 0.854 0.3966
#> pelaku_kekerasanHusband 40.111 5.194 7.723 1.45e-10 ***
#> pelaku_kekerasanOthers 42.056 5.194 8.098 3.34e-11 ***
#> pelaku_kekerasanParents 6.111 5.194 1.177 0.2440
#> bulan2 7.250 7.790 0.931 0.3558
#> bulan3 10.750 7.790 1.380 0.1727
#> bulan4 0.250 7.790 0.032 0.9745
#> bulan5 -5.375 7.790 -0.690 0.4929
#> bulan6 -14.625 7.790 -1.877 0.0653 .
#> bulan7 5.875 7.790 0.754 0.4537
#> bulan8 0.250 7.790 0.032 0.9745
#> bulan9 4.250 7.790 0.546 0.5874
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 15.58 on 60 degrees of freedom
#> Multiple R-squared: 0.6735, Adjusted R-squared: 0.6136
#> F-statistic: 11.25 on 11 and 60 DF, p-value: 5.938e-11
From both selection I got the same value for both forward selection and backward selection. From this, I can use one of them for the next calculation.
From the plot, there are some data that have high influence in number of observe 26, 28, and 30
suspect.fill.model <- lm(total~.,data = suspect.fil)
step(suspect.fill.model, direction = "backward")#> Start: AIC=341.32
#> total ~ pelaku_kekerasan + tahun. + bulan
#>
#> Df Sum of Sq RSS AIC
#> <none> 6660 341.32
#> - bulan 8 4419.2 11080 360.43
#> - tahun. 1 3709.1 10369 369.86
#> - pelaku_kekerasan 3 26315.1 32975 445.69
#>
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + tahun. + bulan, data = suspect.fil)
#>
#> Coefficients:
#> (Intercept) pelaku_kekerasanHusband pelaku_kekerasanOthers
#> 29713.3654 41.9087 42.0556
#> pelaku_kekerasanParents tahun. bulan2
#> 6.1111 -14.7162 3.4598
#> bulan3 bulan4 bulan5
#> 11.1429 -3.5402 -9.1652
#> bulan6 bulan7 bulan8
#> -18.4152 2.0848 -7.4691
#> bulan9
#> 0.4598
backward_suspect_fil <- lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.fil)
summary(backward_suspect_fil)#>
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.fil)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -27.1505 -7.0949 0.7156 9.0569 29.4514
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 9.7447 5.7487 1.695 0.09551 .
#> pelaku_kekerasanHusband 42.4537 4.7391 8.958 1.81e-12 ***
#> pelaku_kekerasanOthers 42.0556 4.4959 9.354 4.10e-13 ***
#> pelaku_kekerasanParents 6.1111 4.4959 1.359 0.17942
#> bulan2 2.3502 6.9935 0.336 0.73806
#> bulan3 11.1429 7.2095 1.546 0.12774
#> bulan4 -4.6498 6.9935 -0.665 0.50881
#> bulan5 -10.2748 6.9935 -1.469 0.14728
#> bulan6 -19.5248 6.9935 -2.792 0.00712 **
#> bulan7 0.9752 6.9935 0.139 0.88959
#> bulan8 -9.5714 7.2095 -1.328 0.18960
#> bulan9 -0.6498 6.9935 -0.093 0.92630
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 13.49 on 57 degrees of freedom
#> Multiple R-squared: 0.7497, Adjusted R-squared: 0.7014
#> F-statistic: 15.52 on 11 and 57 DF, p-value: 2.092e-13
After removing the high influence data, I can get the higher R-squared (from 0.6735 become 0.7495) and Adj. R-squared (from 0.6136 become 0.7014) that more fit with the model
I want to check the error of the model using Mean Absolute Error (MAE)
#> [1] 9.911615
To know the model meet BLUE (Best Linear Model Estimator), I use some models
To check if the error distribute normally
From the historam I can see if the error distribute quite normally.
Or also can use shapiro test. From shapiro test I know the error distribute normally if the p-value > 0.05
#>
#> Shapiro-Wilk normality test
#>
#> data: backward_suspect_fil$residuals
#> W = 0.98873, p-value = 0.7938
Because the p-value is 0.7938 > 0.05, we can conclude that the error distributed normally
I want to check if the residual variance from the model doesn’t make pattern or distributed randomly. To do that, I can use Breusch-Pagan Test
Breusch-Pagan Test Breusch-Pagan Test concluded that if the p-value < 0.05, the residual variance of the model meet the condition of heteroscedasticity.
#>
#> studentized Breusch-Pagan test
#>
#> data: backward_suspect_fil
#> BP = 17.566, df = 11, p-value = 0.09221
Because the p-value from the BP-test is 0.09221 > 0.05, then I can conclude that the variance residual of the model is homoscedasticity / no heteroscedasticity
#>
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.fil)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -27.1505 -7.0949 0.7156 9.0569 29.4514
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 9.7447 5.7487 1.695 0.09551 .
#> pelaku_kekerasanHusband 42.4537 4.7391 8.958 1.81e-12 ***
#> pelaku_kekerasanOthers 42.0556 4.4959 9.354 4.10e-13 ***
#> pelaku_kekerasanParents 6.1111 4.4959 1.359 0.17942
#> bulan2 2.3502 6.9935 0.336 0.73806
#> bulan3 11.1429 7.2095 1.546 0.12774
#> bulan4 -4.6498 6.9935 -0.665 0.50881
#> bulan5 -10.2748 6.9935 -1.469 0.14728
#> bulan6 -19.5248 6.9935 -2.792 0.00712 **
#> bulan7 0.9752 6.9935 0.139 0.88959
#> bulan8 -9.5714 7.2095 -1.328 0.18960
#> bulan9 -0.6498 6.9935 -0.093 0.92630
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 13.49 on 57 degrees of freedom
#> Multiple R-squared: 0.7497, Adjusted R-squared: 0.7014
#> F-statistic: 15.52 on 11 and 57 DF, p-value: 2.092e-13
The model made from the data 2018 and 2019 from January until September, so It can’t Predict other months (October, November, December).
The overall p-value from the summary concluded that the target and the predictor have good correlation. But from the R-squared value 0.7497 shows that I need to get more data to make more fit with the model.
From the checking test, linearity, no heteroscedasticity, and normality residuals, I can say that the model pass the assumption checking that used for linear regression method