Understanding The Data

Library

# Calling the library that we will use for this data case
library(dplyr)
library(ggplot2)
library(lubridate)
library(plotly)
library(glue)
library(tidyr)
library(lmtest)
library(MLmetrics)
library(leaps)
library(car)

Importing Data

The data taken in this report are real data that sourced from data.jakarta.go.id with some modifications.

data <- read.csv("data/data_kekerasan2.csv")
data2 <- read.csv("data/data_kekerasan_tipe.csv")
data3 <-  read.csv("data/data-korban-kekerasan--pendidikan-pelaku.csv")

KYD (Know Your Data)

After import all the data, I need to see what is the data look like. What are the columns name inside the dataframe. Data Based on The Perpetrators/Suspect

head(data)

Data Based on The Type of Violence

head(data2)

Data Based on The Education of The Perpetrators

head(data3)

Pre-processing

Because data type column tahun is factor, I need to change it to become date type, so I use lubridate library

data$tahun <- as.character(data$tahun)
data$tahun <- mdy(data$tahun) # change into date type

data2$tahun <- as.character(data2$tahun)
data2$tahun <- mdy(data2$tahun) # change into date type

data3$tahun <- as.character(data3$tahun)
data3$tahun <- mdy(data3$tahun) # change into date type

To make sure, call again all dataframes that have been modified

head(data)

head(data2)

head(data3)

Plotting

Based on Perpetrators

Comparing by Year

# Aggregation
suspect <- data %>% 
  mutate(tahun = year(tahun)) %>% 
  group_by(pelaku_kekerasan, tahun) %>% 
  summarise(total = sum(jumlah_korban))

suspect

# Pre-visualization
suspect.viz <- suspect %>% 
            mutate(text = glue(
                "Suspect: {pelaku_kekerasan}
                Year: {tahun}
                Number of victim: {total}"
            ))

# Visualization
suspect.plot <- ggplot(suspect.viz, aes(x = pelaku_kekerasan, y = total, text = text)) +
  geom_col(aes(fill = tahun)) +
  facet_wrap(~ tahun, nrow = 1) +
  theme_minimal() +
  labs(title = "How Many Victims Based on The Perpetrators?",
       subtitle = "Comparing to 2018 and 2019",
       x = NULL,
       y = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

ggplotly(suspect.plot, tooltip = "text")  %>% 
  config(displayModeBar = FALSE) %>% 
  layout(legend = list(x = 0, y = 100, orientation = "h"),
         annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id", 
      showarrow = F, xref='paper', yref='paper', 
      xanchor='right', yanchor='auto', xshift=0, yshift=0,
      font=list(size=12,color="green")),
      title = list(text = paste0('How Many Victims Based on The Perpetrators?',
                                    '<br>',
                                    '<sup>',
                                    'Comparing to 2018 and 2019',
                                    '</sup>')))

Comparing Through All Month

# Aggregation
suspect.2 <- data %>% 
  mutate(tahun. = year(tahun),
         bulan = month(tahun)) %>% 
  group_by(pelaku_kekerasan, tahun., bulan) %>% 
  summarise(total = sum(jumlah_korban)) %>% 
  ungroup()

suspect.2

# Visualization
suspect.plot.2 <- ggplot(suspect.2, aes(x = bulan, y = total)) +
  geom_line(aes(col = pelaku_kekerasan)) +
  facet_wrap(~ tahun., nrow = 2) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(1,9,1)) +
  labs(title = "Number of Violence Cases by Perpetrators for Every Month",
       x = "Month",
       y = NULL) +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplotly(suspect.plot.2)  %>% 
  config(displayModeBar = FALSE) %>% 
  layout(legend = list(x = 0, y = 100, orientation = "h"),
         annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id", 
      showarrow = F, xref='paper', yref='paper', 
      xanchor='right', yanchor='auto', xshift=0, yshift=0,
      font=list(size=12,color="green")))

Based on Types

Comparing by Year

# Aggregation
type <- data2 %>% 
  mutate(tahun = year(tahun)) %>% 
  group_by(bentuk_kekerasan, tahun) %>% 
  summarise(total = sum(jumlah_korban))

type

# Pre-visualization
type.viz <- type %>% 
            mutate(text2 = glue(
                "Type: {bentuk_kekerasan}
                Year: {tahun}
                Number of victim: {total}"
            ))

# Visualization
type.plot <- ggplot(type.viz, aes(x = bentuk_kekerasan, y = total, text = text2)) +
  geom_col(aes(fill = tahun)) +
  facet_wrap(~ tahun, nrow = 1) +
  theme_minimal() +
  labs(title = "What Kind Of Violence?",
       x = NULL,
       y = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

ggplotly(type.plot, tooltip = "text")  %>% 
  config(displayModeBar = FALSE) %>% 
  layout(legend = list(x = 0, y = 100, orientation = "h"),
         annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id", 
      showarrow = F, xref='paper', yref='paper', 
      xanchor='right', yanchor='auto', xshift=0, yshift=0,
      font=list(size=12,color="green")),
      title = list(text = paste0('How Many Victims Based on The Violence Type?',
                                    '<br>',
                                    '<sup>',
                                    'Comparing to 2018 and 2019',
                                    '</sup>')))

Comparing Through All Month

# Aggregation
type.2 <- data2 %>% 
  mutate(tahun. = year(tahun),
         bulan = month(tahun)) %>% 
  group_by(bentuk_kekerasan, tahun., bulan) %>% 
  summarise(total = sum(jumlah_korban))

type.2

# Visualization
type.plot.2 <- ggplot(type.2, aes(x = bulan, y = total)) +
  geom_line(aes(col = bentuk_kekerasan)) +
  facet_wrap(~ tahun., nrow = 2) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(1,9,1)) +
  labs(title = "Number of Violence Cases by Type for Every Month",
       x = "Month",
       y = NULL) +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplotly(type.plot.2)  %>% 
  config(displayModeBar = FALSE) %>% 
  layout(legend = list(x = 0, y = 100, orientation = "h"),
         annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id", 
      showarrow = F, xref='paper', yref='paper', 
      xanchor='right', yanchor='auto', xshift=0, yshift=0,
      font=list(size=12,color="green")))

Based on Education

Does Education Affect?

# Aggregation
edu <- data3 %>% 
  mutate(tahun = year(tahun)) %>% 
  group_by(education, tahun) %>% 
  summarise(total = sum(jumlah_korban)) %>% 
  arrange(desc(total))

edu

# Pre-visualization
edu.viz <- edu %>% 
            mutate(text3 = glue(
                "Education: {education}
                Year: {tahun}
                Number of victim: {total}"
            ))

# Visualization
edu.plot <- ggplot(edu.viz, aes(x = reorder(education,total), y = total, text = text3)) +
  geom_col(aes(fill = tahun)) +
  facet_wrap(~ tahun, nrow = 1) +
  theme_minimal() +
  labs(title = "Does Education Affect?",
       x = NULL,
       y = NULL) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

ggplotly(edu.plot, tooltip = "text")  %>% 
  config(displayModeBar = FALSE) %>% 
  layout(legend = list(x = 0, y = 100, orientation = "h"),
         annotations = list(x = 1, y = -0.1, text = "Source: data.jakarta.go.id", 
      showarrow = F, xref='paper', yref='paper', 
      xanchor='right', yanchor='auto', xshift=0, yshift=0,
      font=list(size=12,color="green")),
      title = list(text = paste0('Does Education Affect?',
                                    '<br>',
                                    '<sup>',
                                    'Based on The Perpetrators',
                                    '</sup>')))

Let’s Predict

I want to predict the total of victim based on the suspect. So in this case I make it become independent from time (year), so I hope this prediction can predict in different year without considering the year. In this case I use linear regression method.

suspect.reg <- suspect.2 %>% 
  mutate(bulan = as.factor(bulan))

lm.none <- lm(formula = total~1, data = suspect.reg)
lm.all <-  lm(formula = total~., data = suspect.reg)

Feature Selection

With feature selection, I want to know the better model. In this case, I use the Step-wise Regression and compare the R-squared, Adj. R-squared, and p-value from both forward selection and backward selection

Forward Selection

step(object = lm.none, list(lower = lm.none, upper = lm.all), direction = "forward")

#> Start:  AIC=464.89
#> total ~ 1
#> 
#>                    Df Sum of Sq   RSS    AIC
#> + pelaku_kekerasan  3   26400.2 18208 406.37
#> + tahun.            1    5425.3 39183 457.55
#> <none>                          44608 464.89
#> + bulan             8    3642.7 40965 474.75
#> 
#> Step:  AIC=406.37
#> total ~ pelaku_kekerasan
#> 
#>          Df Sum of Sq   RSS    AIC
#> + tahun.  1    5425.3 12782 382.90
#> + bulan   8    3642.8 14565 406.30
#> <none>                18208 406.37
#> 
#> Step:  AIC=382.9
#> total ~ pelaku_kekerasan + tahun.
#> 
#>         Df Sum of Sq     RSS    AIC
#> + bulan  8    3642.8  9139.6 374.75
#> <none>               12782.4 382.90
#> 
#> Step:  AIC=374.75
#> total ~ pelaku_kekerasan + tahun. + bulan

#> 
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + tahun. + bulan, data = suspect.reg)
#> 
#> Coefficients:
#>             (Intercept)  pelaku_kekerasanHusband   pelaku_kekerasanOthers  
#>               35048.833                   40.111                   42.056  
#> pelaku_kekerasanParents                   tahun.                   bulan2  
#>                   6.111                  -17.361                    7.250  
#>                  bulan3                   bulan4                   bulan5  
#>                  10.750                    0.250                   -5.375  
#>                  bulan6                   bulan7                   bulan8  
#>                 -14.625                    5.875                    0.250  
#>                  bulan9  
#>                   4.250

forward_model <- lm(formula = total ~ pelaku_kekerasan + bulan , data = suspect.reg)
summary(forward_model)

#> 
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.reg)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -35.292  -7.944  -2.042  10.434  36.208 
#> 
#> Coefficients:
#>                         Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)                5.431      6.361   0.854   0.3966    
#> pelaku_kekerasanHusband   40.111      5.194   7.723 1.45e-10 ***
#> pelaku_kekerasanOthers    42.056      5.194   8.098 3.34e-11 ***
#> pelaku_kekerasanParents    6.111      5.194   1.177   0.2440    
#> bulan2                     7.250      7.790   0.931   0.3558    
#> bulan3                    10.750      7.790   1.380   0.1727    
#> bulan4                     0.250      7.790   0.032   0.9745    
#> bulan5                    -5.375      7.790  -0.690   0.4929    
#> bulan6                   -14.625      7.790  -1.877   0.0653 .  
#> bulan7                     5.875      7.790   0.754   0.4537    
#> bulan8                     0.250      7.790   0.032   0.9745    
#> bulan9                     4.250      7.790   0.546   0.5874    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 15.58 on 60 degrees of freedom
#> Multiple R-squared:  0.6735, Adjusted R-squared:  0.6136 
#> F-statistic: 11.25 on 11 and 60 DF,  p-value: 5.938e-11

Backward Selection

step(object = lm.all, scope = list(lower = lm.none),direction = "backward")

#> Start:  AIC=374.75
#> total ~ pelaku_kekerasan + tahun. + bulan
#> 
#>                    Df Sum of Sq   RSS    AIC
#> <none>                           9140 374.75
#> - bulan             8    3642.8 12782 382.90
#> - tahun.            1    5425.3 14565 406.30
#> - pelaku_kekerasan  3   26400.2 35540 466.53

#> 
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + tahun. + bulan, data = suspect.reg)
#> 
#> Coefficients:
#>             (Intercept)  pelaku_kekerasanHusband   pelaku_kekerasanOthers  
#>               35048.833                   40.111                   42.056  
#> pelaku_kekerasanParents                   tahun.                   bulan2  
#>                   6.111                  -17.361                    7.250  
#>                  bulan3                   bulan4                   bulan5  
#>                  10.750                    0.250                   -5.375  
#>                  bulan6                   bulan7                   bulan8  
#>                 -14.625                    5.875                    0.250  
#>                  bulan9  
#>                   4.250

backward_model <- lm(formula = total ~ pelaku_kekerasan  + bulan, data = suspect.reg)
summary(backward_model)

#> 
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.reg)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -35.292  -7.944  -2.042  10.434  36.208 
#> 
#> Coefficients:
#>                         Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)                5.431      6.361   0.854   0.3966    
#> pelaku_kekerasanHusband   40.111      5.194   7.723 1.45e-10 ***
#> pelaku_kekerasanOthers    42.056      5.194   8.098 3.34e-11 ***
#> pelaku_kekerasanParents    6.111      5.194   1.177   0.2440    
#> bulan2                     7.250      7.790   0.931   0.3558    
#> bulan3                    10.750      7.790   1.380   0.1727    
#> bulan4                     0.250      7.790   0.032   0.9745    
#> bulan5                    -5.375      7.790  -0.690   0.4929    
#> bulan6                   -14.625      7.790  -1.877   0.0653 .  
#> bulan7                     5.875      7.790   0.754   0.4537    
#> bulan8                     0.250      7.790   0.032   0.9745    
#> bulan9                     4.250      7.790   0.546   0.5874    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 15.58 on 60 degrees of freedom
#> Multiple R-squared:  0.6735, Adjusted R-squared:  0.6136 
#> F-statistic: 11.25 on 11 and 60 DF,  p-value: 5.938e-11

From both selection I got the same value for both forward selection and backward selection. From this, I can use one of them for the next calculation.

Look for The High Influence

plot(backward_model, 4)

From the plot, there are some data that have high influence in number of observe 26, 28, and 30

suspect.fil <- suspect.reg %>% 
  slice(-c(26,28,30))

suspect.fill.model <- lm(total~.,data = suspect.fil)
step(suspect.fill.model, direction = "backward")

#> Start:  AIC=341.32
#> total ~ pelaku_kekerasan + tahun. + bulan
#> 
#>                    Df Sum of Sq   RSS    AIC
#> <none>                           6660 341.32
#> - bulan             8    4419.2 11080 360.43
#> - tahun.            1    3709.1 10369 369.86
#> - pelaku_kekerasan  3   26315.1 32975 445.69

#> 
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + tahun. + bulan, data = suspect.fil)
#> 
#> Coefficients:
#>             (Intercept)  pelaku_kekerasanHusband   pelaku_kekerasanOthers  
#>              29713.3654                  41.9087                  42.0556  
#> pelaku_kekerasanParents                   tahun.                   bulan2  
#>                  6.1111                 -14.7162                   3.4598  
#>                  bulan3                   bulan4                   bulan5  
#>                 11.1429                  -3.5402                  -9.1652  
#>                  bulan6                   bulan7                   bulan8  
#>                -18.4152                   2.0848                  -7.4691  
#>                  bulan9  
#>                  0.4598

backward_suspect_fil <- lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.fil)
summary(backward_suspect_fil)

#> 
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.fil)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -27.1505  -7.0949   0.7156   9.0569  29.4514 
#> 
#> Coefficients:
#>                         Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)               9.7447     5.7487   1.695  0.09551 .  
#> pelaku_kekerasanHusband  42.4537     4.7391   8.958 1.81e-12 ***
#> pelaku_kekerasanOthers   42.0556     4.4959   9.354 4.10e-13 ***
#> pelaku_kekerasanParents   6.1111     4.4959   1.359  0.17942    
#> bulan2                    2.3502     6.9935   0.336  0.73806    
#> bulan3                   11.1429     7.2095   1.546  0.12774    
#> bulan4                   -4.6498     6.9935  -0.665  0.50881    
#> bulan5                  -10.2748     6.9935  -1.469  0.14728    
#> bulan6                  -19.5248     6.9935  -2.792  0.00712 ** 
#> bulan7                    0.9752     6.9935   0.139  0.88959    
#> bulan8                   -9.5714     7.2095  -1.328  0.18960    
#> bulan9                   -0.6498     6.9935  -0.093  0.92630    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 13.49 on 57 degrees of freedom
#> Multiple R-squared:  0.7497, Adjusted R-squared:  0.7014 
#> F-statistic: 15.52 on 11 and 57 DF,  p-value: 2.092e-13

After removing the high influence data, I can get the higher R-squared (from 0.6735 become 0.7495) and Adj. R-squared (from 0.6136 become 0.7014) that more fit with the model

Checking Error

I want to check the error of the model using Mean Absolute Error (MAE)

predict_backward <- predict(object = backward_suspect_fil, newdata = suspect.fil)

MAE(y_pred = predict_backward, y_true = suspect.fil$total)

#> [1] 9.911615

Assumption Checking

To know the model meet BLUE (Best Linear Model Estimator), I use some models

Linearity

plot(backward_suspect_fil, 1)

Normality Residual

To check if the error distribute normally

hist(backward_suspect_fil$residuals)

From the historam I can see if the error distribute quite normally.

plot(backward_suspect_fil, 2)

Or also can use shapiro test. From shapiro test I know the error distribute normally if the p-value > 0.05

shapiro.test(backward_suspect_fil$residuals)

#> 
#>  Shapiro-Wilk normality test
#> 
#> data:  backward_suspect_fil$residuals
#> W = 0.98873, p-value = 0.7938

Because the p-value is 0.7938 > 0.05, we can conclude that the error distributed normally

No-Heteroscedasticity

I want to check if the residual variance from the model doesn’t make pattern or distributed randomly. To do that, I can use Breusch-Pagan Test

plot(backward_suspect_fil, 3)

Breusch-Pagan Test Breusch-Pagan Test concluded that if the p-value < 0.05, the residual variance of the model meet the condition of heteroscedasticity.

bptest(backward_suspect_fil)

#> 
#>  studentized Breusch-Pagan test
#> 
#> data:  backward_suspect_fil
#> BP = 17.566, df = 11, p-value = 0.09221

Because the p-value from the BP-test is 0.09221 > 0.05, then I can conclude that the variance residual of the model is homoscedasticity / no heteroscedasticity

Conclusion

summary(backward_suspect_fil)

#> 
#> Call:
#> lm(formula = total ~ pelaku_kekerasan + bulan, data = suspect.fil)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -27.1505  -7.0949   0.7156   9.0569  29.4514 
#> 
#> Coefficients:
#>                         Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)               9.7447     5.7487   1.695  0.09551 .  
#> pelaku_kekerasanHusband  42.4537     4.7391   8.958 1.81e-12 ***
#> pelaku_kekerasanOthers   42.0556     4.4959   9.354 4.10e-13 ***
#> pelaku_kekerasanParents   6.1111     4.4959   1.359  0.17942    
#> bulan2                    2.3502     6.9935   0.336  0.73806    
#> bulan3                   11.1429     7.2095   1.546  0.12774    
#> bulan4                   -4.6498     6.9935  -0.665  0.50881    
#> bulan5                  -10.2748     6.9935  -1.469  0.14728    
#> bulan6                  -19.5248     6.9935  -2.792  0.00712 ** 
#> bulan7                    0.9752     6.9935   0.139  0.88959    
#> bulan8                   -9.5714     7.2095  -1.328  0.18960    
#> bulan9                   -0.6498     6.9935  -0.093  0.92630    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 13.49 on 57 degrees of freedom
#> Multiple R-squared:  0.7497, Adjusted R-squared:  0.7014 
#> F-statistic: 15.52 on 11 and 57 DF,  p-value: 2.092e-13

The model made from the data 2018 and 2019 from January until September, so It can’t Predict other months (October, November, December).

The overall p-value from the summary concluded that the target and the predictor have good correlation. But from the R-squared value 0.7497 shows that I need to get more data to make more fit with the model.

From the checking test, linearity, no heteroscedasticity, and normality residuals, I can say that the model pass the assumption checking that used for linear regression method

Violence Against Women and Children in Jakarta

Toho Dustin

November 1, 2019

Understanding The Data

Library

Importing Data

KYD (Know Your Data)

Pre-processing

Plotting

Based on Perpetrators

Comparing by Year

Comparing Through All Month

Based on Types

Comparing by Year

Comparing Through All Month

Based on Education

Does Education Affect?

Let’s Predict

Feature Selection

Forward Selection

Backward Selection

Look for The High Influence

Checking Error

Assumption Checking

Linearity

Normality Residual

No-Heteroscedasticity

Conclusion