Lab 6

Murder Dataset

Author

Eli Kramer

rm(list = ls())
options(scipen=999)

library(socviz)
load("Murders96_viz.RData")
head(Murders96_viz)
    arrests countyid  density  popul perc1019 perc2029 percblack percmale
17        8     1001 67.21535  40061 15.89077 13.17491 20.975510 48.70073
34        6     1003 77.05643 123023 13.93886 11.63929 13.496660 48.83233
51        1     1005 29.91548  26475 15.06327 13.69972 46.190750 49.15203
68        0     1009 67.20457  43392 14.17542 12.99318  1.415007 48.97446
85        1     1011 17.89899  11188 14.98927 14.13121 72.756520 49.91956
102       2     1013 27.71148  21530 15.68509 11.25871 41.384110 46.81839
    rpcincmaint rpcpersinc rpcunemins year murders  murdrate arrestrate
17      192.038  11852.760     26.796 1996       7 1.7473350  1.9969550
34      139.084  13583.020     28.710 1996       6 0.4877137  0.4877137
51      405.768  10760.510     63.162 1996       1 0.3777148  0.3777148
68      184.382  11094.820     21.692 1996       2 0.4609145  0.0000000
85      485.518   8349.506     63.162 1996       0 0.0000000  0.8938148
102     357.918   9947.058     54.868 1996       2 0.9289364  0.9289364
    statefips countyfips execs    lpopul execrate statename statecode region
17          1          1     0 10.598160        0   Alabama        AL  South
34          1          3     0 11.720130        0   Alabama        AL  South
51          1          5     0 10.183960        0   Alabama        AL  South
68          1          9     0 10.678030        0   Alabama        AL  South
85          1         11     0  9.322598        0   Alabama        AL  South
102         1         13     0  9.977202        0   Alabama        AL  South
View(Murders96_viz)
library(caret)
set.seed(1)
index <- createDataPartition(Murders96_viz$rpcunemins, p=0.6, list=FALSE)
train <- Murders96_viz[index,]
test  <- Murders96_viz[-index,]
library(psych)
library(tidyverse)
library(flextable)
psych::describe(train, fast=TRUE)
            vars    n      mean        sd     min        max      range      se
arrests        1 1320      6.28     36.89    0.00    1105.00    1105.00    1.02
countyid       2 1320  33239.43  15373.36 1001.00   56045.00   55044.00  423.14
density        3 1320    298.51   2069.92    0.21   54058.77   54058.56   56.97
popul          4 1320 101556.61 335436.63  141.00 9127751.00 9127610.00 9232.59
perc1019       5 1320     15.08      1.89    7.60      30.48      22.89    0.05
perc2029       6 1320     11.97      3.04    5.62      34.75      29.14    0.08
percblack      7 1320      8.39     13.87    0.00      81.13      81.13    0.38
percmale       8 1320     49.35      1.70   44.51      61.64      17.14    0.05
rpcincmaint    9 1320    209.21    116.97   21.69    1281.10    1259.41    3.22
rpcpersinc    10 1320  12571.39   3024.58 5322.20   41094.22   35772.02   83.25
rpcunemins    11 1320     52.90     36.47    0.00     287.10     287.10    1.00
year          12 1320   1996.00      0.00 1996.00    1996.00       0.00    0.00
murders       13 1320      6.35     44.53    0.00    1403.00    1403.00    1.23
murdrate      14 1320      0.40      0.68    0.00       8.81       8.81    0.02
arrestrate    15 1320      0.46      0.86    0.00      13.99      13.99    0.02
statefips     16 1320     33.14     15.35    1.00      56.00      55.00    0.42
countyfips    17 1320    101.55    110.07    1.00     830.00     829.00    3.03
execs         18 1320      0.01      0.11    0.00       2.00       2.00    0.00
lpopul        19 1320     10.41      1.37    4.95      16.03      11.08    0.04
execrate      20 1320      0.00      0.04    0.00       1.18       1.18    0.00
statename     21 1320       NaN        NA     Inf       -Inf       -Inf      NA
statecode     22 1320       NaN        NA     Inf       -Inf       -Inf      NA
region        23 1320       NaN        NA     Inf       -Inf       -Inf      NA
data_long <- train %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(data_long, aes(x = Variable, y = Value)) +
  geom_boxplot() +
  coord_flip() + 
  facet_wrap(~ Variable, scales = "free", ncol=3) +
  theme_minimal() +
  theme(axis.text.x = element_blank()) + 
  labs(title = "Horizontal Boxplot for Each Numeric Variable")

library(DataExplorer)
plot_boxplot(train, by="arrests") 

plot_boxplot(train, by="murders") 

plot_histogram(train)

plot_scatterplot(test, by="murders")

plot_correlation(train, type="continuous")

model <- lm(arrests ~ murders + I(murders^2) + percmale  +  density + rpcunemins + 
                                   execs + popul + murdrate + arrestrate + percblack + rpcpersinc, data=train)

model %>% as_flextable

Estimate

Standard Error

t value

Pr(>|t|)

(Intercept)

3.521

4.704

0.749

0.4543

murders

0.783

0.015

51.738

0.0000

***

I(murders^2)

-0.000

0.000

-4.289

0.0000

***

percmale

-0.049

0.092

-0.535

0.5928

density

0.001

0.000

11.125

0.0000

***

rpcunemins

-0.002

0.004

-0.484

0.6287

execs

0.144

1.381

0.104

0.9172

popul

0.000

0.000

6.627

0.0000

***

murdrate

-1.785

0.282

-6.321

0.0000

***

arrestrate

3.059

0.220

13.923

0.0000

***

percblack

-0.014

0.013

-1.134

0.2572

rpcpersinc

-0.000

0.000

-1.724

0.0849

.

Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05

Residual standard error: 5.464 on 1308 degrees of freedom

Multiple R-squared: 0.9782, Adjusted R-squared: 0.9781

F-statistic: 5347 on 1308 and 11 DF, p-value: 0.0000

library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)

library(car)
vif(model)
     murders I(murders^2)     percmale      density   rpcunemins        execs 
   20.080117     9.631030     1.079395     1.778414     1.038106     1.074181 
       popul     murdrate   arrestrate    percblack   rpcpersinc 
    7.553560     1.646289     1.587183     1.350868     1.502970 
quantile(round(residuals(model, type = "deviance"),2))
    0%    25%    50%    75%   100% 
-69.70  -0.54  -0.02   0.31 113.50 
plot(model)

library(broom)
train2 <- augment(model, data=train)  

p <- ggplot(train2, mapping = aes(y=.fitted, x=murders))
p  + geom_point()

p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p  + geom_point()

pred <- predict(model, newdata = test, interval="predict") 
test_w_predict <- cbind(test, pred)

test_w_predict %>%
  head(10) %>%
    select(murders, arrests, fit, lwr,upr) %>% 
  as_flextable(show_coltype = FALSE) 

murders

arrests

fit

lwr

upr

1

1

0.7

-10.1

11.5

14

20

14.9

4.1

25.6

0

1

1.4

-9.3

12.2

3

1

0.2

-10.6

11.0

0

3

7.6

-3.2

18.4

0

1

0.9

-9.9

11.6

5

12

8.6

-2.1

19.4

5

3

4.7

-6.1

15.4

2

2

3.0

-7.8

13.7

0

1

1.1

-9.6

11.8

n: 10

library(broom)
train2 <- augment(model, data=train)

p <- ggplot(train2, mapping = aes(y=.fitted, x=murders))
p  + geom_point()

p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p  + geom_point()

pred <- predict(model, newdata = test, interval="predict") 
test_w_predict <- cbind(test, pred) 

test_w_predict %>%
  head(10) %>%
    select(arrests, murders, fit, lwr,upr) %>% 
  as_flextable(show_coltype = FALSE) 

arrests

murders

fit

lwr

upr

1

1

0.7

-10.1

11.5

20

14

14.9

4.1

25.6

1

0

1.4

-9.3

12.2

1

3

0.2

-10.6

11.0

3

0

7.6

-3.2

18.4

1

0

0.9

-9.9

11.6

12

5

8.6

-2.1

19.4

3

5

4.7

-6.1

15.4

2

2

3.0

-7.8

13.7

1

0

1.1

-9.6

11.8

n: 10

plot(test_w_predict$arrests, test_w_predict$murders, pch=16, cex=1)

library(Metrics)
metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$arrests, test_w_predict$murders),4),
                         round(rmse(test_w_predict$arrests, test_w_predict$murders),4),
                         round(mape(test_w_predict$arrests, test_w_predict$murders),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)

Metric

Value

MAE

3.5998

RMSE

26.4445

MAPE

pip <- lm(arrests ~ murders, data=train)
summary(pip)

Call:
lm(formula = arrests ~ murders, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-70.863  -1.103  -1.103  -0.103 123.789 

Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept) 1.103157   0.182736   6.037        0.00000000204 ***
murders     0.815210   0.004064 200.598 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.573 on 1318 degrees of freedom
Multiple R-squared:  0.9683,    Adjusted R-squared:  0.9683 
F-statistic: 4.024e+04 on 1 and 1318 DF,  p-value: < 0.00000000000000022
library(ggplot2)
p <- ggplot(data=train, mapping=aes(y=log(arrests), x=murders))

p +  geom_point(alpha = 0.2) + 
       geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS"))

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", fill="tomato", method = MASS::rlm) +
    geom_smooth(color = "steelblue", fill="steelblue", method = "lm")

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", method = "lm", linewidth = 1.2, 
                formula = y ~ splines::bs(x, 3), se = FALSE)

p + geom_point(alpha=0.1) +
    geom_quantile(color = "tomato", size = 1.2, method = "rqss",
                  lambda = 1, quantiles = c(0.20, 0.5, 0.85))

In relation to thre dependant variable arrests, the signiaficant variables were murders, density, population, arrest rate, and murder rate. In the coefficient plot murder rate was low value and arrest rate was high value. In the heat map, murder had high correlation with arrests and population. Arrests ranged from 0 to 1105, while murders ranged from 0 to 1403, which could mean that in some states the a person who got arrested murdered more than one victim. In most charts it shows the higher the murders, the higher the arrests.