library(DATA606)
library(ggplot2)
library(openintro)
library(tidyverse)
library(dplyr)
library("plotly")
library(fivethirtyeight)

Part 1 - Introduction

My Research question is: To understand if there is a link between the hate crimes reported in the 10 days after the 2016 election and the different areas of data that are available in the dataset(variables).

Type of study: This is an observational study.

Data Source

For the Kaggle data link click here

The story that this data is based on can be found here

For easy access I have this .csv file in my github

Dependent Variable: The response variable is hate crime per 100k and it is numerical so quantitative.

Independent Variable: The independent variables are median_house_inc,share_unemp_seas,share_non_citizen,share_white_poverty,share_non_white,share_vote_trump.

Hypotheses:

Null Hypothesis: hate crimes do not depend on median household income.

Alternative Hypothesis: hate crimes depends on median household income.

Part 2 - Data

This is a dataset from FiveThirtyEight hosted on their GitHub. There are 51 cases in this dataset corresponding to the different states .

Data is there in the fivethirtyeight package. It is loaded into a hate_crime dataframe.

The null valued hate_crimes_per_100k_splc are omitted from the dataset and the value for District of Columbia is also filtered out.

# load data
hate_crime1 <- hate_crimes %>% drop_na(hate_crimes_per_100k_splc) %>% filter(state != "District of Columbia")
head(hate_crime1)
## # A tibble: 6 x 13
##   state state_abbrev median_house_inc share_unemp_seas share_pop_metro
##   <chr> <chr>                   <int>            <dbl>           <dbl>
## 1 Alab~ AL                      42278            0.06             0.64
## 2 Alas~ AK                      67629            0.064            0.63
## 3 Ariz~ AZ                      49254            0.063            0.9 
## 4 Arka~ AR                      44922            0.052            0.69
## 5 Cali~ CA                      60487            0.059            0.97
## 6 Colo~ CO                      60940            0.04             0.8 
## # ... with 8 more variables: share_pop_hs <dbl>, share_non_citizen <dbl>,
## #   share_white_poverty <dbl>, gini_index <dbl>, share_non_white <dbl>,
## #   share_vote_trump <dbl>, hate_crimes_per_100k_splc <dbl>,
## #   avg_hatecrimes_per_100k_fbi <dbl>
hate_crime <- hate_crime1 %>% select(state,state_abbrev,median_house_inc,share_unemp_seas,share_non_citizen,share_white_poverty,share_non_white,share_vote_trump,hate_crimes_per_100k_splc,avg_hatecrimes_per_100k_fbi)

Part 3 - Exploratory data analysis

Summary

summary(hate_crime)
##     state           state_abbrev       median_house_inc share_unemp_seas 
##  Length:46          Length:46          Min.   :35521    Min.   :0.02900  
##  Class :character   Class :character   1st Qu.:47414    1st Qu.:0.04325  
##  Mode  :character   Mode  :character   Median :54093    Median :0.05150  
##                                        Mean   :54509    Mean   :0.05052  
##                                        3rd Qu.:60132    3rd Qu.:0.05775  
##                                        Max.   :76165    Max.   :0.07300  
##                                                                          
##  share_non_citizen share_white_poverty share_non_white  share_vote_trump
##  Min.   :0.01000   Min.   :0.0500      Min.   :0.0600   Min.   :0.3300  
##  1st Qu.:0.03000   1st Qu.:0.0800      1st Qu.:0.2025   1st Qu.:0.4200  
##  Median :0.04500   Median :0.0900      Median :0.2900   Median :0.4900  
##  Mean   :0.05409   Mean   :0.0937      Mean   :0.3085   Mean   :0.4933  
##  3rd Qu.:0.08000   3rd Qu.:0.1075      3rd Qu.:0.4200   3rd Qu.:0.5700  
##  Max.   :0.13000   Max.   :0.1700      Max.   :0.6200   Max.   :0.6900  
##  NA's   :2                                                              
##  hate_crimes_per_100k_splc avg_hatecrimes_per_100k_fbi
##  Min.   :0.06745           Min.   :0.412              
##  1st Qu.:0.14219           1st Qu.:1.293              
##  Median :0.22576           Median :1.923              
##  Mean   :0.27761           Mean   :2.155              
##  3rd Qu.:0.34681           3rd Qu.:3.065              
##  Max.   :0.83285           Max.   :4.802              
## 

Density Plot

hate_crime_long<-hate_crime %>% 
  pivot_longer(cols = 3:10, names_to = "variables")  #dataset is converted to long format

hate_crime_long %>% ggplot(aes(x = value))+
  geom_density()+
  facet_wrap(variables~.,scales = 'free', ncol = 3)
## Warning: Removed 2 rows containing non-finite values (stat_density).

Boxplot

hate_crime_long %>% ggplot(aes(x = value))+
  geom_boxplot()+
  facet_wrap(variables~.,scales = 'free', ncol = 3)
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

Map - Statewise average hate crime by FBI

w <- list(color = toRGB("white"), width = 2)

g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)


p <- plot_geo(hate_crime, locationmode = 'USA-states') %>%
  add_trace(
    z = ~avg_hatecrimes_per_100k_fbi, locations = ~state_abbrev,
    color = ~avg_hatecrimes_per_100k_fbi, colors = 'Reds'
  ) %>%
  colorbar(title = "Average hate crime") %>%
  layout(
    title = 'Statewise average hate crime by FBI',
    geo = g
  )
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
p

Map - Hate crime as per SPLC

p1 <- plot_geo(hate_crime, locationmode = 'USA-states') %>%
  add_trace(
    z = ~hate_crimes_per_100k_splc, locations = ~state_abbrev,
    color = ~hate_crimes_per_100k_splc, colors = 'Reds'
  ) %>%
  colorbar(title = "Hate crime as per SPLC") %>%
  layout(
    title = 'Statewise hate crime as per SPLC',
    geo = g
  )
p1  
dependent<- unique(hate_crime_long$variables)[7]

independent<-unique(hate_crime_long$variables)[1:6]

Part 4 - Inference

lm_fn<-function(data,x,y){
  lm.1<-lm(formula(paste(y,"~",x)), data)
  
  residual<-residuals(lm.1)
  i<-round(lm.1$coefficients[[1]],2)
  s<-round(lm.1$coefficients[[2]],4)
  a<-round(summary(lm.1)$r.squared,2)
  p<-round(summary(lm.1)$coefficients[,4][[2]],3)
  
  paste0("Y = ", i,"+",s,"x", "\nR^2 = ", a, "\nP-Value = ", p) 
  
  G1<-ggplot(data = data,mapping = aes_string(x, y))+ geom_point(pch = 21, color = "black", fill ="skyblue",alpha = 0.7,size =3 )+
    geom_smooth(method = "lm")+ labs(subtitle = paste0("R^2 = ", a,"\nP-Value = ", p,"\nY = ", s,"x","+",i),title = "Linear Model")
  
  print(G1)
  
  G2<-ggplot(lm.1,aes(x = .fitted, y = .resid))+ geom_point(pch = 21, color = "black", fill ="skyblue",alpha = 0.7,size =3 )+ geom_segment(aes(x = .fitted, xend =.fitted, y = .resid, yend =0), linetype = 2, color = "red")+ geom_hline(yintercept = 0)+ labs(title = "Residual")
  
  print(G2)
  
  print(summary(lm.1))
}

Linear Regression - median_house_inc

lm_fn(hate_crime,independent[1],dependent)
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(y, "~", x)), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25922 -0.10473 -0.02883  0.07081  0.53087 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      -2.668e-02  1.553e-01  -0.172   0.8643  
## median_house_inc  5.582e-06  2.810e-06   1.987   0.0532 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1722 on 44 degrees of freedom
## Multiple R-squared:  0.08232,    Adjusted R-squared:  0.06147 
## F-statistic: 3.947 on 1 and 44 DF,  p-value: 0.0532

Linear Regression - share_unemp_seas

lm_fn(hate_crime,independent[2],dependent)
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(y, "~", x)), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.20504 -0.13690 -0.04921  0.07256  0.58248 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        0.3975     0.1387   2.865  0.00637 **
## share_unemp_seas  -2.3729     2.6966  -0.880  0.38367   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1782 on 44 degrees of freedom
## Multiple R-squared:  0.01729,    Adjusted R-squared:  -0.005041 
## F-statistic: 0.7743 on 1 and 44 DF,  p-value: 0.3837

Linear Regression - share_non_citizen

lm_fn(hate_crime,independent[3],dependent)
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(y, "~", x)), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.21881 -0.12712 -0.05224  0.06825  0.55177 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.25302    0.05362   4.719 2.64e-05 ***
## share_non_citizen  0.40084    0.86619   0.463    0.646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1729 on 42 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.005073,   Adjusted R-squared:  -0.01862 
## F-statistic: 0.2141 on 1 and 42 DF,  p-value: 0.6459

Linear Regression - share_white_poverty

lm_fn(hate_crime,independent[4],dependent)
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(y, "~", x)), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.21628 -0.14116 -0.05286  0.07532  0.55976 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           0.3447     0.1052   3.277  0.00205 **
## share_white_poverty  -0.7163     1.0870  -0.659  0.51337   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1789 on 44 degrees of freedom
## Multiple R-squared:  0.009772,   Adjusted R-squared:  -0.01273 
## F-statistic: 0.4342 on 1 and 44 DF,  p-value: 0.5134

Linear Regression - share_non_white

lm_fn(hate_crime,independent[5],dependent)
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(y, "~", x)), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22449 -0.12993 -0.03772  0.10778  0.53929 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.37907    0.06078   6.236 1.52e-07 ***
## share_non_white -0.32890    0.17881  -1.839   0.0726 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1732 on 44 degrees of freedom
## Multiple R-squared:  0.07141,    Adjusted R-squared:  0.0503 
## F-statistic: 3.383 on 1 and 44 DF,  p-value: 0.07261

Linear Regression - share_vote_trump

lm_fn(hate_crime,independent[6],dependent)
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = formula(paste(y, "~", x)), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25833 -0.10270 -0.03169  0.04589  0.48816 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.6750     0.1325   5.093 7.08e-06 ***
## share_vote_trump  -0.8057     0.2642  -3.049  0.00387 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1633 on 44 degrees of freedom
## Multiple R-squared:  0.1745, Adjusted R-squared:  0.1557 
## F-statistic: 9.299 on 1 and 44 DF,  p-value: 0.003873

Multiple Regression

panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...){
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    rreal = cor(x, y)
    txtreal <- format(c(rreal, 0.123456789), digits=digits)[1]
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txtreal, cex = cex.cor * r)
}
hate_crime <- na.omit(hate_crime)
hate_crime %>% select(median_house_inc,share_unemp_seas,share_non_citizen,share_white_poverty,share_non_white,share_vote_trump) %>% pairs(lower.panel=panel.cor)

Part 5 - Conclusion

From the above analysis it is seen that the correlation in the case of median_house_inc is 0.07585 and the p value is 0.07039 which is greater than 0.05 so the ‘null hypothesis’ cannot be rejected. Therefore it is safe to conclude that hate crimes do not depend on median household income.

Also the strongest relationship is observed between hate_crimes_per_100k_splc and share_vote_trump having a correlation of 0.1608. This means that the number of hate crimes per 100k of people tend to increase when the share of people who voted for trump increases.The p value is 0.006977 which is less than 0.05 and the conditions for inference are also met.

References

http://www.briansarnacki.com/an-easy-way-to-map-data-in-r-with-plotly/

Openintro Statistics, Fourth Edition, David Diez