library(DATA606)
library(ggplot2)
library(openintro)
library(tidyverse)
library(dplyr)
library("plotly")
library(fivethirtyeight)
My Research question is: To understand if there is a link between the hate crimes reported in the 10 days after the 2016 election and the different areas of data that are available in the dataset(variables).
Type of study: This is an observational study.
Data Source
For the Kaggle data link click here
The story that this data is based on can be found here
For easy access I have this .csv file in my github
Dependent Variable: The response variable is hate crime per 100k and it is numerical so quantitative.
Independent Variable: The independent variables are median_house_inc,share_unemp_seas,share_non_citizen,share_white_poverty,share_non_white,share_vote_trump.
Hypotheses:
Null Hypothesis: hate crimes do not depend on median household income.
Alternative Hypothesis: hate crimes depends on median household income.
This is a dataset from FiveThirtyEight hosted on their GitHub. There are 51 cases in this dataset corresponding to the different states .
Data is there in the fivethirtyeight package. It is loaded into a hate_crime dataframe.
The null valued hate_crimes_per_100k_splc are omitted from the dataset and the value for District of Columbia is also filtered out.
# load data
hate_crime1 <- hate_crimes %>% drop_na(hate_crimes_per_100k_splc) %>% filter(state != "District of Columbia")
head(hate_crime1)
## # A tibble: 6 x 13
## state state_abbrev median_house_inc share_unemp_seas share_pop_metro
## <chr> <chr> <int> <dbl> <dbl>
## 1 Alab~ AL 42278 0.06 0.64
## 2 Alas~ AK 67629 0.064 0.63
## 3 Ariz~ AZ 49254 0.063 0.9
## 4 Arka~ AR 44922 0.052 0.69
## 5 Cali~ CA 60487 0.059 0.97
## 6 Colo~ CO 60940 0.04 0.8
## # ... with 8 more variables: share_pop_hs <dbl>, share_non_citizen <dbl>,
## # share_white_poverty <dbl>, gini_index <dbl>, share_non_white <dbl>,
## # share_vote_trump <dbl>, hate_crimes_per_100k_splc <dbl>,
## # avg_hatecrimes_per_100k_fbi <dbl>
hate_crime <- hate_crime1 %>% select(state,state_abbrev,median_house_inc,share_unemp_seas,share_non_citizen,share_white_poverty,share_non_white,share_vote_trump,hate_crimes_per_100k_splc,avg_hatecrimes_per_100k_fbi)
summary(hate_crime)
## state state_abbrev median_house_inc share_unemp_seas
## Length:46 Length:46 Min. :35521 Min. :0.02900
## Class :character Class :character 1st Qu.:47414 1st Qu.:0.04325
## Mode :character Mode :character Median :54093 Median :0.05150
## Mean :54509 Mean :0.05052
## 3rd Qu.:60132 3rd Qu.:0.05775
## Max. :76165 Max. :0.07300
##
## share_non_citizen share_white_poverty share_non_white share_vote_trump
## Min. :0.01000 Min. :0.0500 Min. :0.0600 Min. :0.3300
## 1st Qu.:0.03000 1st Qu.:0.0800 1st Qu.:0.2025 1st Qu.:0.4200
## Median :0.04500 Median :0.0900 Median :0.2900 Median :0.4900
## Mean :0.05409 Mean :0.0937 Mean :0.3085 Mean :0.4933
## 3rd Qu.:0.08000 3rd Qu.:0.1075 3rd Qu.:0.4200 3rd Qu.:0.5700
## Max. :0.13000 Max. :0.1700 Max. :0.6200 Max. :0.6900
## NA's :2
## hate_crimes_per_100k_splc avg_hatecrimes_per_100k_fbi
## Min. :0.06745 Min. :0.412
## 1st Qu.:0.14219 1st Qu.:1.293
## Median :0.22576 Median :1.923
## Mean :0.27761 Mean :2.155
## 3rd Qu.:0.34681 3rd Qu.:3.065
## Max. :0.83285 Max. :4.802
##
hate_crime_long<-hate_crime %>%
pivot_longer(cols = 3:10, names_to = "variables") #dataset is converted to long format
hate_crime_long %>% ggplot(aes(x = value))+
geom_density()+
facet_wrap(variables~.,scales = 'free', ncol = 3)
## Warning: Removed 2 rows containing non-finite values (stat_density).
hate_crime_long %>% ggplot(aes(x = value))+
geom_boxplot()+
facet_wrap(variables~.,scales = 'free', ncol = 3)
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
w <- list(color = toRGB("white"), width = 2)
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
p <- plot_geo(hate_crime, locationmode = 'USA-states') %>%
add_trace(
z = ~avg_hatecrimes_per_100k_fbi, locations = ~state_abbrev,
color = ~avg_hatecrimes_per_100k_fbi, colors = 'Reds'
) %>%
colorbar(title = "Average hate crime") %>%
layout(
title = 'Statewise average hate crime by FBI',
geo = g
)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
p
p1 <- plot_geo(hate_crime, locationmode = 'USA-states') %>%
add_trace(
z = ~hate_crimes_per_100k_splc, locations = ~state_abbrev,
color = ~hate_crimes_per_100k_splc, colors = 'Reds'
) %>%
colorbar(title = "Hate crime as per SPLC") %>%
layout(
title = 'Statewise hate crime as per SPLC',
geo = g
)
p1
dependent<- unique(hate_crime_long$variables)[7]
independent<-unique(hate_crime_long$variables)[1:6]
lm_fn<-function(data,x,y){
lm.1<-lm(formula(paste(y,"~",x)), data)
residual<-residuals(lm.1)
i<-round(lm.1$coefficients[[1]],2)
s<-round(lm.1$coefficients[[2]],4)
a<-round(summary(lm.1)$r.squared,2)
p<-round(summary(lm.1)$coefficients[,4][[2]],3)
paste0("Y = ", i,"+",s,"x", "\nR^2 = ", a, "\nP-Value = ", p)
G1<-ggplot(data = data,mapping = aes_string(x, y))+ geom_point(pch = 21, color = "black", fill ="skyblue",alpha = 0.7,size =3 )+
geom_smooth(method = "lm")+ labs(subtitle = paste0("R^2 = ", a,"\nP-Value = ", p,"\nY = ", s,"x","+",i),title = "Linear Model")
print(G1)
G2<-ggplot(lm.1,aes(x = .fitted, y = .resid))+ geom_point(pch = 21, color = "black", fill ="skyblue",alpha = 0.7,size =3 )+ geom_segment(aes(x = .fitted, xend =.fitted, y = .resid, yend =0), linetype = 2, color = "red")+ geom_hline(yintercept = 0)+ labs(title = "Residual")
print(G2)
print(summary(lm.1))
}
lm_fn(hate_crime,independent[1],dependent)
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = formula(paste(y, "~", x)), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25922 -0.10473 -0.02883 0.07081 0.53087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.668e-02 1.553e-01 -0.172 0.8643
## median_house_inc 5.582e-06 2.810e-06 1.987 0.0532 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1722 on 44 degrees of freedom
## Multiple R-squared: 0.08232, Adjusted R-squared: 0.06147
## F-statistic: 3.947 on 1 and 44 DF, p-value: 0.0532
panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...){
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
rreal = cor(x, y)
txtreal <- format(c(rreal, 0.123456789), digits=digits)[1]
txt <- format(c(r, 0.123456789), digits=digits)[1]
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txtreal, cex = cex.cor * r)
}
hate_crime <- na.omit(hate_crime)
hate_crime %>% select(median_house_inc,share_unemp_seas,share_non_citizen,share_white_poverty,share_non_white,share_vote_trump) %>% pairs(lower.panel=panel.cor)
From the above analysis it is seen that the correlation in the case of median_house_inc is 0.07585 and the p value is 0.07039 which is greater than 0.05 so the ‘null hypothesis’ cannot be rejected. Therefore it is safe to conclude that hate crimes do not depend on median household income.
Also the strongest relationship is observed between hate_crimes_per_100k_splc and share_vote_trump having a correlation of 0.1608. This means that the number of hate crimes per 100k of people tend to increase when the share of people who voted for trump increases.The p value is 0.006977 which is less than 0.05 and the conditions for inference are also met.
http://www.briansarnacki.com/an-easy-way-to-map-data-in-r-with-plotly/
Openintro Statistics, Fourth Edition, David Diez