# Load the hate-crimes dataset from a CSV file (path is relative to the
# current working directory).
data <- read.csv(file = "hatecrimes_data.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the four columns that are summarised, correlated and modelled
# further down.
variables <- select(
  data,
  hate_crimes_per_100k_splc,
  inequality,
  median_household_income,
  share_non_white
)
Summary Table
# Column-wise means, ignoring missing values. `na.rm = TRUE` is supplied
# inside an anonymous function because forwarding extra arguments through
# across()'s `...` is deprecated as of dplyr 1.1.0.
mean1 <- variables %>%
  summarise(across(everything(), function(x) mean(x, na.rm = TRUE)))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(everything(), mean, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
# Column-wise standard deviation, minimum and maximum, ignoring missing
# values. Each summary function is wrapped in an anonymous function so that
# `na.rm = TRUE` is not passed through across()'s deprecated `...` argument.
sd1 <- variables %>%
  summarise(across(everything(), function(x) sd(x, na.rm = TRUE)))
min1 <- variables %>%
  summarise(across(everything(), function(x) min(x, na.rm = TRUE)))
max1 <- variables %>%
  summarise(across(everything(), function(x) max(x, na.rm = TRUE)))
# Stack the four one-row summaries into a single table using rbind().
table <- rbind(mean1, sd1, min1, max1)
# Label each row with its summary measure; after the transpose below these
# labels become the column headers.
rownames(table) <- c("Mean", "Standard Deviation", "Minimum", "Maximum")
# Transpose so the variables are in the rows and the summary measures are in
# the columns, which is the usual layout for a summary table.
table <- t(table)
# Remove scientific notation from printed numbers.
options(scipen = 999)
# t() returns a matrix, so convert back to a data frame, then round every
# numeric column to 2 decimal places to make the table look nicer.
# across(where(...)) replaces the superseded mutate_if().
table <- table %>%
  as.data.frame() %>%
  mutate(across(where(is.numeric), function(x) round(x, digits = 2)))
#show the table using kableExtra
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Render the summary table with kableExtra: striped rows, not stretched to
# the full page width. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
table %>%
  kbl() %>%
  kable_classic_2("striped", full_width = FALSE)
| | Mean | Standard Deviation | Minimum | Maximum |
|---|---|---|---|---|
| hate_crimes_per_100k_splc | 0.30 | 0.25 | 0.07 | 1.52 |
| inequality | 0.45 | 0.02 | 0.42 | 0.53 |
| median_household_income | 55223.61 | 9208.48 | 35521.00 | 76165.00 |
| share_non_white | 0.32 | 0.16 | 0.06 | 0.81 |
Histograms/Plots
#Histogram 1
library(ggplot2)
# Histogram of hate crimes per 100k people, drawn with 10 bins.
ggplot(data = data, mapping = aes(x = hate_crimes_per_100k_splc)) +
  geom_histogram(bins = 10, fill = "cornflowerblue", color = "gray") +
  theme_minimal() +
  labs(title = "Hate Crimes per 100k People")
## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).
# Histogram of the inequality variable, drawn with 10 bins.
ggplot(data = data, mapping = aes(x = inequality)) +
  geom_histogram(bins = 10, fill = "tomato", color = "gray") +
  theme_minimal() +
  labs(title = "Inequality")
#plot 3
library(ggthemes)
# Scatter plot of hate crimes per 100k against inequality with a fitted
# linear-regression line. The x/y mapping is declared once in ggplot() and
# inherited by both layers; only the points are coloured by state, so the
# smoother is still fitted to all points together.
ggplot(data, aes(x = inequality, y = hate_crimes_per_100k_splc)) +
  geom_point(aes(color = state)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Hate Crimes vs Inequality",
    y = "Hate Crimes per 100k",
    x = "Inequality"
  ) +
  theme_minimal() +
  # Hide the colour legend; "none" replaces the deprecated `FALSE`/`F` value.
  guides(color = "none") +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
Make a correlation plot
library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of the selected variables, computed only from rows that
# have no missing value in any of them.
cor_data <- variables %>%
  cor(use = "complete.obs")
# Draw the correlation matrix as a plot.
corrplot(corr = cor_data)
# Tip: click on the gray button with the arrow to show the plot in a new
# window - you can resize the plot from there to make it more visible.
The regression model
# Fit a linear model of hate crimes per 100k on inequality, median household
# income and share of non-white population, then print its summary.
lm(
  formula = hate_crimes_per_100k_splc ~ inequality + median_household_income + share_non_white,
  data = variables
) %>%
  summary()
##
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ inequality + median_household_income +
## share_non_white, data = variables)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35476 -0.10791 -0.05605 0.07765 0.69690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.183440886 0.874954059 -3.638 0.00073 ***
## inequality 6.547087807 1.895658282 3.454 0.00125 **
## median_household_income 0.000011722 0.000003503 3.347 0.00171 **
## share_non_white -0.435826999 0.258514525 -1.686 0.09906 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2163 on 43 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.315, Adjusted R-squared: 0.2672
## F-statistic: 6.59 on 3 and 43 DF, p-value: 0.0009177