# Character vector holding the name of every column in `data`
var_names <- colnames(data)

# Table of per-variable summary statistics for `data`
datasummary_skim(data = data)
Unique Missing Pct. Mean SD Min Median Max Histogram
2000 census tract 172 0 58.9 30.5 1.0 68.0 99.1
total population 168 0 3123.8 1396.8 149.0 2947.0 7278.0
percent of the tract that identifies as black 74 0 67.6 34.7 2.7 85.0 100.0
percent of the tract that identifies as white 91 0 22.6 29.9 0.0 5.2 94.0
percent of the population that identifies as Hispanic 84 0 6.8 9.2 0.0 3.8 51.0
percent of the tract that identifies as Asian 65 0 2.5 4.3 0.0 1.0 40.0
percent of the tract that is foreign born 87 0 11.1 11.4 0.0 6.5 54.0
percent of the tract that lives below the poverty line 81 0 22.6 15.6 2.0 19.0 90.0
percent of the tract that is unemployed 83 0 12.7 11.5 0.0 11.0 83.0
percent of the tract without a high school degree 72 1 25.3 14.6 0.0 27.0 65.0
average income in 2010 dollars 171 1 96489.8 75164.5 23026.0 70815.5 463490.0
percent of the tract that owns their home 84 1 40.8 23.3 0.0 38.0 100.0
Log(total_pop) 168 0 7.9 0.6 5.0 8.0 10.1
Neighborhood cluster 38 2 19.8 11.5 1.0 20.0 39.0
Ward 8 0 4.8 2.2 1.0 5.0 8.0
violent crime rate (per 1,000) in 2000 72 0 18.4 18.2 0.5 16.0 145.0
violent crime rate (per 1,000) in 2001 75 0 18.7 19.1 0.2 16.0 158.0
percent of the tract receiving TANF 153 0 9.6 9.9 0.0 6.7 43.6
percent of the tract receiving foodstamps 166 4 0.2 0.2 0.0 0.1 1.4
log(avg_income) 171 1 11.3 0.6 10.0 11.2 13.0
crime if association with poverty is deterministic 81 0 18.7 5.3 11.7 17.5 41.5
simulated X 172 0 0.1 1.1 -2.6 0.2 2.7
simulated Y 172 0 0.1 1.5 -3.7 0.1 3.5
# Histogram of the 2001 violent crime rate using bins that are 1 unit wide
data %>%
  ggplot(aes(x = vcr2001)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "blue") +
  ggtitle("Histogram of Violent Crime Rate in 2001") +
  xlab("Violent Crime Rate (per 1,000)") +
  ylab("Frequency")

# Same histogram of the 2001 violent crime rate, redrawn with much narrower
# bins (0.2 units wide)
data %>%
  ggplot(aes(x = vcr2001)) +
  geom_histogram(binwidth = 0.2, colour = "black", fill = "blue") +
  ggtitle("Histogram of Violent Crime Rate in 2001") +
  xlab("Violent Crime Rate (per 1,000)") +
  ylab("Frequency")

# The narrower bin width now leaves space between the columns, which makes it easier to visually distinguish the frequency of each violent crime rate
# Density plot of the 2001 violent crime rate with its mean marked in red.
# The mean is passed to geom_vline() as a constant rather than through aes(),
# so a single line is drawn instead of one overlapping line per row of `data`.
ggplot(data, aes(x = vcr2001)) +
  geom_density() +
  geom_vline(
    xintercept = mean(data$vcr2001, na.rm = TRUE),
    colour = "red"
  ) +
  labs(
    title = "Density Plot of Violent Crime Rate in 2001",
    y = "Density",
    x = "Violent Crime Rate (per 1,000)",
    caption = "Notes: Washington D.C. census tracts, 2000"
  ) +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

# The red line here shows the mean of violent crime rates by census tract in 2001
# Numeric value of the mean violent crime rate in 2001; na.rm = TRUE keeps this
# call consistent with the other mean() calls in this script
mean(data$vcr2001, na.rm = TRUE)
## [1] 18.72093
# mean is 18.72
# Scatter plot of each tract's poverty rate (x) against its 2001 violent
# crime rate (y)
ggplot(data, aes(x = poverty_rate, y = vcr2001)) +
  geom_point(colour = "black") +
  labs(
    title = "Scatter Plot of Violent Crime Rate vs. % Poverty",
    x = "% Poverty",
    y = "Violent Crime Rate (per 1,000)",
    caption = "Notes: Washington D.C. census tracts, 2000"
  ) +
  theme_light()

# It looks like the relationship between these two variables is positive but probably not very strong; there are some outliers that may be skewing the picture, though
# Overall mean of the 2001 violent crime rate, ignoring missing values
mean_vcr <- mean(data$vcr2001, na.rm = TRUE)

# Scatter plot of poverty rate vs. violent crime rate, with a dashed blue line
# at the overall mean and a red vertical segment joining each point to that mean
ggplot(data, aes(x = poverty_rate, y = vcr2001)) +
  geom_point(colour = "black") +
  geom_hline(yintercept = mean_vcr, colour = "blue", linetype = "dashed") +
  geom_segment(
    aes(xend = poverty_rate, yend = mean_vcr),
    colour = "red",
    alpha = 0.5
  ) +
  labs(
    title = "Scatter plot of violent crime rate and % poverty",
    x = "% Poverty",
    y = "Violent Crime Rate (per 1,000)",
    caption = "Notes: Washington D.C. census tracts, 2000. Red lines represent distances contributing to TSS"
  ) +
  theme_minimal()

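# To get the SST (total sum of squares, labelled TSS in the plot caption), take each observed outcome, subtract the mean, square the difference, and then sum those squared differences across all tracts. This tells us the total variation in the outcome in our data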
# Scatter plot of poverty rate (x) vs. 2001 violent crime rate (y) with a
# fitted linear-regression line (red) and dashed blue reference lines at the
# mean of each variable.
# The means are passed as constants, so each reference line is drawn once
# instead of once per row, and use na.rm = TRUE like the other mean() calls
# in this script.
data %>%
  ggplot(aes(x = poverty_rate, y = vcr2001)) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", color = "red", fill = NA) +
  geom_vline(
    xintercept = mean(data$poverty_rate, na.rm = TRUE),
    colour = "blue",
    linetype = "dashed"
  ) +
  geom_hline(
    yintercept = mean(data$vcr2001, na.rm = TRUE),
    colour = "blue",
    linetype = "dashed"
  ) +
  ggtitle("Scatter plot of violent crime rate and % poverty") +
  labs(
    y = "VCR (per 1,000)",
    x = "% Poverty",
    caption = "Notes: Washington D.C. census tracts, 2000 "
  )
## `geom_smooth()` using formula = 'y ~ x'

# this validates my assumption that the relationship between the two is mildly positive. In other words, census tracts with higher poverty in DC circa 2001 did have slightly higher violent crime rates on average
# Scatter plot of homeownership rate (x) vs. 2001 violent crime rate (y) with a
# fitted linear-regression line (red) and dashed blue reference lines at the
# mean of each variable.
# The aesthetics now match the axis labels and title: homeownership is on the
# x-axis and the violent crime rate on the y-axis. Previously the two were
# swapped, so each axis carried the other variable's label.
data %>%
  ggplot(aes(x = homeownership_rate, y = vcr2001)) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", color = "red", fill = NA) +
  geom_vline(
    xintercept = mean(data$homeownership_rate, na.rm = TRUE),
    colour = "blue",
    linetype = "dashed"
  ) +
  geom_hline(
    yintercept = mean(data$vcr2001, na.rm = TRUE),
    colour = "blue",
    linetype = "dashed"
  ) +
  ggtitle("Scatter Plot of Violent Crime Rate vs. % Homeownership with Averages") +
  labs(
    y = "Violent Crime Rate (per 1,000)",
    x = "% Homeownership",
    caption = "Notes: Washington D.C. census tracts, 2000 "
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# This relationship is slightly negative. In other words, census tracts in DC circa 2001 with higher homeownership rates had, on average, lower violent crime rates
# Linear regression of the 2001 violent crime rate on the poverty rate
lm_model <- lm(formula = vcr2001 ~ poverty_rate, data = data)

# Print the residual summary, coefficient table, and fit statistics
summary(object = lm_model)
## 
## Call:
## lm(formula = vcr2001 ~ poverty_rate, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.906  -7.779  -2.512   2.462 145.821 
## 
## Coefficients:
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)   11.0627     2.4790   4.463 0.0000147 ***
## poverty_rate   0.3383     0.0902   3.750  0.000242 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.43 on 170 degrees of freedom
## Multiple R-squared:  0.07641,    Adjusted R-squared:  0.07098 
## F-statistic: 14.06 on 1 and 170 DF,  p-value: 0.0002419
# For every one-percentage-point increase in a census tract's poverty rate, the predicted violent crime rate goes up by about 0.34 (per 1,000 people), on average

# The intercept here means that a hypothetical census tract with no poverty would still have a violent crime rate of 11.06 per 1,000 people

# The R-squared is about 0.076, meaning that the model explains only about 7.6% of the variance in the observed outcomes (not a good thing)

# 95% confidence intervals for the intercept and the poverty_rate slope
confint(object = lm_model, level = 0.95)
##                  2.5 %     97.5 %
## (Intercept)  6.1690590 15.9563493
## poverty_rate 0.1602112  0.5163071
# We're 95% confident that the true intercept is between 6.17 and 15.96
# For every one-percentage-point increase in the poverty rate, we're 95% confident that the violent crime rate rises by somewhere between 0.16 and 0.52 per 1,000 people