Lab2_stats2

# Character vector holding the variable (column) names of `data`
var_names <- names(data)

# Summary table for every variable in `data`: unique values, percent missing,
# mean, SD, min, median and max
datasummary_skim(data)

	Unique	Missing Pct.	Mean	SD	Min	Median	Max
2000 census tract	172	0	58.9	30.5	1.0	68.0	99.1
total population	168	0	3123.8	1396.8	149.0	2947.0	7278.0
percent of the tract that identifies as black	74	0	67.6	34.7	2.7	85.0	100.0
percent of the tract that identifies as white	91	0	22.6	29.9	0.0	5.2	94.0
percent of the population that identifies as Hispanic	84	0	6.8	9.2	0.0	3.8	51.0
percent of the tract that identifies as Asian	65	0	2.5	4.3	0.0	1.0	40.0
percent of the tract that is foreign born	87	0	11.1	11.4	0.0	6.5	54.0
percent of the tract that lives below the poverty line	81	0	22.6	15.6	2.0	19.0	90.0
percent of the tract that is unemployed	83	0	12.7	11.5	0.0	11.0	83.0
percent of the tract without a high school degree	72	1	25.3	14.6	0.0	27.0	65.0
average income in 2010 dollars	171	1	96489.8	75164.5	23026.0	70815.5	463490.0
percent of the tract that owns their home	84	1	40.8	23.3	0.0	38.0	100.0
Log(total_pop)	168	0	7.9	0.6	5.0	8.0	10.1
Neighborhood cluster	38	2	19.8	11.5	1.0	20.0	39.0
Ward	8	0	4.8	2.2	1.0	5.0	8.0
violent crime rate (per 1,000) in 2000	72	0	18.4	18.2	0.5	16.0	145.0
violent crime rate (per 1,000) in 2001	75	0	18.7	19.1	0.2	16.0	158.0
percent of the tract receiving TANF	153	0	9.6	9.9	0.0	6.7	43.6
percent of the tract receiving foodstamps	166	4	0.2	0.2	0.0	0.1	1.4
log(avg_income)	171	1	11.3	0.6	10.0	11.2	13.0
crime if association with poverty is deterministic	81	0	18.7	5.3	11.7	17.5	41.5
simulated X	172	0	0.1	1.1	-2.6	0.2	2.7
simulated Y	172	0	0.1	1.5	-3.7	0.1	3.5

# Histogram of the 2001 violent crime rate, using bins one unit wide
ggplot(data = data, mapping = aes(x = vcr2001)) +
  geom_histogram(binwidth = 1, fill = "blue", colour = "black") +
  ggtitle("Histogram of Violent Crime Rate in 2001") +
  xlab("Violent Crime Rate (per 1,000)") +
  ylab("Frequency")

# Same histogram of the 2001 violent crime rate, redrawn with much narrower
# bins (0.2 units wide)
ggplot(data = data, mapping = aes(x = vcr2001)) +
  geom_histogram(binwidth = 0.2, fill = "blue", colour = "black") +
  ggtitle("Histogram of Violent Crime Rate in 2001") +
  xlab("Violent Crime Rate (per 1,000)") +
  ylab("Frequency")

# The narrower bin width leaves space between the columns, which makes it easier to visually distinguish the frequency of each violent crime rate

# Density plot of the 2001 violent crime rate; the red vertical line marks its
# mean (missing values ignored).
# The mean is passed to geom_vline() as a constant instead of through aes():
# an aes() mapping is evaluated against every row, so it would draw one
# identical line per row of `data`, all stacked on top of each other.
ggplot(data, aes(x = vcr2001)) +
  geom_density() +
  geom_vline(xintercept = mean(data$vcr2001, na.rm = TRUE), colour = "red") +
  labs(
    title = "Density Plot of Violent Crime Rate in 2001",
    y = "Density",
    x = "Violent Crime Rate (per 1,000)",
    caption = "Notes: Washington D.C. census tracts, 2000"
  ) +
  # Enlarge the axis tick labels and axis titles
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

# The red line here shows the mean of violent crime rates by census tract in 2001
# Mean 2001 violent crime rate across tracts. na.rm = TRUE matches the other
# mean(vcr2001) calls in this script, so a missing value can never turn the
# result into NA.
mean(data$vcr2001, na.rm = TRUE)

## [1] 18.72093

# mean is 18.72

# Scatter plot of the tract poverty rate (x) against the 2001 violent crime
# rate (y)
ggplot(data, aes(x = poverty_rate, y = vcr2001)) +
  geom_point(colour = "black") +
  labs(
    title = "Scatter Plot of Violent Crime Rate vs. % Poverty",
    x = "% Poverty",
    y = "Violent Crime Rate (per 1,000)",
    caption = "Notes: Washington D.C. census tracts, 2000"
  ) +
  theme_light()

# It looks like the relationship between these two variables is slightly positive but probably not very strong; there are some outliers that may be skewing the data, though

# Overall mean of the 2001 violent crime rate (missing values ignored); used
# below as the reference level for the total-variation plot
mean_vcr <- mean(data[["vcr2001"]], na.rm = TRUE)

# Poverty rate vs. violent crime rate, showing how far each tract's crime rate
# lies from the overall mean
ggplot(data, aes(x = poverty_rate, y = vcr2001)) +
  geom_point(colour = "black") +
  # Dashed blue horizontal line at the overall mean
  geom_hline(yintercept = mean_vcr, linetype = "dashed", colour = "blue") +
  # Red vertical segment from each point to the mean line
  geom_segment(
    aes(xend = poverty_rate, yend = mean_vcr),
    colour = "red",
    alpha = 0.5
  ) +
  labs(
    title = "Scatter plot of violent crime rate and % poverty",
    x = "% Poverty",
    y = "Violent Crime Rate (per 1,000)",
    caption = "Notes: Washington D.C. census tracts, 2000. Red lines represent distances contributing to TSS"
  ) +
  theme_minimal()

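# To get the SST (total sum of squares, labelled TSS in the plot above), take each observed outcome, subtract the mean, square that difference, and then sum the squared differences over all observations. This will tell us the total variation we have in the population data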

# Scatter plot of the poverty rate (x) against the 2001 violent crime rate (y)
# with a fitted linear regression line (red) and dashed blue reference lines
# at the mean of each variable.
# The means are passed as constants rather than through aes(): an aes()
# mapping is evaluated against every row and would draw one identical line
# per row. na.rm = TRUE keeps the reference lines defined if a value is missing.
data %>%
  ggplot(aes(x = poverty_rate, y = vcr2001)) +
  geom_point(color = "black") +
  # fill = NA leaves the confidence band around the fitted line unfilled
  geom_smooth(method = "lm", color = "red", fill = NA) +
  geom_vline(
    xintercept = mean(data$poverty_rate, na.rm = TRUE),
    colour = "blue", linetype = "dashed"
  ) +
  geom_hline(
    yintercept = mean(data$vcr2001, na.rm = TRUE),
    colour = "blue", linetype = "dashed"
  ) +
  ggtitle("Scatter plot of violent crime rate and % poverty") +
  labs(
    y = "VCR (per 1,000)",
    x = "% Poverty",
    caption = "Notes: Washington D.C. census tracts, 2000 "
  )

## `geom_smooth()` using formula = 'y ~ x'

# this validates my assumption that the relationship between the two is mildly positive. In other words, census tracts with higher poverty in DC circa 2001 did have slightly higher violent crime rates on average

# Scatter plot of the homeownership rate (x) against the 2001 violent crime
# rate (y) with a fitted linear regression line (red) and dashed blue
# reference lines at the mean of each variable.
# The violent crime rate is placed on the y-axis so that the plotted data
# match the axis labels and the title, and so the crime rate is the outcome
# here just as it is in the other plots of this script.
data %>%
  ggplot(aes(x = homeownership_rate, y = vcr2001)) +
  geom_point(color = "black") +
  # fill = NA leaves the confidence band around the fitted line unfilled
  geom_smooth(method = "lm", color = "red", fill = NA) +
  # Means are passed as constants: inside aes() they would be drawn once per row
  geom_vline(
    xintercept = mean(data$homeownership_rate, na.rm = TRUE),
    colour = "blue", linetype = "dashed"
  ) +
  geom_hline(
    yintercept = mean(data$vcr2001, na.rm = TRUE),
    colour = "blue", linetype = "dashed"
  ) +
  ggtitle("Scatter Plot of Violent Crime Rate vs. % Homeownership with Averages") +
  labs(
    y = "Violent Crime Rate (per 1,000)",
    x = "% Homeownership",
    caption = "Notes: Washington D.C. census tracts, 2000 "
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# This relationship is slightly negative. In other words, census tracts in DC circa 2001 with higher homeownership rates had less violent crime (on average)

# Fit a simple linear regression of the 2001 violent crime rate on the poverty
# rate, then print the model summary (coefficients, residuals, R-squared)
lm_model <- lm(vcr2001 ~ poverty_rate, data = data)
summary(lm_model)

## 
## Call:
## lm(formula = vcr2001 ~ poverty_rate, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.906  -7.779  -2.512   2.462 145.821 
## 
## Coefficients:
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)   11.0627     2.4790   4.463 0.0000147 ***
## poverty_rate   0.3383     0.0902   3.750  0.000242 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.43 on 170 degrees of freedom
## Multiple R-squared:  0.07641,    Adjusted R-squared:  0.07098 
## F-statistic: 14.06 on 1 and 170 DF,  p-value: 0.0002419

# for every one-percentage-point increase in the poverty rate of a given census tract, the predicted violent crime rate goes up by about .34 (per 1,000 people)

# The intercept here means that a hypothetical census tract with no poverty would still have a predicted violent crime rate of 11.06 per 1,000 people

# The r-squared is about .08 (0.076), meaning that the model only explains about 7.6% of the variance of the observed outcomes (not a good thing)

# Confidence intervals for the intercept and slope of lm_model (the output
# below reports the 2.5% and 97.5% bounds, i.e. 95% intervals)
confint(lm_model)

##                  2.5 %     97.5 %
## (Intercept)  6.1690590 15.9563493
## poverty_rate 0.1602112  0.5163071

# we're 95% confident that the true intercept is between 6.17 and 15.96
# for every one-percentage-point increase in the poverty rate, we're 95% confident that the violent crime rate will rise by somewhere between .16 and .52 per 1,000 people

Lab2_stats2

Matthew Reyes

2026-02-03