# Setup options
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Uncomment and run the following line if packages aren't already installed
# install.packages(c("tidyverse", "tidycensus", "sf", "plotly", "forecast"))
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.5
## âś” forcats 1.0.0 âś” stringr 1.5.1
## âś” ggplot2 3.5.1 âś” tibble 3.2.1
## âś” lubridate 1.9.3 âś” tidyr 1.3.1
## âś” purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidycensus)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Set your Census API key (Only needs to be done once in the session)
census_api_key("f4f4801feced14583355d7a62646514912610e1d", install = TRUE, overwrite = TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY").
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "f4f4801feced14583355d7a62646514912610e1d"
# Define variables to retrieve
variables <- c(white = "B03002_003", black = "B03002_004", median_income = "B19013_001", total_population = "B01003_001")
# Retrieve data at the census tract level for Washtenaw County (FIPS code 26161) in 2021
data <- get_acs(
geography = "tract",
variables = variables,
state = "MI",
county = "Washtenaw",
year = 2021,
survey = "acs5",
output = "wide",
geometry = FALSE
) %>%
rename(tract = GEOID) # Rename GEOID to tract for easier reference
## Getting data from the 2017-2021 5-year ACS
# Check the first few rows of the data to ensure it's correct
head(data)
## # A tibble: 6 Ă— 10
## tract NAME whiteE whiteM blackE blackM median_incomeE median_incomeM
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 26161400100 Census … 1346 369 90 73 24705 21732
## 2 26161400300 Census … 3741 915 320 119 31213 20828
## 3 26161400400 Census … 2612 327 86 102 121452 16140
## 4 26161400500 Census … 5823 982 239 124 29347 4802
## 5 26161400600 Census … 3950 469 1 5 85459 17904
## 6 26161400700 Census … 2570 801 588 388 62743 39215
## # ℹ 2 more variables: total_populationE <dbl>, total_populationM <dbl>
# Step 3: Descriptive Statistics
summary_stats <- data %>%
summarise(
mean_white = mean(whiteE, na.rm = TRUE),
median_white = median(whiteE, na.rm = TRUE),
min_white = min(whiteE, na.rm = TRUE),
max_white = max(whiteE, na.rm = TRUE),
mean_black = mean(blackE, na.rm = TRUE),
median_black = median(blackE, na.rm = TRUE),
min_black = min(blackE, na.rm = TRUE),
max_black = max(blackE, na.rm = TRUE),
mean_income = mean(median_incomeE, na.rm = TRUE),
median_income = median(median_incomeE, na.rm = TRUE),
min_income = min(median_incomeE, na.rm = TRUE),
max_income = max(median_incomeE, na.rm = TRUE)
)
print(summary_stats)
## # A tibble: 1 Ă— 12
## mean_white median_white min_white max_white mean_black median_black min_black
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2409. 2307 0 6650 400. 194 0
## # ℹ 5 more variables: max_black <dbl>, mean_income <dbl>, median_income <dbl>,
## # min_income <dbl>, max_income <dbl>
# Scatter plot - Percentage of White Population vs Median Income
ggplot(data, aes(x = whiteE / total_populationE * 100, y = median_incomeE)) +
geom_point(color = "blue") +
labs(title = "Percentage of White Population vs Median Household Income",
x = "Percentage of White Population",
y = "Median Household Income")
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Scatter plot - Percentage of Black Population vs Median Income
ggplot(data, aes(x = blackE / total_populationE * 100, y = median_incomeE)) +
geom_point(color = "red") +
labs(title = "Percentage of Black Population vs Median Household Income",
x = "Percentage of Black Population",
y = "Median Household Income")
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Histogram of Median Household Income
ggplot(data, aes(x = median_incomeE)) +
geom_histogram(binwidth = 5000, fill = "skyblue", color = "black") +
labs(title = "Distribution of Median Household Income", x = "Median Household Income", y = "Frequency")
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Boxplot for White and Black Population
data_long <- data %>%
pivot_longer(cols = c(whiteE, blackE), names_to = "Race", values_to = "Population")
ggplot(data_long, aes(x = Race, y = Population)) +
geom_boxplot(fill ="orange") +
labs(title = "Boxplot of White and Black Population in Washtenaw County")

# Probability Density Function (PDF)
ggplot(data, aes(x = median_incomeE)) +
geom_density(aes(color = "Income Density")) +
geom_density(aes(x = whiteE, color = "White Population Density")) +
labs(title = "PDF of Median Income and White Population",
x = "Median Income",
y = "Density")
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_density()`).

# Cumulative Density Function (CDF) for Median Income and Black Population Percentage
data_income <- data %>%
arrange(median_incomeE) %>%
mutate(cdf_income = cumsum(rep(1 / n(), n()))) %>%
select(median_incomeE, cdf_income)
data_black <- data %>%
mutate(black_pct = blackE / total_populationE * 100) %>%
arrange(black_pct) %>%
mutate(cdf_black = cumsum(rep(1 / n(), n()))) %>%
select(black_pct, cdf_black)
# Plot CDF
ggplot() +
geom_line(data = data_income, aes(x = median_incomeE, y = cdf_income, color = "Median Income"), size = 1) +
geom_line(data = data_black, aes(x = black_pct, y = cdf_black, color = "Black Population Percentage"), size = 1) +
labs(
title = "Cumulative Density Function (CDF) for Median Income and Black Population Percentage",
x = "Value",
y = "Cumulative Probability"
) +
scale_color_manual(values = c("Median Income" = "blue", "Black Population Percentage" = "red")) +
theme_minimal() +
theme(legend.title = element_blank())
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Predict population growth from 2021 to 2030
pop_data <- data %>%
summarise(total_population = sum(total_populationE)) %>%
mutate(year = 2021)
# Ensure 'year' is numeric
pop_data$year <- as.numeric(pop_data$year)
# Create a data frame for future years (2025 to 2030)
future_years <- data.frame(year = 2025:2030)
# Fit a linear model for population growth
fit <- lm(total_population ~ year, data = pop_data)
# Predict population for future years
predictions <- predict(fit, newdata = future_years)
## Warning in predict.lm(fit, newdata = future_years): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Display predictions
print(predictions)
## 1 2 3 4 5 6
## 372428 372428 372428 372428 372428 372428
## attr(,"non-estim")
## 2 3 4 5 6
## 2 3 4 5 6
# Step 7: Correlation and Regression Analysis
data <- data %>%
mutate(white_pct = whiteE / total_populationE * 100, black_pct = blackE / total_populationE * 100)
# Correlation with White Population Percentage
correlation_white <- cor(data$white_pct, data$median_incomeE, use = "complete.obs")
print(paste("Correlation between White Population Percentage and Median Income:", correlation_white))
## [1] "Correlation between White Population Percentage and Median Income: 0.4287309140617"
# OLS Regression with White and Black Population Percentages
regression_model <- lm(median_incomeE ~ white_pct + black_pct, data = data)
summary(regression_model)
##
## Call:
## lm(formula = median_incomeE ~ white_pct + black_pct, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78118 -19192 -6038 16843 105910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53703.8 26574.9 2.021 0.0461 *
## white_pct 565.9 324.6 1.743 0.0844 .
## black_pct -636.3 406.3 -1.566 0.1205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36630 on 97 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.2039, Adjusted R-squared: 0.1875
## F-statistic: 12.43 on 2 and 97 DF, p-value: 1.57e-05
# Scatter plot with regression line for white_pct vs median_incomeE
ggplot(data, aes(x = white_pct, y = median_incomeE)) +
geom_point(alpha = 0.6, color = "blue") +
geom_smooth(method = "lm", formula = y ~ x, color = "red", se = FALSE) +
labs(
title = paste("Scatter Plot and Regression Line: White Population % and Median Income\nCorrelation:", round(correlation_white, 2)),
x = "White Population Percentage",
y = "Median Household Income"
) +
theme_minimal()
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Create an interactive scatter plot with Plotly
plot <- ggplot(data, aes(x = white_pct, y = median_incomeE, text = tract)) +
geom_point(color ="Blue") +
labs(
title = "Interactive Scatter Plot: White Population % vs Median Income",
x = "White Population (%)",
y = "Median Household Income"
)
# Convert ggplot to an interactive plot using Plotly
interactive_plot <- ggplotly(plot)
# Display the interactive plot
interactive_plot
#This analysis examines the link between demographic composition and median household income in Washtenaw County, identifying some significant trends and limitations. Although there is a moderate correlation between White and Black population percentages and income, the low R-squared values indicate that race alone does not adequately account for income variation across census tracts. Other socioeconomic factors—such as education, employment, industry distribution, and housing affordability—likely play substantial roles in determining income levels.
#The regression analysis found a slight positive association between the percentage of White population and income, while the Black population percentage showed a weak, statistically insignificant negative association. This suggests that racial demographics alone are not strong predictors of income in this area.
#The forecast of population growth from 2021 to 2030 points to a moderate increase, which will impact local infrastructure, housing, education, and community services. As the county's demographics evolve, proactive planning and policy adjustments will be essential to meet community needs and support equitable development across all demographic groups.