library(tidycensus)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# 2. List your research questions and ensure the following analysis will help you answer the questions

# What is the relationship between household income and poverty levels across different census tracts in Harris County?

# How does population density correlate with household income and poverty levels across census tracts?

# Is there a significant association between educational attainment and poverty levels across census tracts in Harris County?

# Can we predict variations in poverty levels based on household income, population density, and educational attainment within the county?

# Are there specific census tracts that demonstrate a unique combination of high educational attainment, low income, and high poverty levels, suggesting areas of potential disparity?


census_api_key("80cd72e3cddeaf00e930b56289358c534d1ecfe6", install="TRUE", overwrite=TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). 
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "80cd72e3cddeaf00e930b56289358c534d1ecfe6"
# 3. Use Census API to get the census tract-level data with at least 4 variables 

harris_data <- get_acs(
  geography = "tract",
  variables = c(
    below_poverty = "B17001B_001E", 
    black_pop = "B02001_003E", 
    household_income = "B19013B_001E",
    education_attainment = "C15002B_001E",
    employment_status = "C23002B_001E",
    unemployment_status = "C23002B_003E",
    home_ownership = "B25003B_002E",
    rental_housing = "B25003B_003E"
  ),
  state = "TX", 
  county = "201",  # FIPS code for Harris (Houston) County
  year = 2020,
  output="wide"
)
## Getting data from the 2016-2020 5-year ACS
# 4. Calculate mean, median, min, and max values for relevant Census variables 

summary_stats_poverty <- harris_data %>%
  summarize(
    mean = mean(below_poverty, na.rm = TRUE),
    median = median(below_poverty, na.rm = TRUE),
    min = min(below_poverty, na.rm = TRUE),
    max = max(below_poverty, na.rm = TRUE)
  )

summary_stats_employment <- harris_data %>%
  summarize(
    mean = mean(employment_status, na.rm = TRUE),
    median = median(employment_status, na.rm = TRUE),
    min = min(employment_status, na.rm = TRUE),
    max = max(employment_status, na.rm = TRUE)
  )

summary_stats_unemployment <- harris_data %>%
  summarize(
    mean = mean(unemployment_status, na.rm = TRUE),
    median = median(unemployment_status, na.rm = TRUE),
    min = min(unemployment_status, na.rm = TRUE),
    max = max(unemployment_status, na.rm = TRUE)
  )

summary_stats_income <- harris_data %>%
  summarize(
    mean = mean(household_income, na.rm = TRUE),
    median = median(household_income, na.rm = TRUE),
    min = min(household_income, na.rm = TRUE),
    max = max(household_income, na.rm = TRUE)
  )

# 5. Make at least three types of figures (scatter plot, histogram plot, boxplot, bar plot, etc.) and summarize your findings

# HISTOGRAM

ggplot(harris_data, aes(x = black_pop)) +
  geom_histogram(binwidth = 1000, fill = "skyblue", color = "black") +
  labs(title = "Population Density Distribution of AA Across Harris County",
       x = "African American Population Density", y = "Frequency") +
  theme_minimal()

# DENSITY PLOT

ggplot(harris_data, aes(y = education_attainment)) +
  geom_density(fill = "navyblue", color = "black") +
  labs(title = "Distribution of Education Attainment in Harris County",
       x = "Education Attainment",
       y = "Frequency") +
  theme_minimal()

# SCATTERPLOT
ggplot(harris_data, aes(x = black_pop, y = below_poverty)) +
  geom_point(color = "darkblue") +
  geom_smooth(method = "lm", color = "orange") +
  labs(title = "Black Population vs. Poverty Levels",
       x = "Black Population",
       y = "Poverty Levels") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(harris_data, aes(x = education_attainment, y = below_poverty)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "purple") +
  labs(title = "Education Attainment vs. Poverty Levels",
       x = "Education Attainment",
       y = "Poverty Levels") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# BOXPLOT
ggplot(harris_data, aes(x = household_income, y = employment_status)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Poverty Levels by Household Income",
       x = "Household Income",
       y = "Employment Status")
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: Removed 439 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).

# The African American population in Harris County is steady growing. There is implications of economic challenges within the community. 


# 6. Make at least a PDF (probability density function) chart or CDF (cumulative density function) chart, which should include at least two curves to show the differences 

# CDF PLOT
ggplot(harris_data, aes(x = household_income, color = "Household Income")) +
  stat_ecdf(geom = "step",) +
  stat_ecdf(aes(x = education_attainment * 1000, color = "Education Attainment"), size = 1) + # 
  labs(title = "CDF of Household Income and Education Attainment",
       x = "Value",
       y = "Cumulative Probability") +
  scale_color_manual(name = "Legend", values = c("Household Income" = "lightblue", "Education Attainment" = "navyblue")) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 439 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

# PDF PLOT
ggplot(harris_data, aes(x = household_income, fill = "Household Income")) +
  geom_density(alpha = 1) +
  geom_density(aes(x = education_attainment * 1000, fill = "Education Attainment"), alpha = 0.5) +  
  labs(title = "PDF of Household Income and Education Attainment",
       x = "Value",
       y = "Density") +
  scale_fill_manual(name = "Legend", 
  values = c("Household Income" = "blue", "Education Attainment" = "lightblue")) +
  theme_minimal()
## Warning: Removed 439 rows containing non-finite outside the scale range
## (`stat_density()`).

# 7. Make a prediction of population/GDP/other of your study area for the next five years (2025-2030) 

harris_county <- get_acs(
  geography = "tract",
  variables = c(
    pop_density = "B02001_003E"
  ),
  state = "TX", 
  county = "201",  
  year = 2020,
  output="wide"
)
## Getting data from the 2016-2020 5-year ACS
names(harris_county)[names(harris_county) == "pop_densityE"] <- "black_pop"

harris_county_population <- data.frame(
  year = c(2020, 2021, 2022, 2023, 2024, 2025),
  population = c(4296000, 4325000, 4360000, 4400000, 4443000, 4495000)   
)

# Growth rate from 2020 to 2025
growth_rate <- (harris_county_population$population[6] / harris_county_population$population[1])^(1/5) - 1

# Predict for 2026-2030
future_years <- 2026:2030
future_population <- harris_county_population$population[6] * (1 + growth_rate)^(future_years - 2025)

harris_county_population <- rbind(
  harris_county_population,
  data.frame(year = future_years, population = future_population)
)

print(harris_county_population)
##    year population
## 1  2020    4296000
## 2  2021    4325000
## 3  2022    4360000
## 4  2023    4400000
## 5  2024    4443000
## 6  2025    4495000
## 7  2026    4535893
## 8  2027    4577158
## 9  2028    4618798
## 10 2029    4660817
## 11 2030    4703218
# 8. Fit an OLS regression model to predict educational attainment based on household income

ols_model <- lm(below_poverty ~ household_income + education_attainment + black_pop, data = harris_data)
summary(ols_model)
## 
## Call:
## lm(formula = below_poverty ~ household_income + education_attainment + 
##     black_pop, data = harris_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -936.48    1.63    5.81   12.31   60.79 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.054e+00  4.600e+00  -0.229    0.819    
## household_income      2.507e-05  5.140e-05   0.488    0.626    
## education_attainment -1.338e-03  1.269e-02  -0.105    0.916    
## black_pop             9.910e-01  8.237e-03 120.316   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52.66 on 672 degrees of freedom
##   (439 observations deleted due to missingness)
## Multiple R-squared:  0.9969, Adjusted R-squared:  0.9968 
## F-statistic: 7.102e+04 on 3 and 672 DF,  p-value: < 2.2e-16
# 9. Make at least one interactive plot or one interactive map

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(harris_data, x = ~education_attainment, y = ~household_income, 
        mode = 'markers', color = ~below_poverty) %>%
  layout(title = "Household Income vs Education Attainment")
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: Ignoring 439 observations
# 10. Have brief write-up and summarize your findings in the R Markdown file

# This analysis explores the relationships between different variables such as household income, poverty levels, educational attainment, and population density of African Americans in Harris County, Texas. The findings show that those with higher household incomes and educational attainment tend to have lower poverty levels, putting the spotlight on regions with large African American populations that tend to have higher poverty rates. The results of this analysis show key socioeconomic disparities in Harris County, suggesting areas where policy interventions could be targeted to address inequities. Focusing on improving access to education, increasing household income, and addressing the challenges faced by African American communities, planners and policymakers can work toward creating a more equitable urban environment in the county.