### Q1: List your research questions and ensure the following analysis will help you answer the questions.
# This research asks two questions about Bexar County, Texas: (1) at the census-tract level, how are the percentages of Black and Hispanic residents related to median household income, and (2) how is the county's population expected to grow from 2025 to 2030?


### Q2: Use Census API to get the census tract-level data with at least 4 variables (You can find the relevant codes in the file CensusAPI.R).
# Step 1: Load necessary libraries
library(tidycensus)   # For accessing US Census data
library(dplyr)        # For data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)   # For creating plots
library(plotly)  # For interactive plots
## Warning: package 'plotly' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Step 2: Load your Census API key from the environment
# An API key is a credential, so it must never be written into a script or a
# knitted report. Store it once, from the console, with
#   census_api_key("<your key>", install = TRUE)
# which saves it to ~/.Renviron as CENSUS_API_KEY. tidycensus reads that
# environment variable on its own, so this script only checks that it is set.
# Request a free key at https://api.census.gov/data/key_signup.html
# NOTE(review): a key that was previously published in this file should be
# treated as compromised and replaced with a newly issued one.
if (!nzchar(Sys.getenv("CENSUS_API_KEY"))) {
  stop(
    "CENSUS_API_KEY is not set. Run census_api_key(\"<your key>\", ",
    "install = TRUE) once in the console, then restart R.",
    call. = FALSE
  )
}

# Step 3: Choose the variables we want to retrieve
#https://api.census.gov/data/2020/acs/acs5/variables.html; ctrl+F to look for variables
# Here, we're selecting 4 variables to meet the project requirements:
# 1. Median Household Income (B19013_001E)
# 2. Hispanic Population (B03002_012E)
# 3. Non-Hispanic African American Population (B03002_004E)
# 4. Total Population (B01003_001E)
# Named vector: each name becomes the column holding that variable's estimate
# in the wide output requested below.
var <- c(
  Median_Household_Income   = "B19013_001E",
  Hispanic_Population       = "B03002_012E",
  Black_or_African_American = "B03002_004E",
  Total_Population          = "B01003_001E"
)

# Step 4: Get the Census data at the tract level
county_data <- get_acs(
  geography = "tract",   # one row per census tract
  variables = var,       # the four variables chosen in Step 3
  state     = "TX",      # Texas
  county    = "Bexar",   # Bexar County
  year      = 2020,      # end year of the ACS sample
  output    = "wide",    # one column per variable instead of one row per variable
  geometry  = TRUE       # also download tract geometry so the data can be mapped
)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
### Q3: Calculate mean, median, min, and max values for at least one variable (You can find the relevant codes from Lines 9-13 in Visualization1.R).
# Step 1: Summarise tract-level Median Household Income
# na.rm = TRUE leaves out tracts whose income value is missing.
min_income    <- min(county_data[["Median_Household_Income"]], na.rm = TRUE)     # Smallest tract value
max_income    <- max(county_data[["Median_Household_Income"]], na.rm = TRUE)     # Largest tract value
median_income <- median(county_data[["Median_Household_Income"]], na.rm = TRUE)  # Middle tract value
mean_income   <- mean(county_data[["Median_Household_Income"]], na.rm = TRUE)    # Average across tracts

# Step 2: Show the four statistics (mean, median, max, min)
mean_income
## [1] 62658.53
median_income
## [1] 53763
max_income
## [1] 217534
min_income
## [1] 10518
### Q4: Make at least three types of figures (scatter plot, histogram plot, boxplot, bar plot, etc.) and summarize your findings (You can find the relevant codes in Visualization1.R)
# Step 1: Prepare data
# Black or African American residents as a percentage of each tract's total population
county_data$pct_Black <- with(
  county_data,
  100 * Black_or_African_American / Total_Population
)

# Step 2: Scatter plot of Black population share (x) against median household income (y)
ggplot(data = county_data, mapping = aes(x = pct_Black, y = Median_Household_Income)) +
  geom_point(colour = "blue") +                                   # one blue point per tract
  theme_minimal() +                                               # plain background
  labs(
    x = "Percentage of Black or African American Population",
    y = "Median Household Income",
    title = "Black Population % vs Median Household Income"
  )
## Warning: Removed 4 rows containing missing values (`geom_point()`).

# Step 3: Histogram showing how median household income is distributed across tracts
ggplot(data = county_data, mapping = aes(x = Median_Household_Income)) +
  geom_histogram(binwidth = 5000, colour = "black", fill = "green") +  # 5000-wide bins, green bars with black outlines
  theme_minimal() +
  labs(
    x = "Median Household Income",
    y = "Count",                                                  # number of tracts in each bin
    title = "Distribution of Median Household Income"
  )
## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

# Step 4: Boxplot of median household income (income on the y-axis)
ggplot(data = county_data, mapping = aes(y = Median_Household_Income)) +
  geom_boxplot(fill = "orange") +                                 # orange box
  theme_minimal() +
  labs(
    y = "Median Household Income",
    title = "Boxplot of Median Household Income"
  )
## Warning: Removed 4 rows containing non-finite values (`stat_boxplot()`).

### Q5: Make at least a PDF (probability density function) chart and CDF (cumulative distribution function) chart for any variable (You can find the relevant codes from Lines 65-70)
# Step 1: Create the PDF (Probability Density Function) chart for Median Household Income
# geom_density() draws a smoothed density estimate of the tract incomes.
ggplot(county_data, aes(x = Median_Household_Income)) +                 # Set x-axis as median household income
  geom_density(fill = "lightblue", alpha = 0.5) +                       # Density curve with a semi-transparent light blue fill
  labs(title = "PDF of Median Household Income",                        # Set plot title
       x = "Median Household Income",                                   # Label for x-axis
       y = "Density") +                                                 # Label for y-axis (Density)
  theme_minimal()                                                       # Use a minimal theme for a clean plot appearance
## Warning: Removed 4 rows containing non-finite values (`stat_density()`).

# Step 2: Create the CDF (Cumulative Distribution Function) chart for Median Household Income
# stat_ecdf() draws the empirical CDF of the tract incomes as a step function.
# Line thickness is set with `linewidth`; using `size` for lines is deprecated
# since ggplot2 3.4.0 and raises a lifecycle warning.
ggplot(county_data, aes(x = Median_Household_Income)) +                 # Set x-axis as median household income
  stat_ecdf(geom = "step", color = "darkgreen", linewidth = 1) +        # Dark green step line of width 1
  labs(title = "CDF of Median Household Income",                        # Set plot title
       x = "Median Household Income",                                   # Label for x-axis
       y = "Cumulative Probability") +                                  # Label for y-axis (Cumulative Probability)
  theme_minimal()                                                       # Use a minimal theme for a clean plot appearance
## Warning: Removed 4 rows containing non-finite values (`stat_ecdf()`).

### Q6: Make a prediction of population OR GDP OR other variable of your study area for the next five years (2025-2030) (You can find the relevant codes from lines 59-73 in Population Projection.R)
# Step 1: Define historical population data (for Bexar County)
# Population is in thousands of persons, retrieved from
# https://fred.stlouisfed.org/series/TXBEXA9POP
year <- seq(2010, 2023, by = 1)  # historical years
population <- c(1722.841, 1755.36, 1788.768, 1821.354, 1857.977, 1894.811, 1927.409,
                1956.394, 1981.061, 2002.445, 2015.501, 2030.981, 2060.191, 2087.679)
# Guard against a mistyped series: every year needs exactly one population value
stopifnot(length(year) == length(population))

# Step 2: Fit polynomial trend models of degree 1, 2, and 3
poly.lm1 <- lm(population ~ poly(year, 1))  # Linear model
poly.lm2 <- lm(population ~ poly(year, 2))  # Quadratic model
poly.lm3 <- lm(population ~ poly(year, 3))  # Cubic model

# Step 3: Define new years for prediction (2025-2030)
# The column must be called `year` so predict() can match it to the model term.
future_years <- seq(2025, 2030, by = 1)
future_data <- data.frame(year = future_years)

# Step 4: Predict population for 2025-2030 with the cubic model (poly.lm3)
# NOTE(review): no fit comparison between the three models is shown here;
# confirm the choice of poly.lm3 (e.g. with AIC or adjusted R-squared).
predicted_population <- predict(poly.lm3, newdata = future_data)
# Label each prediction with its year instead of the default row index 1..6
names(predicted_population) <- future_years

# Step 5: Print the predicted population for 2025-2030
cat("Predicted population (in thousands) for 2025-2030:\n")
## Predicted population (in thousands) for 2025-2030:
print(predicted_population)
##     2025     2026     2027     2028     2029     2030 
## 2110.126 2123.093 2134.592 2144.655 2153.313 2160.600
### Q7: Make OLS regression analysis or correlation analysis to examine your research questions (You can find the relevant codes from Lines 6-24 in Population Projection.R)
# Step 1: Prepare the data
# Express the Black and Hispanic populations as percentages of each tract's
# total population, so tracts of different sizes can be compared.
county_data <- mutate(
  county_data,
  pct_Black    = 100 * Black_or_African_American / Total_Population,  # Black share, in percent
  pct_Hispanic = 100 * Hispanic_Population / Total_Population         # Hispanic share, in percent
)

# Step 2: Fit an OLS regression model
# Dependent variable: Median_Household_Income
# Independent variables: pct_Black and pct_Hispanic
ols_model <- lm(Median_Household_Income ~ pct_Black + pct_Hispanic, data = county_data)

# Step 4: View the summary of the OLS regression model
# summary() reports the coefficients with their standard errors and p-values,
# together with R-squared and the overall F-test.
summary(ols_model)
## 
## Call:
## lm(formula = Median_Household_Income ~ pct_Black + pct_Hispanic, 
##     data = county_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47345 -12991  -1291   9699 114651 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  137728.02    3747.88   36.75   <2e-16 ***
## pct_Black     -1483.14     145.60  -10.19   <2e-16 ***
## pct_Hispanic  -1065.33      51.45  -20.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21070 on 368 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.5457, Adjusted R-squared:  0.5432 
## F-statistic:   221 on 2 and 368 DF,  p-value: < 2.2e-16
# Step 5: Visualize the relationship between pct_Black and Median Household Income
# Scatter plot of the tracts with a fitted straight line (blue) from
# geom_smooth(method = "lm") drawn on top to show the trend.
ggplot(data = county_data, mapping = aes(x = pct_Black, y = Median_Household_Income)) +
  geom_point() +                                 # one point per tract
  geom_smooth(method = "lm", colour = "blue") +  # linear fit, drawn in blue
  theme_minimal() +                              # plain background
  labs(
    x = "Percentage of Black or African American Population",
    y = "Median Household Income",
    title = "Regression: Black Population % vs. Median Household Income"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).

# Step 6: Visualize the relationship between pct_Hispanic and Median Household Income
# Same layout as the previous plot, with the Hispanic share on the x-axis and a
# green fitted line.
ggplot(data = county_data, mapping = aes(x = pct_Hispanic, y = Median_Household_Income)) +
  geom_point() +                                  # one point per tract
  geom_smooth(method = "lm", colour = "green") +  # linear fit, drawn in green
  theme_minimal() +                               # plain background
  labs(
    x = "Percentage of Hispanic Population",
    y = "Median Household Income",
    title = "Regression: Hispanic Population % vs. Median Household Income"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Removed 4 rows containing missing values (`geom_point()`).

### Q8: Make at least one interactive plot or one interactive map (You can find the codes from Lines 86-89 in the file Visualization1.R)
# Step 1: Prepare the data
# No new columns are needed: the plot reuses pct_Black and
# Median_Household_Income from county_data.

# Step 2: Create a ggplot2 scatter plot
# Build the static plot first and keep it in `p` so it can be converted later.
p <- ggplot(data = county_data, mapping = aes(x = pct_Black, y = Median_Household_Income)) +
  geom_point() +                                 # one point per tract
  geom_smooth(method = "lm", colour = "blue") +  # linear fit, drawn in blue
  theme_minimal() +                              # plain background
  labs(
    x = "Percentage of Black or African American Population",
    y = "Median Household Income",
    title = "Interactive Scatter Plot: Black Population % vs. Median Household Income"
  )

p # Draw the static version
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Removed 4 rows containing missing values (`geom_point()`).

# Step 3: Make the plot interactive using plotly
# ggplotly() from the plotly package turns the ggplot object into an
# interactive plotly object.
interactive_plot <- ggplotly(p)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
# Step 4: Display the interactive plot
# The rendered widget supports zooming and hovering over individual points.
interactive_plot  # Shown in the RStudio Viewer or a web browser
### Q9: Have brief write-up and summarize your findings in the R Markdown file.

# In this analysis of Bexar County, Texas, we retrieved tract-level 2016-2020 ACS data on median household income, Hispanic population, non-Hispanic African American population, and total population. Across census tracts, the mean of median household income was $62,658.53 (median $53,763), with a wide range from $10,518 to $217,534. Visualizations showed that tracts with higher percentages of Black and Hispanic residents tended to have lower median household incomes. A cubic trend model fitted to 2010-2023 population estimates projected continued but slowing growth, from about 2.11 million in 2025 to about 2.16 million in 2030. OLS regression found that both Black and Hispanic population percentages were significantly and negatively associated with tract median household income (R-squared of about 0.55); this is an association, not evidence of a causal effect. An interactive scatter plot was also created for further exploration.