#Load Libraries

library(tidycensus)
## Warning: package 'tidycensus' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(sf)
## Warning: package 'sf' was built under R version 4.4.2
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2

#Does race play a role in in tree canopy cover? #Does education Have a role in tree Canopy Cover?

#Census Api Key

#census_api_key("be13863f38ebdc792d7bf1fdbd7f8089b2a708f2", install = TRUE)

#Retrieve Income, Race, and Education Data
king_data <- get_acs(
  geography = "tract",
  variables = c(
    total_population = "B01003_001",      # Total population
    median_income = "B19013_001",
    white = "B02001_002",
    black = "B02001_003",
    asian = "B02001_005",
    hispanic = "B03002_012",
    edu_bachelor = "B15003_022"
  ),
  state = "WA",
  county = "King",
  year = 2020,
  survey = "acs5"
) %>%
  select(GEOID, variable, estimate) %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  mutate(
    white_pct = (white / total_population) * 100,
    black_pct = (black / total_population) * 100,
    asian_pct = (asian / total_population) * 100,
    hispanic_pct = (hispanic / total_population) * 100,
    edu_bachelor_pct = (edu_bachelor / total_population) * 100
  )
## Getting data from the 2016-2020 5-year ACS
#Retrieve Tree Canopy Data

Canopy_Data <- read.csv("C:/Users/witht/Documents/2024_Methods_I/Project/King_County_Canopy.csv")

#Merge Canopy Data with Census Data through geotract id

Census_Canopy_Table <- merge(king_data, Canopy_Data, by = "GEOID", all.x = TRUE)
#Calculate Mean Median Min and Max Values for Each Variable

summary_stats <- Census_Canopy_Table %>%
  summarize(
    white_pct_mean = mean(white_pct, na.rm = TRUE),
    white_pct_median = median(white_pct, na.rm = TRUE),
    white_pct_max = max(white_pct, na.rm = TRUE),
    white_pct_min = min(white_pct, na.rm = TRUE),
    
    black_pct_mean = mean(black_pct, na.rm = TRUE),
    black_pct_median = median(black_pct, na.rm = TRUE),
    black_pct_max = max(black_pct, na.rm = TRUE),
    black_pct_min = min(black_pct, na.rm = TRUE),
    
    hispanic_pct_mean = mean(hispanic_pct, na.rm = TRUE),
    hispanic_pct_median = median(hispanic_pct, na.rm = TRUE),
    hispanic_pct_max = max(hispanic_pct, na.rm = TRUE),
    hispanic_pct_min = min(hispanic_pct, na.rm = TRUE),
    
    asian_pct_mean = mean(asian_pct, na.rm = TRUE),
    asian_pct_median = median(asian_pct, na.rm = TRUE),
    asian_pct_max = max(asian_pct, na.rm = TRUE),
    asian_pct_min = min(asian_pct, na.rm = TRUE),
    
    edu_bachelor_pct_mean = mean(edu_bachelor_pct, na.rm = TRUE),
    edu_bachelor_pct_median = median(edu_bachelor_pct, na.rm = TRUE),
    edu_bachelor_pct_max = max(edu_bachelor_pct, na.rm = TRUE),
    edu_bachelor_pct_min = min(edu_bachelor_pct, na.rm = TRUE),
    
    canopy_pct_mean = mean(canopy_pct, na.rm = TRUE),
    canopy_pct_median = median(canopy_pct, na.rm = TRUE),
    canopy_pct_max = max(canopy_pct, na.rm = TRUE),
    canopy_pct_min = min(canopy_pct, na.rm = TRUE)
  )
#create variable for black white asian and edu variables
tract_data <- Census_Canopy_Table %>%
  mutate(
    race_type = case_when(
      white_pct > 50 ~ "White",
      black_pct > 50 ~ "Black",
      asian_pct > 50 ~ "Asian",
      hispanic_pct > 50 ~ "Hispanic",
      TRUE ~ "Other"
    ),
    education = ifelse(edu_bachelor_pct > 30, "Educated", "Not Educated")
  )
#Make 3 Figures  (scatter plot, histogram plot, boxplot, bar plot, etc.) and summarize your findings
    #get rid of na data
        tract_data_filtered <- tract_data %>%
          filter(!is.na(race_type), !is.na(education))
    #Scatter Plot
        ggplot(tract_data_filtered, aes(x = race_type, y = canopy_pct, color = education)) +
          geom_point(size = 3, position = position_jitter(width = 0.2)) +
          labs(
            title = "Canopy Cover Percentage by Race and Education Level",
            x = "Race Type",
            y = "Canopy Cover Percentage (%)",
            color = "Education Level"
          ) +
          theme_minimal()

    #Box Plot
        ggplot(tract_data_filtered, aes(x = race_type, y = canopy_pct)) +
          geom_boxplot(aes(fill = race_type)) +
          labs(
            title = "Canopy Cover Percentage by Race and Education Level",
            x = "Race Type",
            y = "Canopy Cover Percentage (%)"
          ) +
          theme_minimal() +
          facet_wrap(~education) +
          scale_fill_brewer(palette = "Set2") +
          theme(legend.position = "none")

    #Histogram
        ggplot(tract_data_filtered, aes(x = canopy_pct, fill = race_type)) +
          geom_histogram(position = "dodge", binwidth = 5, alpha = 0.7) +
          labs(
            title = "Distribution of Canopy Cover by Race and Education Level",
            x = "Canopy Cover Percentage (%)",
            y = "Frequency"
          ) +
          theme_minimal() +
          facet_wrap(~education) +
          scale_fill_brewer(palette = "Set2") +
          theme(legend.title = element_blank())

#Summary #The graphs indicate that most census tracts in King County fall under the “White” or “Other” categories, with no tracts categorized as predominantly “Black.” “White” census tracts exhibit higher tree canopy coverage on average and reach higher maximum canopy levels compared to other tracts.

    #In terms of education, the majority of census tracts are not classified as "educated." Additionally, there appears to be an inverse relationship between education levels and canopy coverage, suggesting that areas with more educated populations tend to have lower canopy cover.
    
#Make at least a PDF (probability density function) chart or CDF (cumulative density function) chart, which should include at least two curves to show the differences
        
        ggplot(tract_data_filtered, aes(x = canopy_pct, color = race_type)) +
          geom_density(size = 1) +
          labs(
            title = "Probability Density Function of Canopy Cover by Race and Education Level",
            x = "Canopy Cover Percentage (%)",
            y = "Density"
          ) +
          theme_minimal() +
          facet_wrap(~education) +
          scale_color_brewer(palette = "Set2")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

#make a prediction of population/GDP/other of your study area for the next five years (2025-2030)
        
        # Step 1: Define historical population data (for Bexar County)
        year <- c(2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023) # historical years
        population <- c(1938.431, 1974.499, 2011.708, 2047.967, 2086.174, 2127.372, 2167.863, 2205.001, 2228.488, 2249.653, 2274.282, 2252.980, 2265.311, 2271.380) 
        # population in thousand persons, retrived from hhttps://fred.stlouisfed.org/series/WAKING5POP
        
        # Step 2: Fit a linear model (1st, 2nd, and 3rd-degree polynomial models)
        # We are fitting three models: linear, quadratic, and cubic polynomial
        poly.lm1 <- lm(population ~ poly(year, 1))  # Linear model
        poly.lm2 <- lm(population ~ poly(year, 2))  # Quadratic model
        poly.lm3 <- lm(population ~ poly(year, 3))  # Cubic model
        
        # Step 3: Define new years for prediction (2025-2030)
        future_years <- c(2025, 2026, 2027, 2028, 2029, 2030)
        future_data <- data.frame(year = future_years)
        
        # Step 4: Predict population for the next five years using the best-fitting model
        predicted_population <- predict(poly.lm1, newdata = future_data)
        
        # Step 5: Print the predicted population for 2025-2030
        cat("Predicted population (in thousands) for 2025-2030:\n")
## Predicted population (in thousands) for 2025-2030:
        print(predicted_population)
##        1        2        3        4        5        6 
## 2382.490 2409.832 2437.175 2464.517 2491.860 2519.202
#Make OLS regression analysis or correlation analysis to examine your research questions

        ols_model <- lm(canopy_pct ~ white_pct + black_pct + asian_pct + hispanic_pct + edu_bachelor_pct, data = tract_data_filtered)
        
        summary(ols_model)
## 
## Call:
## lm(formula = canopy_pct ~ white_pct + black_pct + asian_pct + 
##     hispanic_pct + edu_bachelor_pct, data = tract_data_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.115  -9.174  -0.422   6.953  51.538 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.51907   12.60671   0.279   0.7803    
## white_pct         0.57203    0.13529   4.228 2.82e-05 ***
## black_pct         0.01124    0.17091   0.066   0.9476    
## asian_pct         0.34044    0.13777   2.471   0.0138 *  
## hispanic_pct     -0.08474    0.13721  -0.618   0.5371    
## edu_bachelor_pct -0.53691    0.08685  -6.182 1.34e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.17 on 488 degrees of freedom
## Multiple R-squared:  0.2039, Adjusted R-squared:  0.1957 
## F-statistic: 24.99 on 5 and 488 DF,  p-value: < 2.2e-16
        # Reshape the data to long format
        long_data <- tract_data_filtered %>%
          pivot_longer(cols = c(white_pct, black_pct, asian_pct, hispanic_pct, edu_bachelor_pct), 
                       names_to = "race", 
                       values_to = "percentage")
        
        # Create the scatter plot with regression lines
        ggplot(long_data, aes(x = percentage, y = canopy_pct, color = race)) +
          geom_point() +                                                   
          geom_smooth(method = "lm", se = FALSE) +                       
          labs(title = "Regression: Race/Education % vs. Tree Canopy Coverage",  
               x = "Percent of Population",    # X-axis label
               y = "Canopy Cover") +                             
          theme_minimal() +
          scale_color_brewer(palette = "Set2")  # Optional: use a color palette for better distinction
## `geom_smooth()` using formula = 'y ~ x'

#Make at least one interactive plot or one interactive map
        scatter_plot <- ggplot(long_data, aes(x = percentage, y = canopy_pct, color = race)) +
          geom_point(size = 3, alpha = 0.6) +                                                   
          geom_smooth(method = "lm", se = FALSE) +                       
          labs(title = "Regression: Race % vs. Tree Canopy Coverage",  
               x = "Percent of Population",    # X-axis label
               y = "Canopy Cover") +                             
          theme_minimal() +
          scale_color_brewer(palette = "Set2")  # Optional: use a color palette for better distinction
        
        interactive_plot <- ggplotly(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
        interactive_plot

#Summary

The regression analysis shows a complex relationship between race/education and canopy cover. Specifically, areas with a higher percentage of White residents tend to have more tree canopy, indicating a positive association. Similarly, a larger Asian population is also linked to increased canopy cover, though the effect appears to be somewhat weaker compared to White populations. In contrast, areas with higher levels of educational attainment, particularly where a greater proportion of residents hold bachelor’s degrees, show a negative relationship with canopy cover, suggesting that more educated neighborhoods may have less tree coverage. Interestingly, the racial composition of Black and Hispanic populations does not appear to have a significant impact on canopy cover.