#Load Libraries
library(tidycensus)
## Warning: package 'tidycensus' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(sf)
## Warning: package 'sf' was built under R version 4.4.2
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
#Does race play a role in in tree canopy cover? #Does education Have a role in tree Canopy Cover?
#Census Api Key
#census_api_key("be13863f38ebdc792d7bf1fdbd7f8089b2a708f2", install = TRUE)
#Retrieve Income, Race, and Education Data
king_data <- get_acs(
geography = "tract",
variables = c(
total_population = "B01003_001", # Total population
median_income = "B19013_001",
white = "B02001_002",
black = "B02001_003",
asian = "B02001_005",
hispanic = "B03002_012",
edu_bachelor = "B15003_022"
),
state = "WA",
county = "King",
year = 2020,
survey = "acs5"
) %>%
select(GEOID, variable, estimate) %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
mutate(
white_pct = (white / total_population) * 100,
black_pct = (black / total_population) * 100,
asian_pct = (asian / total_population) * 100,
hispanic_pct = (hispanic / total_population) * 100,
edu_bachelor_pct = (edu_bachelor / total_population) * 100
)
## Getting data from the 2016-2020 5-year ACS
#Retrieve Tree Canopy Data
Canopy_Data <- read.csv("C:/Users/witht/Documents/2024_Methods_I/Project/King_County_Canopy.csv")
#Merge Canopy Data with Census Data through geotract id
Census_Canopy_Table <- merge(king_data, Canopy_Data, by = "GEOID", all.x = TRUE)
#Calculate Mean Median Min and Max Values for Each Variable
summary_stats <- Census_Canopy_Table %>%
summarize(
white_pct_mean = mean(white_pct, na.rm = TRUE),
white_pct_median = median(white_pct, na.rm = TRUE),
white_pct_max = max(white_pct, na.rm = TRUE),
white_pct_min = min(white_pct, na.rm = TRUE),
black_pct_mean = mean(black_pct, na.rm = TRUE),
black_pct_median = median(black_pct, na.rm = TRUE),
black_pct_max = max(black_pct, na.rm = TRUE),
black_pct_min = min(black_pct, na.rm = TRUE),
hispanic_pct_mean = mean(hispanic_pct, na.rm = TRUE),
hispanic_pct_median = median(hispanic_pct, na.rm = TRUE),
hispanic_pct_max = max(hispanic_pct, na.rm = TRUE),
hispanic_pct_min = min(hispanic_pct, na.rm = TRUE),
asian_pct_mean = mean(asian_pct, na.rm = TRUE),
asian_pct_median = median(asian_pct, na.rm = TRUE),
asian_pct_max = max(asian_pct, na.rm = TRUE),
asian_pct_min = min(asian_pct, na.rm = TRUE),
edu_bachelor_pct_mean = mean(edu_bachelor_pct, na.rm = TRUE),
edu_bachelor_pct_median = median(edu_bachelor_pct, na.rm = TRUE),
edu_bachelor_pct_max = max(edu_bachelor_pct, na.rm = TRUE),
edu_bachelor_pct_min = min(edu_bachelor_pct, na.rm = TRUE),
canopy_pct_mean = mean(canopy_pct, na.rm = TRUE),
canopy_pct_median = median(canopy_pct, na.rm = TRUE),
canopy_pct_max = max(canopy_pct, na.rm = TRUE),
canopy_pct_min = min(canopy_pct, na.rm = TRUE)
)
#create variable for black white asian and edu variables
tract_data <- Census_Canopy_Table %>%
mutate(
race_type = case_when(
white_pct > 50 ~ "White",
black_pct > 50 ~ "Black",
asian_pct > 50 ~ "Asian",
hispanic_pct > 50 ~ "Hispanic",
TRUE ~ "Other"
),
education = ifelse(edu_bachelor_pct > 30, "Educated", "Not Educated")
)
#Make 3 Figures (scatter plot, histogram plot, boxplot, bar plot, etc.) and summarize your findings
#get rid of na data
tract_data_filtered <- tract_data %>%
filter(!is.na(race_type), !is.na(education))
#Scatter Plot
ggplot(tract_data_filtered, aes(x = race_type, y = canopy_pct, color = education)) +
geom_point(size = 3, position = position_jitter(width = 0.2)) +
labs(
title = "Canopy Cover Percentage by Race and Education Level",
x = "Race Type",
y = "Canopy Cover Percentage (%)",
color = "Education Level"
) +
theme_minimal()
#Box Plot
ggplot(tract_data_filtered, aes(x = race_type, y = canopy_pct)) +
geom_boxplot(aes(fill = race_type)) +
labs(
title = "Canopy Cover Percentage by Race and Education Level",
x = "Race Type",
y = "Canopy Cover Percentage (%)"
) +
theme_minimal() +
facet_wrap(~education) +
scale_fill_brewer(palette = "Set2") +
theme(legend.position = "none")
#Histogram
ggplot(tract_data_filtered, aes(x = canopy_pct, fill = race_type)) +
geom_histogram(position = "dodge", binwidth = 5, alpha = 0.7) +
labs(
title = "Distribution of Canopy Cover by Race and Education Level",
x = "Canopy Cover Percentage (%)",
y = "Frequency"
) +
theme_minimal() +
facet_wrap(~education) +
scale_fill_brewer(palette = "Set2") +
theme(legend.title = element_blank())
#Summary #The graphs indicate that most census tracts in King County fall under the “White” or “Other” categories, with no tracts categorized as predominantly “Black.” “White” census tracts exhibit higher tree canopy coverage on average and reach higher maximum canopy levels compared to other tracts.
#In terms of education, the majority of census tracts are not classified as "educated." Additionally, there appears to be an inverse relationship between education levels and canopy coverage, suggesting that areas with more educated populations tend to have lower canopy cover.
#Make at least a PDF (probability density function) chart or CDF (cumulative density function) chart, which should include at least two curves to show the differences
ggplot(tract_data_filtered, aes(x = canopy_pct, color = race_type)) +
geom_density(size = 1) +
labs(
title = "Probability Density Function of Canopy Cover by Race and Education Level",
x = "Canopy Cover Percentage (%)",
y = "Density"
) +
theme_minimal() +
facet_wrap(~education) +
scale_color_brewer(palette = "Set2")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
#make a prediction of population/GDP/other of your study area for the next five years (2025-2030)
# Step 1: Define historical population data (for Bexar County)
year <- c(2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023) # historical years
population <- c(1938.431, 1974.499, 2011.708, 2047.967, 2086.174, 2127.372, 2167.863, 2205.001, 2228.488, 2249.653, 2274.282, 2252.980, 2265.311, 2271.380)
# population in thousand persons, retrived from hhttps://fred.stlouisfed.org/series/WAKING5POP
# Step 2: Fit a linear model (1st, 2nd, and 3rd-degree polynomial models)
# We are fitting three models: linear, quadratic, and cubic polynomial
poly.lm1 <- lm(population ~ poly(year, 1)) # Linear model
poly.lm2 <- lm(population ~ poly(year, 2)) # Quadratic model
poly.lm3 <- lm(population ~ poly(year, 3)) # Cubic model
# Step 3: Define new years for prediction (2025-2030)
future_years <- c(2025, 2026, 2027, 2028, 2029, 2030)
future_data <- data.frame(year = future_years)
# Step 4: Predict population for the next five years using the best-fitting model
predicted_population <- predict(poly.lm1, newdata = future_data)
# Step 5: Print the predicted population for 2025-2030
cat("Predicted population (in thousands) for 2025-2030:\n")
## Predicted population (in thousands) for 2025-2030:
print(predicted_population)
## 1 2 3 4 5 6
## 2382.490 2409.832 2437.175 2464.517 2491.860 2519.202
#Make OLS regression analysis or correlation analysis to examine your research questions
ols_model <- lm(canopy_pct ~ white_pct + black_pct + asian_pct + hispanic_pct + edu_bachelor_pct, data = tract_data_filtered)
summary(ols_model)
##
## Call:
## lm(formula = canopy_pct ~ white_pct + black_pct + asian_pct +
## hispanic_pct + edu_bachelor_pct, data = tract_data_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.115 -9.174 -0.422 6.953 51.538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51907 12.60671 0.279 0.7803
## white_pct 0.57203 0.13529 4.228 2.82e-05 ***
## black_pct 0.01124 0.17091 0.066 0.9476
## asian_pct 0.34044 0.13777 2.471 0.0138 *
## hispanic_pct -0.08474 0.13721 -0.618 0.5371
## edu_bachelor_pct -0.53691 0.08685 -6.182 1.34e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.17 on 488 degrees of freedom
## Multiple R-squared: 0.2039, Adjusted R-squared: 0.1957
## F-statistic: 24.99 on 5 and 488 DF, p-value: < 2.2e-16
# Reshape the data to long format
long_data <- tract_data_filtered %>%
pivot_longer(cols = c(white_pct, black_pct, asian_pct, hispanic_pct, edu_bachelor_pct),
names_to = "race",
values_to = "percentage")
# Create the scatter plot with regression lines
ggplot(long_data, aes(x = percentage, y = canopy_pct, color = race)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Regression: Race/Education % vs. Tree Canopy Coverage",
x = "Percent of Population", # X-axis label
y = "Canopy Cover") +
theme_minimal() +
scale_color_brewer(palette = "Set2") # Optional: use a color palette for better distinction
## `geom_smooth()` using formula = 'y ~ x'
#Make at least one interactive plot or one interactive map
scatter_plot <- ggplot(long_data, aes(x = percentage, y = canopy_pct, color = race)) +
geom_point(size = 3, alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Regression: Race % vs. Tree Canopy Coverage",
x = "Percent of Population", # X-axis label
y = "Canopy Cover") +
theme_minimal() +
scale_color_brewer(palette = "Set2") # Optional: use a color palette for better distinction
interactive_plot <- ggplotly(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
interactive_plot
#Summary