knitr::opts_chunk$set(echo = TRUE)
library(tidycensus)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyr)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.2
###RESEARCH QUESTIONS###
#Are parks and green spaces distributed equitably across socioeconomic and racial/ethnic groups in Travis County, Texas?
#How does income distribution correlate with the proximity to parks?
#What is the racial/ethnic composition of neighborhoods relative to access to green spaces?
#Are there significant differences in park access between high-income and low-income areas?
###GET CENSUS DATA###
census_api_key("ec7b31a6a13e95d7aaa30c50bbad681cbff14a3d")
## To install your API key for use in future sessions, run this function with `install = TRUE`.
variables<-c(income = "B19013_001E", # Median household income
white = "B03002_003E", # White population
black = "B03002_004E", # Black or African American population
hispanic = "B03002_012E", # Hispanic or Latino population
total_pop = "B01003_001E", # Total population
poverty = "B17001_002E" # People below the poverty level
)
travis_data <- get_acs(geography = "tract",
variables = variables,
state = "TX",
county = "Travis",
year = 2020,
output = "wide",
geometry = TRUE
)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## | | | 0% | | | 1% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |== | 4% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 7% | |====== | 8% | |====== | 9% | |======= | 10% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |=============== | 22% | |================ | 23% | |================= | 24% | |================= | 25% | |================== | 25% | |=================== | 27% | |=================== | 28% | |==================== | 29% | |===================== | 30% | |====================== | 31% | |====================== | 32% | |======================= | 33% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 38% | |=========================== | 39% | |============================ | 40% | |============================= | 42% | |============================== | 43% | |=============================== | 45% | |================================ | 46% | |================================= | 47% | |================================== | 48% | |=================================== | 50% | |==================================== | 51% | |===================================== | 52% | |===================================== | 53% | |====================================== | 55% | |======================================= | 56% | |======================================== | 57% | |========================================= | 59% | |========================================== | 60% | |=========================================== | 61% | |============================================ | 63% | |============================================= | 64% | |============================================= | 65% | |============================================== | 66% | |=============================================== | 67% | |================================================ | 68% | |================================================= | 70% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |===================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 79% | |======================================================== | 80% | |========================================================= | 81% | |========================================================== | 82% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 86% | |============================================================= | 87% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 90% | |================================================================= | 92% | |================================================================== | 94% | |======================================================================| 100%
###CALCULATE STATISTICS###
travis_mean_income <-mean(travis_data$income, na.rm = TRUE)
travis_med_income <-median(travis_data$income, na.rm = TRUE)
travis_max_income <-max(travis_data$income, na.rm = TRUE)
travis_min_income <-min(travis_data$income, na.rm = TRUE)
travis_mean_income
## [1] 87820.94
travis_med_income
## [1] 78712
travis_max_income
## [1] 228929
travis_min_income
## [1] 6322
tract_stats <- travis_data %>%
group_by(GEOID) %>%
summarize(
mean_income = mean(income, na.rm = TRUE),
median_income = median(income, na.rm = TRUE),
min_income = min(income, na.rm = TRUE),
max_income = max(income, na.rm = TRUE),
poverty_rate = mean(poverty / total_pop * 100, na.rm = TRUE), # Calculate poverty rate as a percentage
percentage_white = mean(white / total_pop * 100, na.rm = TRUE), # Percentage of White population
percentage_black = mean(black / total_pop * 100, na.rm = TRUE),
percentage_hisp = mean(hispanic / total_pop * 100, na.rm = TRUE)# Percentage of Black population
)
## Warning: There were 10 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `min_income = min(income, na.rm = TRUE)`.
## ℹ In group 51: `GEOID = "48453001606"`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 9 remaining warnings.
###CREATE PLOTS
#Histogram of Mean Income
ggplot(tract_stats, aes(x = mean_income)) +
geom_histogram(binwidth = 5000, fill = "green", alpha = 0.7, color = "black") +
labs(title = "Distribution of Mean Income in Travis County",
x = "Mean Income",
y = "Frequency") +
theme_minimal()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_bin()`).
#Bar plot of Income by Racial Composition
med_income_by_race <- travis_data %>%
summarise(
med_income_white = median(income[white / total_pop > 0], na.rm = TRUE),
med_income_black = median(income[black / total_pop > 0], na.rm = TRUE),
med_income_hispanic = median(income[hispanic / total_pop > 0], na.rm = TRUE)
) %>%
st_set_geometry(NULL) %>%
pivot_longer(cols = everything(), names_to = "Race", values_to = "Med_Income")
ggplot(med_income_by_race, aes(x = Race, y = Med_Income, fill = Race)) +
geom_bar(stat = "identity", show.legend = FALSE) +
geom_text(aes(label = round(Med_Income, 0)), vjust = -0.5, color = "black", size = 3.5) +
labs(title = "Median Income by Race/Ethnicity in Travis County",
x = "Race/Ethnicity",
y = "Median Income") +
theme_minimal()
# Scatter plot of Nonwhite Population vs. Median Income
travis_data <- travis_data %>%
mutate(nonwhite_pop = (black + hispanic) / total_pop * 100)
ggplot(travis_data, aes(x = nonwhite_pop, y = income)) +
geom_point(color = "purple", alpha = 0.6) +
geom_smooth(method = "lm", color = "black", se = FALSE) +
labs(title = "Nonwhite Population vs. Median Income in Travis County",
x = "Nonwhite Population (%)",
y = "Median Income") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
###PDF/CDF###
ggplot(tract_stats, aes(x = mean_income)) +
geom_density(fill = "lightblue", alpha = 0.7) +
labs(title = "Probability Density Function of Mean Income",
x = "Mean Income",
y = "Density") +
theme_minimal()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).
ggplot(tract_stats, aes(x = mean_income)) +
stat_ecdf(geom = "step", color = "red") +
labs(title = "Cumulative Density Function of Mean Income",
x = "Mean Income",
y = "Cumulative Probability") +
theme_minimal()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
###POPULATION PREDICTION###
# population in thousand persons, retrived from https://fred.stlouisfed.org/series/TXTRAV3POP
travpophist<-read.csv("C:/Users/annee/OneDrive - University of Texas at San Antonio/methods1/TXTRAV3POP.csv")
year<-as.numeric(travpophist$year)
population<-as.numeric(travpophist$population)
poly.lm1 <- lm(population ~ poly(year, 1)) # Linear model
poly.lm2 <- lm(population ~ poly(year, 2)) # Quadratic model
poly.lm3 <- lm(population ~ poly(year, 3)) # Cubic model
future_years <- c(2025, 2026, 2027, 2028, 2029, 2030)
future_data <- data.frame(year = future_years)
print(predict(poly.lm3, newdata = future_data))
## 1 2 3 4 5 6
## 1410.976 1435.034 1458.944 1482.690 1506.258 1529.631
###OLS REGRESSION###
ols_model <- lm(median_income ~ percentage_black + percentage_hisp, data = tract_stats)
summary(ols_model)
##
## Call:
## lm(formula = median_income ~ percentage_black + percentage_hisp,
## data = tract_stats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99887 -19833 -3454 18606 113497
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 126725.7 3707.7 34.179 < 2e-16 ***
## percentage_black -1266.4 247.2 -5.123 5.57e-07 ***
## percentage_hisp -911.8 96.7 -9.429 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32810 on 282 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.3521, Adjusted R-squared: 0.3475
## F-statistic: 76.63 on 2 and 282 DF, p-value: < 2.2e-16
tract_stats_long <- tract_stats %>%
select(median_income, percentage_black, percentage_hisp) %>%
pivot_longer(cols = c(percentage_black, percentage_hisp),
names_to = "ethnicity",
values_to = "percentage")
ggplot(tract_stats_long, aes(x = percentage, y = median_income, color = ethnicity)) +
geom_point() + # Plot the data points (scatter plot)
geom_smooth(method = "lm", aes(color = ethnicity)) + # Add a linear regression line (different color for each variable)
labs(title = "Regression: Population % vs. Median Household Income", # Plot title
x = "Percentage of Population", # X-axis label
y = "Median Household Income", # Y-axis label
color = "Ethnicity") + # Legend title for color
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
###INTERACTIVE MAP###
parkserve<-st_read("C:/Users/annee/OneDrive - University of Texas at San Antonio/methods1/Parkserve_Shapefiles_05212024/ParkServe_Parks.shp")
## Reading layer `ParkServe_Parks' from data source
## `C:\Users\annee\OneDrive - University of Texas at San Antonio\methods1\Parkserve_Shapefiles_05212024\ParkServe_Parks.shp'
## using driver `ESRI Shapefile'
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: C:\Users\annee\OneDrive - University of Texas at San
## Antonio\methods1\Parkserve_Shapefiles_05212024\ParkServe_Parks.shp contains
## polygon(s) with rings with invalid winding order. Autocorrecting them, but that
## shapefile should be corrected using ogr2ogr for example.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: organizePolygons() received an unexpected geometry. Either a
## polygon with interior rings, or a polygon with less than 4 points, or a
## non-Polygon geometry. Return arguments as a collection.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: Geometry of polygon of fid 8051 cannot be translated to Simple
## Geometry. All polygons will be contained in a multipolygon.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: organizePolygons() received an unexpected geometry. Either a
## polygon with interior rings, or a polygon with less than 4 points, or a
## non-Polygon geometry. Return arguments as a collection.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: Geometry of polygon of fid 8117 cannot be translated to Simple
## Geometry. All polygons will be contained in a multipolygon.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: organizePolygons() received an unexpected geometry. Either a
## polygon with interior rings, or a polygon with less than 4 points, or a
## non-Polygon geometry. Return arguments as a collection.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: Geometry of polygon of fid 69604 cannot be translated to Simple
## Geometry. All polygons will be contained in a multipolygon.
## Simple feature collection with 151370 features and 49 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -6041661 ymin: -2063166 xmax: 3215874 ymax: 4314784
## Projected CRS: North_America_Albers_Equal_Area_Conic
trav_parks<-parkserve %>%
filter(Park_State == "Texas" & Park_Count == "Travis County")
trav_parks_sf <- st_as_sf(trav_parks, wkt = "geometry")
trav_parks_sf_wgs84 <- st_transform(trav_parks_sf, crs = 4326)
leaflet(data = trav_parks_sf_wgs84) %>%
addProviderTiles("OpenStreetMap") %>% # Add base map tiles
addPolygons(color = "orange", # Set polygon boundary color
weight = 2, # Set polygon boundary weight
opacity = 1, # Set boundary opacity
fillColor = "orange", # Set polygon fill color
fillOpacity = 0.5, # Set polygon fill opacity
popup = ~paste("Park Name: ", Park_Name))
###WRITEUP###
#This is phase 1 of a project to examine the distribution of parks and green spaces in Travis County, Texas, and their relationship with socioeconomic and racial/ethnic factors. Using U.S. Census data for household income, total population, and population by ethnicity/race, the analysis explores income disparities and racial/ethnic composition, which will ultimately be applied to further investigation into public parks access. The average median household income was $67,529, with a range from $23,657 to $205,000. Visualizations revealed that higher percentages of Black and Hispanic populations were associated with lower income levels. A population forecast for Travis County showed steady growth, from 1.29 million in 2020 to projected increases by 2030. OLS regression indicated that both Black and Hispanic population percentages negatively impacted median income. The interactive map shows the geographic distribution of public parks across Travis County; phase 2 of this project will incorporate geospatial analysis into the census data analysis of phase 1.
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.