# Load the necessary packages
library(readr)
library(dplyr)
library(bit)
library(haven)
#Read in Air pollution data
airpo19 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2019.csv")
airpo18 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2018.csv")
airpo17 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2017.csv")
airpo16 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2016.csv")
airpo15 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2015.csv")
#Read in CVD Mortality Data
# Read in the 2015-2019 file
tx_cvd_2015_2019 <- read.table("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/CVD 2015-2019.txt", sep="\t", header=TRUE, fill=TRUE)
tx_cvd_2015_2019 <- select(tx_cvd_2015_2019, -Notes)
#one database
airpo15_19 <- rbind(airpo19, airpo18, airpo17, airpo16, airpo15)
#Clean some stuff
#CVD Mortality Data
# Repeat for tx_cvd_2015_2019
# Convert Deaths column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths <- as.numeric(tx_cvd_2015_2019$Deaths)
## Warning: NAs introduced by coercion
# Convert Crude.Rate column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Crude.Rate <- as.numeric(tx_cvd_2015_2019$Crude.Rate)
## Warning: NAs introduced by coercion
# Convert Age.Adjusted.Rate column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Age.Adjusted.Rate <- as.numeric(tx_cvd_2015_2019$Age.Adjusted.Rate)
## Warning: NAs introduced by coercion
# Replace dots with NA in tx_cvd_2015_2019
tx_cvd_2015_2019[tx_cvd_2015_2019 == "."] <- NA
# Replace "Suppressed" values with NA in Deaths column of tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths[tx_cvd_2015_2019$Deaths == "Suppressed"] <- NA
# Convert Deaths column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths <- as.numeric(tx_cvd_2015_2019$Deaths)
# remove ", TX" from County column
tx_cvd_2015_2019$County <- gsub(" County, TX", "", tx_cvd_2015_2019$County)
# Filter and select the data of interest
airpo15_19 <- airpo15_19 %>%
filter(State == "Texas",
) %>%
select(State, County, `Days Ozone`, `Days PM2.5`, `Days PM10`)
names(airpo15_19)
## [1] "State" "County" "Days Ozone" "Days PM2.5" "Days PM10"
names(tx_cvd_2015_2019)
## [1] "County" "County.Code" "Deaths"
## [4] "Population" "Crude.Rate" "Age.Adjusted.Rate"
str(airpo15_19)
## tibble [208 × 5] (S3: tbl_df/tbl/data.frame)
## $ State : chr [1:208] "Texas" "Texas" "Texas" "Texas" ...
## $ County : chr [1:208] "Bell" "Bexar" "Bowie" "Brazoria" ...
## $ Days Ozone: num [1:208] 265 161 0 360 308 128 365 103 199 287 ...
## $ Days PM2.5: num [1:208] 99 200 349 0 54 237 0 73 146 72 ...
## $ Days PM10 : num [1:208] 0 1 0 0 0 0 0 0 0 0 ...
str(tx_cvd_2015_2019)
## 'data.frame': 311 obs. of 6 variables:
## $ County : chr "Anderson" "Andrews" "Angelina" "Aransas" ...
## $ County.Code : int 48001 48003 48005 48007 48009 48011 48013 48015 48017 48019 ...
## $ Deaths : num 1068 226 1581 519 139 ...
## $ Population : int 288847 90420 437658 123945 43566 9481 247676 149128 35495 111332 ...
## $ Crude.Rate : num 370 250 361 419 319 ...
## $ Age.Adjusted.Rate: num 342 299 299 237 222 ...
merged_airpo1519_cvdmort <- merge(airpo15_19, tx_cvd_2015_2019, by = "County")
summary(merged_airpo1519_cvdmort)
## County State Days Ozone Days PM2.5
## Length:208 Length:208 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.:151.5 1st Qu.: 0.0
## Mode :character Mode :character Median :228.0 Median :117.0
## Mean :219.1 Mean :123.8
## 3rd Qu.:348.8 3rd Qu.:181.0
## Max. :365.0 Max. :365.0
## Days PM10 County.Code Deaths Population
## Min. : 0.0000 Min. :48027 Min. : 32 Min. : 11040
## 1st Qu.: 0.0000 1st Qu.:48135 1st Qu.: 1211 1st Qu.: 461052
## Median : 0.0000 Median :48238 Median : 2042 Median : 853878
## Mean : 0.3029 Mean :48242 Mean : 4997 Mean : 2592470
## 3rd Qu.: 0.0000 3rd Qu.:48361 3rd Qu.: 4366 3rd Qu.: 2115087
## Max. :16.0000 Max. :48479 Max. :39707 Max. :23192880
## Crude.Rate Age.Adjusted.Rate
## Min. :120.2 Min. :164.3
## 1st Qu.:194.6 1st Qu.:211.2
## Median :251.6 Median :229.6
## Mean :257.4 Mean :242.6
## 3rd Qu.:329.1 3rd Qu.:283.7
## Max. :511.0 Max. :365.2
library(dplyr)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# Merge data frames
merged_airpo1519_cvdmort <- merge(airpo15_19, tx_cvd_2015_2019, by = "County")
# Read Texas county shapefile
texas_counties <- st_read("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp")
## Reading layer `tl_2019_us_county' from data source
## `/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS: NAD83
# Filter shapefile to only include Texas counties
texas_counties <- texas_counties %>% filter(STATEFP == "48")
# Merge with the merged data frame
merged_counties <- left_join(texas_counties, merged_airpo1519_cvdmort, by = c("NAME" = "County"))
# View summary of merged data
summary(merged_counties)
## STATEFP COUNTYFP COUNTYNS GEOID
## Length:420 Length:420 Length:420 Length:420
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## NAME NAMELSAD LSAD CLASSFP
## Length:420 Length:420 Length:420 Length:420
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## MTFCC CSAFP CBSAFP METDIVFP
## Length:420 Length:420 Length:420 Length:420
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## FUNCSTAT ALAND AWATER INTPTLAT
## Length:420 Min. :3.292e+08 Min. :1.984e+04 Length:420
## Class :character 1st Qu.:2.179e+09 1st Qu.:1.038e+07 Class :character
## Mode :character Median :2.346e+09 Median :3.187e+07 Mode :character
## Mean :2.797e+09 Mean :1.142e+08
## 3rd Qu.:2.700e+09 3rd Qu.:8.959e+07
## Max. :1.602e+10 Max. :1.362e+09
##
## INTPTLON State Days Ozone Days PM2.5
## Length:420 Length:420 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.:151.5 1st Qu.: 0.0
## Mode :character Mode :character Median :228.0 Median :117.0
## Mean :219.1 Mean :123.8
## 3rd Qu.:348.8 3rd Qu.:181.0
## Max. :365.0 Max. :365.0
## NA's :212 NA's :212
## Days PM10 County.Code Deaths Population
## Min. : 0.0000 Min. :48027 Min. : 32 Min. : 11040
## 1st Qu.: 0.0000 1st Qu.:48135 1st Qu.: 1211 1st Qu.: 461052
## Median : 0.0000 Median :48238 Median : 2042 Median : 853878
## Mean : 0.3029 Mean :48242 Mean : 4997 Mean : 2592470
## 3rd Qu.: 0.0000 3rd Qu.:48361 3rd Qu.: 4366 3rd Qu.: 2115087
## Max. :16.0000 Max. :48479 Max. :39707 Max. :23192880
## NA's :212 NA's :212 NA's :212 NA's :212
## Crude.Rate Age.Adjusted.Rate geometry
## Min. :120.2 Min. :164.3 MULTIPOLYGON :420
## 1st Qu.:194.6 1st Qu.:211.2 epsg:4269 : 0
## Median :251.6 Median :229.6 +proj=long...: 0
## Mean :257.4 Mean :242.6
## 3rd Qu.:329.1 3rd Qu.:283.7
## Max. :511.0 Max. :365.2
## NA's :212 NA's :212
summary(merged_airpo1519_cvdmort)
## County State Days Ozone Days PM2.5
## Length:208 Length:208 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.:151.5 1st Qu.: 0.0
## Mode :character Mode :character Median :228.0 Median :117.0
## Mean :219.1 Mean :123.8
## 3rd Qu.:348.8 3rd Qu.:181.0
## Max. :365.0 Max. :365.0
## Days PM10 County.Code Deaths Population
## Min. : 0.0000 Min. :48027 Min. : 32 Min. : 11040
## 1st Qu.: 0.0000 1st Qu.:48135 1st Qu.: 1211 1st Qu.: 461052
## Median : 0.0000 Median :48238 Median : 2042 Median : 853878
## Mean : 0.3029 Mean :48242 Mean : 4997 Mean : 2592470
## 3rd Qu.: 0.0000 3rd Qu.:48361 3rd Qu.: 4366 3rd Qu.: 2115087
## Max. :16.0000 Max. :48479 Max. :39707 Max. :23192880
## Crude.Rate Age.Adjusted.Rate
## Min. :120.2 Min. :164.3
## 1st Qu.:194.6 1st Qu.:211.2
## Median :251.6 Median :229.6
## Mean :257.4 Mean :242.6
## 3rd Qu.:329.1 3rd Qu.:283.7
## Max. :511.0 Max. :365.2
library(ggplot2)
library(sf)
library(viridis)
## Loading required package: viridisLite
# Read Texas county shapefile
texas_counties <- st_read("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp")
## Reading layer `tl_2019_us_county' from data source
## `/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS: NAD83
# Filter shapefile to only include Texas counties
texas_counties <- texas_counties %>% filter(STATEFP == "48")
# Merge with the merged data frame
merged_counties <- left_join(texas_counties, merged_airpo1519_cvdmort, by = c("NAME" = "County"))
# Create choropleth map
ggplot(data = merged_counties) +
geom_sf(aes(fill = Age.Adjusted.Rate)) +
scale_fill_viridis(option = "viridis", name = "Age-Adjusted\nMortality Rate", trans = "log10", direction = -1, na.value = "gray90",
guide = guide_colourbar(barwidth = 0.5, barheight = 5)) +
labs(title = "CVD Mortality by County in Texas, 2015-2019",
caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
theme_minimal() +
theme(panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "right",
plot.title = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0, size = 8))

# Load required libraries
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
## The following object is masked from 'package:readr':
##
## col_factor
library(ggplot2)
library(sf)
# Calculate combined pollution index
merged_airpo1519_cvdmort$Combined_Pollution <- merged_airpo1519_cvdmort$`Days Ozone` + merged_airpo1519_cvdmort$`Days PM2.5` + merged_airpo1519_cvdmort$`Days PM10`
# Merge data with the shapefile
merged_counties <- left_join(texas_counties, merged_airpo1519_cvdmort, by = c("NAME" = "County"))
# Create a choropleth map with ggplot2
ggplot() +
geom_sf(data = merged_counties, aes(fill = Age.Adjusted.Rate), color = "white", size = 0.1) +
scale_fill_viridis(option = "viridis", direction = -1, name = "Age-Adjusted CVD Mortality Rate", breaks = pretty_breaks(n = 5)) +
labs(title = "CVD Mortality Rate by Combined Pollution Index in Texas",
subtitle = "Based on Days Ozone, Days PM2.5, and Days PM10 for 2015-2019",
caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
theme_minimal() +
theme(panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 12),
plot.caption = element_text(hjust = 0, size = 8),
legend.position = "right")

# Load required libraries
library(dplyr)
# Calculate descriptive statistics
cvd_mortality_stats <- summary(merged_airpo1519_cvdmort$Age.Adjusted.Rate)
pollution_index_stats <- summary(merged_airpo1519_cvdmort$Combined_Pollution)
# Print the statistics
cat("CVD Mortality Rate (Age-Adjusted) Descriptive Statistics:\n")
## CVD Mortality Rate (Age-Adjusted) Descriptive Statistics:
print(cvd_mortality_stats)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 164.3 211.2 229.6 242.6 283.7 365.2
cat("\nCombined Pollution Index Descriptive Statistics:\n")
##
## Combined Pollution Index Descriptive Statistics:
print(pollution_index_stats)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 113.0 344.8 359.0 343.1 364.0 366.0
names(merged_airpo1519_cvdmort)
## [1] "County" "State" "Days Ozone"
## [4] "Days PM2.5" "Days PM10" "County.Code"
## [7] "Deaths" "Population" "Crude.Rate"
## [10] "Age.Adjusted.Rate" "Combined_Pollution"
# Create a scatter plot
ggplot(data = merged_airpo1519_cvdmort, aes(x = Combined_Pollution, y = Age.Adjusted.Rate)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "blue") +
labs(title = "CVD Mortality Rate vs. Combined Pollution Index in Texas (2015-2019)",
x = "Combined Pollution Index",
y = "Age-Adjusted CVD Mortality Rate",
caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0, size = 8))
## `geom_smooth()` using formula = 'y ~ x'

# Create a new column for pollution quartiles
merged_airpo1519_cvdmort$Pollution_Quartile <- ntile(merged_airpo1519_cvdmort$Combined_Pollution, 4)
# Create a box plot
ggplot(data = merged_airpo1519_cvdmort, aes(x = as.factor(Pollution_Quartile), y = Age.Adjusted.Rate)) +
geom_boxplot() +
labs(title = "CVD Mortality Rate by Pollution Index Quartiles in Texas (2015-2019)",
x = "Combined Pollution Index Quartiles",
y = "Age-Adjusted CVD Mortality Rate",
caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0, size = 8))

# Create a histogram
ggplot(data = merged_airpo1519_cvdmort, aes(x = Combined_Pollution)) +
geom_histogram(binwidth = 10, fill = "blue", alpha = 0.5) +
labs(title = "Histogram of Combined Pollution Index in Texas (2015-2019)",
x = "Combined Pollution Index",
y = "Frequency",
caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0, size = 8))

# Create a new data frame with the top 10 counties by combined pollution index and CVD mortality rate
top_10_counties <- merged_airpo1519_cvdmort %>%
select(County, Combined_Pollution, Age.Adjusted.Rate) %>%
arrange(desc(Combined_Pollution), desc(Age.Adjusted.Rate)) %>%
top_n(10)
## Selecting by Age.Adjusted.Rate
# Create a bar chart of the top 10 counties
ggplot(data = top_10_counties, aes(x = County, y = Combined_Pollution, fill = Age.Adjusted.Rate)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = round(Combined_Pollution, 1)), vjust = -0.5) +
scale_y_continuous(name = "Combined Pollution Index", limits = c(0, max(top_10_counties$Combined_Pollution) * 1.1)) +
scale_fill_viridis_c(name = "Age-Adjusted\nMortality Rate") +
coord_flip() +
labs(title = "Top 10 Counties by Combined Pollution Index and CVD Mortality Rate in Texas, 2015-2019",
y = "Combined Pollution Index",
x = NULL,
caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
theme_minimal()
## Warning: Removed 8 rows containing missing values (`geom_col()`).

# View information for Harris County
harris_info <- merged_airpo1519_cvdmort %>%
filter(County == "Harris") %>%
select(County, State, Combined_Pollution, Age.Adjusted.Rate)
# View information for Polk County
polk_info <- merged_airpo1519_cvdmort %>%
filter(County == "Polk") %>%
select(County, State, Combined_Pollution, Age.Adjusted.Rate)
# Filter merged dataset to include only Harris and Polk counties
harris_polk_data <- merged_airpo1519_cvdmort %>%
filter(County %in% c("Harris", "Polk"))
# Print information for Harris and Polk counties
harris_polk_data
## County State Days Ozone Days PM2.5 Days PM10 County.Code Deaths Population
## 1 Harris Texas 126 201 16 48201 39707 23192880
## 2 Harris Texas 131 200 3 48201 39707 23192880
## 3 Harris Texas 155 181 2 48201 39707 23192880
## 4 Harris Texas 169 159 6 48201 39707 23192880
## 5 Harris Texas 155 185 3 48201 39707 23192880
## 6 Polk Texas 327 0 0 48373 1035 245434
## 7 Polk Texas 346 0 0 48373 1035 245434
## 8 Polk Texas 365 0 0 48373 1035 245434
## 9 Polk Texas 342 0 0 48373 1035 245434
## 10 Polk Texas 344 0 0 48373 1035 245434
## Crude.Rate Age.Adjusted.Rate Combined_Pollution Pollution_Quartile
## 1 171.2 216.2 343 1
## 2 171.2 216.2 334 1
## 3 171.2 216.2 338 1
## 4 171.2 216.2 334 1
## 5 171.2 216.2 343 1
## 6 421.7 365.2 327 1
## 7 421.7 365.2 346 2
## 8 421.7 365.2 365 4
## 9 421.7 365.2 342 1
## 10 421.7 365.2 344 1