library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.1 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tmap)
library(ggplot2)
# Read in the 2010-2014 file
tx_resp_2010_2014 <- read.table("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/CDC2010-2014 Respiratory.txt", sep="\t", header=TRUE, fill=TRUE)
# Read in the 2015-2019 file
tx_resp_2015_2019 <- read.table("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/CDC2015-2019 Respiratory.txt", sep="\t", header=TRUE, fill=TRUE)
library(dplyr)
# Remove notes column
tx_resp_2010_2014 <- select(tx_resp_2010_2014, -Notes)
tx_resp_2015_2019 <- select(tx_resp_2015_2019, -Notes)
# Convert Deaths column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Deaths <- as.numeric(tx_resp_2010_2014$Deaths)
## Warning: NAs introduced by coercion
# Convert Crude.Rate column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Crude.Rate <- as.numeric(tx_resp_2010_2014$Crude.Rate)
## Warning: NAs introduced by coercion
# Convert Age.Adjusted.Rate column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Age.Adjusted.Rate <- as.numeric(tx_resp_2010_2014$Age.Adjusted.Rate)
## Warning: NAs introduced by coercion
# Repeat for tx_resp_2015_2019
# Convert Deaths column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Deaths <- as.numeric(tx_resp_2015_2019$Deaths)
## Warning: NAs introduced by coercion
# Convert Crude.Rate column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Crude.Rate <- as.numeric(tx_resp_2015_2019$Crude.Rate)
## Warning: NAs introduced by coercion
# Convert Age.Adjusted.Rate column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Age.Adjusted.Rate <- as.numeric(tx_resp_2015_2019$Age.Adjusted.Rate)
## Warning: NAs introduced by coercion
# Replace dots with NA in tx_resp_2010_2014
tx_resp_2010_2014[tx_resp_2010_2014 == "."] <- NA
# Replace dots with NA in tx_resp_2015_2019
tx_resp_2015_2019[tx_resp_2015_2019 == "."] <- NA
# Replace "Suppressed" values with NA in Deaths column of tx_resp_2010_2014
tx_resp_2010_2014$Deaths[tx_resp_2010_2014$Deaths == "Suppressed"] <- NA
# Convert Deaths column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Deaths <- as.numeric(tx_resp_2010_2014$Deaths)
# Replace "Suppressed" values with NA in Deaths column of tx_resp_2015_2019
tx_resp_2015_2019$Deaths[tx_resp_2015_2019$Deaths == "Suppressed"] <- NA
# Convert Deaths column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Deaths <- as.numeric(tx_resp_2015_2019$Deaths)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# Read Texas county shapefile
texas_counties <- st_read("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp")
## Reading layer `tl_2019_us_county' from data source
## `/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS: NAD83
#Variables need to be standardize.
# Standardize county names and rename the column
tx_resp_2010_2014$County <- gsub(" County, TX", "", tx_resp_2010_2014$County)
names(tx_resp_2010_2014)[names(tx_resp_2010_2014) == "County"] <- "NAME"
tx_resp_2015_2019$County <- gsub(" County, TX", "", tx_resp_2015_2019$County)
names(tx_resp_2015_2019)[names(tx_resp_2015_2019) == "County"] <- "NAME"
# Rename the 'County.Code' column to 'GEOID' and convert to character
tx_resp_2010_2014$GEOID <- as.character(tx_resp_2010_2014$County.Code)
tx_resp_2015_2019$GEOID <- as.character(tx_resp_2015_2019$County.Code)
# Remove the 'County.Code' column
tx_resp_2010_2014 <- tx_resp_2010_2014[, !names(tx_resp_2010_2014) %in% "County.Code"]
tx_resp_2015_2019 <- tx_resp_2015_2019[, !names(tx_resp_2015_2019) %in% "County.Code"]
#Merging the datasets using the 'GEOID' column
regions_map_2010_2014 <- merge(texas_counties, tx_resp_2010_2014, by = "GEOID")
regions_map_2015_2019 <- merge(texas_counties, tx_resp_2015_2019, by = "GEOID")
#Review both datasets and variables.
names(texas_counties)
## [1] "STATEFP" "COUNTYFP" "COUNTYNS" "GEOID" "NAME" "NAMELSAD"
## [7] "LSAD" "CLASSFP" "MTFCC" "CSAFP" "CBSAFP" "METDIVFP"
## [13] "FUNCSTAT" "ALAND" "AWATER" "INTPTLAT" "INTPTLON" "geometry"
str(texas_counties)
## Classes 'sf' and 'data.frame': 3233 obs. of 18 variables:
## $ STATEFP : chr "31" "53" "35" "31" ...
## $ COUNTYFP: chr "039" "069" "011" "109" ...
## $ COUNTYNS: chr "00835841" "01513275" "00933054" "00835876" ...
## $ GEOID : chr "31039" "53069" "35011" "31109" ...
## $ NAME : chr "Cuming" "Wahkiakum" "De Baca" "Lancaster" ...
## $ NAMELSAD: chr "Cuming County" "Wahkiakum County" "De Baca County" "Lancaster County" ...
## $ LSAD : chr "06" "06" "06" "06" ...
## $ CLASSFP : chr "H1" "H1" "H1" "H1" ...
## $ MTFCC : chr "G4020" "G4020" "G4020" "G4020" ...
## $ CSAFP : chr NA NA NA "339" ...
## $ CBSAFP : chr NA NA NA "30700" ...
## $ METDIVFP: chr NA NA NA NA ...
## $ FUNCSTAT: chr "A" "A" "A" "A" ...
## $ ALAND : num 1.48e+09 6.81e+08 6.02e+09 2.17e+09 1.49e+09 ...
## $ AWATER : num 10690952 61582307 29089486 22849484 1718484 ...
## $ INTPTLAT: chr "+41.9158651" "+46.2946377" "+34.3592729" "+40.7835474" ...
## $ INTPTLON: chr "-096.7885168" "-123.4244583" "-104.3686961" "-096.6886584" ...
## $ geometry:sfc_MULTIPOLYGON of length 3233; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:1320, 1:2] -97 -97 -97 -97 -97 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "names")= chr [1:17] "STATEFP" "COUNTYFP" "COUNTYNS" "GEOID" ...
names(tx_resp_2010_2014)
## [1] "NAME" "Deaths" "Population"
## [4] "Crude.Rate" "Age.Adjusted.Rate" "GEOID"
str(tx_resp_2010_2014)
## 'data.frame': 313 obs. of 6 variables:
## $ NAME : chr "Anderson" "Andrews" "Angelina" "Aransas" ...
## $ Deaths : num 268 66 354 171 29 14 172 131 32 103 ...
## $ Population : int 290521 80624 437228 119678 44123 9677 231803 143661 35566 103053 ...
## $ Crude.Rate : num 92.2 81.9 81 142.9 65.7 ...
## $ Age.Adjusted.Rate: num 90 95.9 71.4 85 51.3 NA 71.8 74.4 80.8 68.9 ...
## $ GEOID : chr "48001" "48003" "48005" "48007" ...
library(ggplot2)
library(sf)
# Convert the merged datasets to sf objects
regions_map_2010_2014_sf <- st_as_sf(regions_map_2010_2014)
regions_map_2015_2019_sf <- st_as_sf(regions_map_2015_2019)
# Create a function to plot the maps
plot_map <- function(data, title) {
ggplot() +
geom_sf(data = data, aes(fill = Deaths)) +
scale_fill_viridis_c(trans = "log10", name = "Deaths") +
theme_minimal() +
theme(panel.grid.major = element_line(colour = "white"),
legend.position = "bottom",
axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 1)) +
labs(title = title,
caption = paste("Source: CDC Wonder |",
"Author: Julie Gonzalez"))
}
# Plot the maps
plot_2010_2014 <- plot_map(regions_map_2010_2014_sf, "Respiratory Deaths in Texas (2010-2014)")
plot_2015_2019 <- plot_map(regions_map_2015_2019_sf, "Respiratory Deaths in Texas (2015-2019)")
# Display the maps
plot_2010_2014

plot_2015_2019

# Check the variable names in the merged dataset
names(regions_map_2010_2014)
## [1] "GEOID" "STATEFP" "COUNTYFP"
## [4] "COUNTYNS" "NAME.x" "NAMELSAD"
## [7] "LSAD" "CLASSFP" "MTFCC"
## [10] "CSAFP" "CBSAFP" "METDIVFP"
## [13] "FUNCSTAT" "ALAND" "AWATER"
## [16] "INTPTLAT" "INTPTLON" "NAME.y"
## [19] "Deaths" "Population" "Crude.Rate"
## [22] "Age.Adjusted.Rate" "geometry"
# Check the summary statistics of the 'Crude.Rate' variable
summary(regions_map_2010_2014$Crude.Rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 32.60 79.67 99.30 105.25 128.95 246.00 34
# Create a histogram of the 'Crude.Rate' variable
hist(regions_map_2010_2014$Crude.Rate, breaks = 20, main = "Histogram of Crude Mortality Rates in Texas (2010-2014)")

# Check the variable names in the merged dataset
names(regions_map_2015_2019)
## [1] "GEOID" "STATEFP" "COUNTYFP"
## [4] "COUNTYNS" "NAME.x" "NAMELSAD"
## [7] "LSAD" "CLASSFP" "MTFCC"
## [10] "CSAFP" "CBSAFP" "METDIVFP"
## [13] "FUNCSTAT" "ALAND" "AWATER"
## [16] "INTPTLAT" "INTPTLON" "NAME.y"
## [19] "Deaths" "Population" "Crude.Rate"
## [22] "Age.Adjusted.Rate" "geometry"
# Check the summary statistics of the 'Crude.Rate' variable
summary(regions_map_2015_2019$Crude.Rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 78.78 103.05 109.49 132.20 243.80 34
# Create a histogram of the 'Crude.Rate' variable
hist(regions_map_2015_2019$Crude.Rate, breaks = 20, main = "Histogram of Crude Mortality Rates in Texas (2015-2019)")

# # Bar plots
# ggplot(top10_deaths_2010_2014, aes(x = reorder(NAME, -Deaths), y = Deaths)) +
# geom_bar(stat = "identity", fill = "steelblue") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
# labs(title = "Top 10 Counties with Highest Respiratory Deaths (2010-2014)", x = "County", y = "Deaths")
#
# ggplot(top10_deaths_2015_2019, aes(x = reorder(NAME, -Deaths), y = Deaths)) +
# geom_bar(stat = "identity", fill = "steelblue") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
# labs(title = "Top 10 Counties with Highest Respiratory Deaths (2015-2019)", x = "County", y = "Deaths")