library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.1     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tmap)
library(ggplot2)
# Read in the 2010-2014 file
tx_resp_2010_2014 <- read.table("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/CDC2010-2014 Respiratory.txt", sep="\t", header=TRUE, fill=TRUE)

# Read in the 2015-2019 file
tx_resp_2015_2019 <- read.table("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/CDC2015-2019 Respiratory.txt", sep="\t", header=TRUE, fill=TRUE)
library(dplyr)

# Remove notes column
tx_resp_2010_2014 <- select(tx_resp_2010_2014, -Notes)
tx_resp_2015_2019 <- select(tx_resp_2015_2019, -Notes)
# Convert Deaths column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Deaths <- as.numeric(tx_resp_2010_2014$Deaths)
## Warning: NAs introduced by coercion
# Convert Crude.Rate column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Crude.Rate <- as.numeric(tx_resp_2010_2014$Crude.Rate)
## Warning: NAs introduced by coercion
# Convert Age.Adjusted.Rate column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Age.Adjusted.Rate <- as.numeric(tx_resp_2010_2014$Age.Adjusted.Rate)
## Warning: NAs introduced by coercion
# Repeat for tx_resp_2015_2019
# Convert Deaths column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Deaths <- as.numeric(tx_resp_2015_2019$Deaths)
## Warning: NAs introduced by coercion
# Convert Crude.Rate column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Crude.Rate <- as.numeric(tx_resp_2015_2019$Crude.Rate)
## Warning: NAs introduced by coercion
# Convert Age.Adjusted.Rate column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Age.Adjusted.Rate <- as.numeric(tx_resp_2015_2019$Age.Adjusted.Rate)
## Warning: NAs introduced by coercion
# Replace dots with NA in tx_resp_2010_2014
tx_resp_2010_2014[tx_resp_2010_2014 == "."] <- NA

# Replace dots with NA in tx_resp_2015_2019
tx_resp_2015_2019[tx_resp_2015_2019 == "."] <- NA

# Replace "Suppressed" values with NA in Deaths column of tx_resp_2010_2014
tx_resp_2010_2014$Deaths[tx_resp_2010_2014$Deaths == "Suppressed"] <- NA

# Convert Deaths column to numeric in tx_resp_2010_2014
tx_resp_2010_2014$Deaths <- as.numeric(tx_resp_2010_2014$Deaths)

# Replace "Suppressed" values with NA in Deaths column of tx_resp_2015_2019
tx_resp_2015_2019$Deaths[tx_resp_2015_2019$Deaths == "Suppressed"] <- NA

# Convert Deaths column to numeric in tx_resp_2015_2019
tx_resp_2015_2019$Deaths <- as.numeric(tx_resp_2015_2019$Deaths)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# Read Texas county shapefile 
texas_counties <- st_read("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp")
## Reading layer `tl_2019_us_county' from data source 
##   `/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS:  NAD83
#Variables need to be standardize. 
# Standardize county names and rename the column
tx_resp_2010_2014$County <- gsub(" County, TX", "", tx_resp_2010_2014$County)
names(tx_resp_2010_2014)[names(tx_resp_2010_2014) == "County"] <- "NAME"

tx_resp_2015_2019$County <- gsub(" County, TX", "", tx_resp_2015_2019$County)
names(tx_resp_2015_2019)[names(tx_resp_2015_2019) == "County"] <- "NAME"
# Rename the 'County.Code' column to 'GEOID' and convert to character
tx_resp_2010_2014$GEOID <- as.character(tx_resp_2010_2014$County.Code)
tx_resp_2015_2019$GEOID <- as.character(tx_resp_2015_2019$County.Code)

# Remove the 'County.Code' column
tx_resp_2010_2014 <- tx_resp_2010_2014[, !names(tx_resp_2010_2014) %in% "County.Code"]
tx_resp_2015_2019 <- tx_resp_2015_2019[, !names(tx_resp_2015_2019) %in% "County.Code"]
#Merging the datasets using the 'GEOID' column
regions_map_2010_2014 <- merge(texas_counties, tx_resp_2010_2014, by = "GEOID")
regions_map_2015_2019 <- merge(texas_counties, tx_resp_2015_2019, by = "GEOID")
#Review both datasets and variables. 
names(texas_counties)
##  [1] "STATEFP"  "COUNTYFP" "COUNTYNS" "GEOID"    "NAME"     "NAMELSAD"
##  [7] "LSAD"     "CLASSFP"  "MTFCC"    "CSAFP"    "CBSAFP"   "METDIVFP"
## [13] "FUNCSTAT" "ALAND"    "AWATER"   "INTPTLAT" "INTPTLON" "geometry"
str(texas_counties)
## Classes 'sf' and 'data.frame':   3233 obs. of  18 variables:
##  $ STATEFP : chr  "31" "53" "35" "31" ...
##  $ COUNTYFP: chr  "039" "069" "011" "109" ...
##  $ COUNTYNS: chr  "00835841" "01513275" "00933054" "00835876" ...
##  $ GEOID   : chr  "31039" "53069" "35011" "31109" ...
##  $ NAME    : chr  "Cuming" "Wahkiakum" "De Baca" "Lancaster" ...
##  $ NAMELSAD: chr  "Cuming County" "Wahkiakum County" "De Baca County" "Lancaster County" ...
##  $ LSAD    : chr  "06" "06" "06" "06" ...
##  $ CLASSFP : chr  "H1" "H1" "H1" "H1" ...
##  $ MTFCC   : chr  "G4020" "G4020" "G4020" "G4020" ...
##  $ CSAFP   : chr  NA NA NA "339" ...
##  $ CBSAFP  : chr  NA NA NA "30700" ...
##  $ METDIVFP: chr  NA NA NA NA ...
##  $ FUNCSTAT: chr  "A" "A" "A" "A" ...
##  $ ALAND   : num  1.48e+09 6.81e+08 6.02e+09 2.17e+09 1.49e+09 ...
##  $ AWATER  : num  10690952 61582307 29089486 22849484 1718484 ...
##  $ INTPTLAT: chr  "+41.9158651" "+46.2946377" "+34.3592729" "+40.7835474" ...
##  $ INTPTLON: chr  "-096.7885168" "-123.4244583" "-104.3686961" "-096.6886584" ...
##  $ geometry:sfc_MULTIPOLYGON of length 3233; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:1320, 1:2] -97 -97 -97 -97 -97 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:17] "STATEFP" "COUNTYFP" "COUNTYNS" "GEOID" ...
names(tx_resp_2010_2014)
## [1] "NAME"              "Deaths"            "Population"       
## [4] "Crude.Rate"        "Age.Adjusted.Rate" "GEOID"
str(tx_resp_2010_2014)
## 'data.frame':    313 obs. of  6 variables:
##  $ NAME             : chr  "Anderson" "Andrews" "Angelina" "Aransas" ...
##  $ Deaths           : num  268 66 354 171 29 14 172 131 32 103 ...
##  $ Population       : int  290521 80624 437228 119678 44123 9677 231803 143661 35566 103053 ...
##  $ Crude.Rate       : num  92.2 81.9 81 142.9 65.7 ...
##  $ Age.Adjusted.Rate: num  90 95.9 71.4 85 51.3 NA 71.8 74.4 80.8 68.9 ...
##  $ GEOID            : chr  "48001" "48003" "48005" "48007" ...
library(ggplot2)
library(sf)

# Convert the merged datasets to sf objects
regions_map_2010_2014_sf <- st_as_sf(regions_map_2010_2014)
regions_map_2015_2019_sf <- st_as_sf(regions_map_2015_2019)

# Create a function to plot the maps
plot_map <- function(data, title) {
  ggplot() +
    geom_sf(data = data, aes(fill = Deaths)) +
    scale_fill_viridis_c(trans = "log10", name = "Deaths") +
    theme_minimal() +
    theme(panel.grid.major = element_line(colour = "white"),
          legend.position = "bottom",
          axis.text = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          plot.title = element_text(hjust = 0.5),
          plot.caption = element_text(hjust = 1)) +
    labs(title = title,
         caption = paste("Source: CDC Wonder |",
                         "Author: Julie Gonzalez"))
}

# Plot the maps
plot_2010_2014 <- plot_map(regions_map_2010_2014_sf, "Respiratory Deaths in Texas (2010-2014)")
plot_2015_2019 <- plot_map(regions_map_2015_2019_sf, "Respiratory Deaths in Texas (2015-2019)")

# Display the maps
plot_2010_2014

plot_2015_2019

# Check the variable names in the merged dataset
names(regions_map_2010_2014)
##  [1] "GEOID"             "STATEFP"           "COUNTYFP"         
##  [4] "COUNTYNS"          "NAME.x"            "NAMELSAD"         
##  [7] "LSAD"              "CLASSFP"           "MTFCC"            
## [10] "CSAFP"             "CBSAFP"            "METDIVFP"         
## [13] "FUNCSTAT"          "ALAND"             "AWATER"           
## [16] "INTPTLAT"          "INTPTLON"          "NAME.y"           
## [19] "Deaths"            "Population"        "Crude.Rate"       
## [22] "Age.Adjusted.Rate" "geometry"
# Check the summary statistics of the 'Crude.Rate' variable
summary(regions_map_2010_2014$Crude.Rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   32.60   79.67   99.30  105.25  128.95  246.00      34
# Create a histogram of the 'Crude.Rate' variable
hist(regions_map_2010_2014$Crude.Rate, breaks = 20, main = "Histogram of Crude Mortality Rates in Texas (2010-2014)")

# Check the variable names in the merged dataset
names(regions_map_2015_2019)
##  [1] "GEOID"             "STATEFP"           "COUNTYFP"         
##  [4] "COUNTYNS"          "NAME.x"            "NAMELSAD"         
##  [7] "LSAD"              "CLASSFP"           "MTFCC"            
## [10] "CSAFP"             "CBSAFP"            "METDIVFP"         
## [13] "FUNCSTAT"          "ALAND"             "AWATER"           
## [16] "INTPTLAT"          "INTPTLON"          "NAME.y"           
## [19] "Deaths"            "Population"        "Crude.Rate"       
## [22] "Age.Adjusted.Rate" "geometry"
# Check the summary statistics of the 'Crude.Rate' variable
summary(regions_map_2015_2019$Crude.Rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   36.00   78.78  103.05  109.49  132.20  243.80      34
# Create a histogram of the 'Crude.Rate' variable
hist(regions_map_2015_2019$Crude.Rate, breaks = 20, main = "Histogram of Crude Mortality Rates in Texas (2015-2019)")

# # Bar plots
# ggplot(top10_deaths_2010_2014, aes(x = reorder(NAME, -Deaths), y = Deaths)) +
#   geom_bar(stat = "identity", fill = "steelblue") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
#   labs(title = "Top 10 Counties with Highest Respiratory Deaths (2010-2014)", x = "County", y = "Deaths")
# 
# ggplot(top10_deaths_2015_2019, aes(x = reorder(NAME, -Deaths), y = Deaths)) +
#   geom_bar(stat = "identity", fill = "steelblue") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
#   labs(title = "Top 10 Counties with Highest Respiratory Deaths (2015-2019)", x = "County", y = "Deaths")