# Load the necessary packages
library(readr)
library(dplyr)
library(bit)
library(haven)

#Read in Air pollution data
airpo19 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2019.csv")

airpo18 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2018.csv")

airpo17 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2017.csv")

airpo16 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2016.csv")

airpo15 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2015.csv")


#Read in CVD Mortality Data 



# Read in the 2015-2019 file
tx_cvd_2015_2019 <- read.table("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/CVD 2015-2019.txt", sep="\t", header=TRUE, fill=TRUE)

tx_cvd_2015_2019 <- select(tx_cvd_2015_2019, -Notes)
#one database
airpo15_19 <- rbind(airpo19, airpo18, airpo17, airpo16, airpo15)
#Clean some stuff
#CVD Mortality Data

# Repeat for tx_cvd_2015_2019
# Convert Deaths column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths <- as.numeric(tx_cvd_2015_2019$Deaths)
## Warning: NAs introduced by coercion
# Convert Crude.Rate column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Crude.Rate <- as.numeric(tx_cvd_2015_2019$Crude.Rate)
## Warning: NAs introduced by coercion
# Convert Age.Adjusted.Rate column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Age.Adjusted.Rate <- as.numeric(tx_cvd_2015_2019$Age.Adjusted.Rate)
## Warning: NAs introduced by coercion
# Replace dots with NA in tx_cvd_2015_2019
tx_cvd_2015_2019[tx_cvd_2015_2019 == "."] <- NA

# Replace "Suppressed" values with NA in Deaths column of tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths[tx_cvd_2015_2019$Deaths == "Suppressed"] <- NA

# Convert Deaths column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths <- as.numeric(tx_cvd_2015_2019$Deaths)

# remove ", TX" from County column
tx_cvd_2015_2019$County <- gsub(" County, TX", "", tx_cvd_2015_2019$County)

# Filter and select the data of interest
airpo15_19 <- airpo15_19 %>%
  filter(State == "Texas",
         ) %>%
  select(State, County, `Days Ozone`, `Days PM2.5`, `Days PM10`)
names(airpo15_19)
## [1] "State"      "County"     "Days Ozone" "Days PM2.5" "Days PM10"
names(tx_cvd_2015_2019)
## [1] "County"            "County.Code"       "Deaths"           
## [4] "Population"        "Crude.Rate"        "Age.Adjusted.Rate"
str(airpo15_19)
## tibble [208 × 5] (S3: tbl_df/tbl/data.frame)
##  $ State     : chr [1:208] "Texas" "Texas" "Texas" "Texas" ...
##  $ County    : chr [1:208] "Bell" "Bexar" "Bowie" "Brazoria" ...
##  $ Days Ozone: num [1:208] 265 161 0 360 308 128 365 103 199 287 ...
##  $ Days PM2.5: num [1:208] 99 200 349 0 54 237 0 73 146 72 ...
##  $ Days PM10 : num [1:208] 0 1 0 0 0 0 0 0 0 0 ...
str(tx_cvd_2015_2019)
## 'data.frame':    311 obs. of  6 variables:
##  $ County           : chr  "Anderson" "Andrews" "Angelina" "Aransas" ...
##  $ County.Code      : int  48001 48003 48005 48007 48009 48011 48013 48015 48017 48019 ...
##  $ Deaths           : num  1068 226 1581 519 139 ...
##  $ Population       : int  288847 90420 437658 123945 43566 9481 247676 149128 35495 111332 ...
##  $ Crude.Rate       : num  370 250 361 419 319 ...
##  $ Age.Adjusted.Rate: num  342 299 299 237 222 ...
merged_airpo1519_cvdmort <- merge(airpo15_19, tx_cvd_2015_2019, by = "County")
summary(merged_airpo1519_cvdmort)
##     County             State             Days Ozone      Days PM2.5   
##  Length:208         Length:208         Min.   :  0.0   Min.   :  0.0  
##  Class :character   Class :character   1st Qu.:151.5   1st Qu.:  0.0  
##  Mode  :character   Mode  :character   Median :228.0   Median :117.0  
##                                        Mean   :219.1   Mean   :123.8  
##                                        3rd Qu.:348.8   3rd Qu.:181.0  
##                                        Max.   :365.0   Max.   :365.0  
##    Days PM10        County.Code        Deaths        Population      
##  Min.   : 0.0000   Min.   :48027   Min.   :   32   Min.   :   11040  
##  1st Qu.: 0.0000   1st Qu.:48135   1st Qu.: 1211   1st Qu.:  461052  
##  Median : 0.0000   Median :48238   Median : 2042   Median :  853878  
##  Mean   : 0.3029   Mean   :48242   Mean   : 4997   Mean   : 2592470  
##  3rd Qu.: 0.0000   3rd Qu.:48361   3rd Qu.: 4366   3rd Qu.: 2115087  
##  Max.   :16.0000   Max.   :48479   Max.   :39707   Max.   :23192880  
##    Crude.Rate    Age.Adjusted.Rate
##  Min.   :120.2   Min.   :164.3    
##  1st Qu.:194.6   1st Qu.:211.2    
##  Median :251.6   Median :229.6    
##  Mean   :257.4   Mean   :242.6    
##  3rd Qu.:329.1   3rd Qu.:283.7    
##  Max.   :511.0   Max.   :365.2
library(dplyr)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# Merge data frames
merged_airpo1519_cvdmort <- merge(airpo15_19, tx_cvd_2015_2019, by = "County")

# Read Texas county shapefile 
texas_counties <- st_read("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp")
## Reading layer `tl_2019_us_county' from data source 
##   `/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS:  NAD83
# Filter shapefile to only include Texas counties
texas_counties <- texas_counties %>% filter(STATEFP == "48")

# Merge with the merged data frame
merged_counties <- left_join(texas_counties, merged_airpo1519_cvdmort, by = c("NAME" = "County"))

# View summary of merged data
summary(merged_counties)
##    STATEFP            COUNTYFP           COUNTYNS            GEOID          
##  Length:420         Length:420         Length:420         Length:420        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      NAME             NAMELSAD             LSAD             CLASSFP         
##  Length:420         Length:420         Length:420         Length:420        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     MTFCC              CSAFP              CBSAFP            METDIVFP        
##  Length:420         Length:420         Length:420         Length:420        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    FUNCSTAT             ALAND               AWATER            INTPTLAT        
##  Length:420         Min.   :3.292e+08   Min.   :1.984e+04   Length:420        
##  Class :character   1st Qu.:2.179e+09   1st Qu.:1.038e+07   Class :character  
##  Mode  :character   Median :2.346e+09   Median :3.187e+07   Mode  :character  
##                     Mean   :2.797e+09   Mean   :1.142e+08                     
##                     3rd Qu.:2.700e+09   3rd Qu.:8.959e+07                     
##                     Max.   :1.602e+10   Max.   :1.362e+09                     
##                                                                               
##    INTPTLON            State             Days Ozone      Days PM2.5   
##  Length:420         Length:420         Min.   :  0.0   Min.   :  0.0  
##  Class :character   Class :character   1st Qu.:151.5   1st Qu.:  0.0  
##  Mode  :character   Mode  :character   Median :228.0   Median :117.0  
##                                        Mean   :219.1   Mean   :123.8  
##                                        3rd Qu.:348.8   3rd Qu.:181.0  
##                                        Max.   :365.0   Max.   :365.0  
##                                        NA's   :212     NA's   :212    
##    Days PM10        County.Code        Deaths        Population      
##  Min.   : 0.0000   Min.   :48027   Min.   :   32   Min.   :   11040  
##  1st Qu.: 0.0000   1st Qu.:48135   1st Qu.: 1211   1st Qu.:  461052  
##  Median : 0.0000   Median :48238   Median : 2042   Median :  853878  
##  Mean   : 0.3029   Mean   :48242   Mean   : 4997   Mean   : 2592470  
##  3rd Qu.: 0.0000   3rd Qu.:48361   3rd Qu.: 4366   3rd Qu.: 2115087  
##  Max.   :16.0000   Max.   :48479   Max.   :39707   Max.   :23192880  
##  NA's   :212       NA's   :212     NA's   :212     NA's   :212       
##    Crude.Rate    Age.Adjusted.Rate          geometry  
##  Min.   :120.2   Min.   :164.3     MULTIPOLYGON :420  
##  1st Qu.:194.6   1st Qu.:211.2     epsg:4269    :  0  
##  Median :251.6   Median :229.6     +proj=long...:  0  
##  Mean   :257.4   Mean   :242.6                        
##  3rd Qu.:329.1   3rd Qu.:283.7                        
##  Max.   :511.0   Max.   :365.2                        
##  NA's   :212     NA's   :212
summary(merged_airpo1519_cvdmort)
##     County             State             Days Ozone      Days PM2.5   
##  Length:208         Length:208         Min.   :  0.0   Min.   :  0.0  
##  Class :character   Class :character   1st Qu.:151.5   1st Qu.:  0.0  
##  Mode  :character   Mode  :character   Median :228.0   Median :117.0  
##                                        Mean   :219.1   Mean   :123.8  
##                                        3rd Qu.:348.8   3rd Qu.:181.0  
##                                        Max.   :365.0   Max.   :365.0  
##    Days PM10        County.Code        Deaths        Population      
##  Min.   : 0.0000   Min.   :48027   Min.   :   32   Min.   :   11040  
##  1st Qu.: 0.0000   1st Qu.:48135   1st Qu.: 1211   1st Qu.:  461052  
##  Median : 0.0000   Median :48238   Median : 2042   Median :  853878  
##  Mean   : 0.3029   Mean   :48242   Mean   : 4997   Mean   : 2592470  
##  3rd Qu.: 0.0000   3rd Qu.:48361   3rd Qu.: 4366   3rd Qu.: 2115087  
##  Max.   :16.0000   Max.   :48479   Max.   :39707   Max.   :23192880  
##    Crude.Rate    Age.Adjusted.Rate
##  Min.   :120.2   Min.   :164.3    
##  1st Qu.:194.6   1st Qu.:211.2    
##  Median :251.6   Median :229.6    
##  Mean   :257.4   Mean   :242.6    
##  3rd Qu.:329.1   3rd Qu.:283.7    
##  Max.   :511.0   Max.   :365.2
library(ggplot2)
library(sf)
library(viridis)
## Loading required package: viridisLite
# Read Texas county shapefile 
texas_counties <- st_read("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp")
## Reading layer `tl_2019_us_county' from data source 
##   `/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS:  NAD83
# Filter shapefile to only include Texas counties
texas_counties <- texas_counties %>% filter(STATEFP == "48")

# Merge with the merged data frame
merged_counties <- left_join(texas_counties, merged_airpo1519_cvdmort, by = c("NAME" = "County"))

# Create choropleth map
ggplot(data = merged_counties) +
  geom_sf(aes(fill = Age.Adjusted.Rate)) +
  scale_fill_viridis(option = "viridis", name = "Age-Adjusted\nMortality Rate", trans = "log10", direction = -1, na.value = "gray90",
                     guide = guide_colourbar(barwidth = 0.5, barheight = 5)) +
  labs(title = "CVD Mortality by County in Texas, 2015-2019",
       caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        legend.position = "right",
        plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(hjust = 0, size = 8))

# Load required libraries
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggplot2)
library(sf)



# Calculate combined pollution index
merged_airpo1519_cvdmort$Combined_Pollution <- merged_airpo1519_cvdmort$`Days Ozone` + merged_airpo1519_cvdmort$`Days PM2.5` + merged_airpo1519_cvdmort$`Days PM10`

# Merge data with the shapefile
merged_counties <- left_join(texas_counties, merged_airpo1519_cvdmort, by = c("NAME" = "County"))

# Create a choropleth map with ggplot2
ggplot() +
  geom_sf(data = merged_counties, aes(fill = Age.Adjusted.Rate), color = "white", size = 0.1) +
  scale_fill_viridis(option = "viridis", direction = -1, name = "Age-Adjusted CVD Mortality Rate", breaks = pretty_breaks(n = 5)) +
  labs(title = "CVD Mortality Rate by Combined Pollution Index in Texas",
       subtitle = "Based on Days Ozone, Days PM2.5, and Days PM10 for 2015-2019",
       caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        plot.title = element_text(hjust = 0.5, size = 14),
        plot.subtitle = element_text(hjust = 0.5, size = 12),
        plot.caption = element_text(hjust = 0, size = 8),
        legend.position = "right")

# Load required libraries
library(dplyr)

# Calculate descriptive statistics
cvd_mortality_stats <- summary(merged_airpo1519_cvdmort$Age.Adjusted.Rate)
pollution_index_stats <- summary(merged_airpo1519_cvdmort$Combined_Pollution)

# Print the statistics
cat("CVD Mortality Rate (Age-Adjusted) Descriptive Statistics:\n")
## CVD Mortality Rate (Age-Adjusted) Descriptive Statistics:
print(cvd_mortality_stats)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   164.3   211.2   229.6   242.6   283.7   365.2
cat("\nCombined Pollution Index Descriptive Statistics:\n")
## 
## Combined Pollution Index Descriptive Statistics:
print(pollution_index_stats)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   113.0   344.8   359.0   343.1   364.0   366.0
names(merged_airpo1519_cvdmort)
##  [1] "County"             "State"              "Days Ozone"        
##  [4] "Days PM2.5"         "Days PM10"          "County.Code"       
##  [7] "Deaths"             "Population"         "Crude.Rate"        
## [10] "Age.Adjusted.Rate"  "Combined_Pollution"
# Create a scatter plot
ggplot(data = merged_airpo1519_cvdmort, aes(x = Combined_Pollution, y = Age.Adjusted.Rate)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "blue") +
  labs(title = "CVD Mortality Rate vs. Combined Pollution Index in Texas (2015-2019)",
       x = "Combined Pollution Index",
       y = "Age-Adjusted CVD Mortality Rate",
       caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(hjust = 0, size = 8))
## `geom_smooth()` using formula = 'y ~ x'

# Create a new column for pollution quartiles
merged_airpo1519_cvdmort$Pollution_Quartile <- ntile(merged_airpo1519_cvdmort$Combined_Pollution, 4)

# Create a box plot
ggplot(data = merged_airpo1519_cvdmort, aes(x = as.factor(Pollution_Quartile), y = Age.Adjusted.Rate)) +
  geom_boxplot() +
  labs(title = "CVD Mortality Rate by Pollution Index Quartiles in Texas (2015-2019)",
       x = "Combined Pollution Index Quartiles",
       y = "Age-Adjusted CVD Mortality Rate",
       caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(hjust = 0, size = 8))

# Create a histogram
ggplot(data = merged_airpo1519_cvdmort, aes(x = Combined_Pollution)) +
  geom_histogram(binwidth = 10, fill = "blue", alpha = 0.5) +
  labs(title = "Histogram of Combined Pollution Index in Texas (2015-2019)",
       x = "Combined Pollution Index",
       y = "Frequency",
       caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.caption = element_text(hjust = 0, size = 8))

# Create a new data frame with the top 10 counties by combined pollution index and CVD mortality rate
top_10_counties <- merged_airpo1519_cvdmort %>%
  select(County, Combined_Pollution, Age.Adjusted.Rate) %>%
  arrange(desc(Combined_Pollution), desc(Age.Adjusted.Rate)) %>%
  top_n(10)
## Selecting by Age.Adjusted.Rate
# Create a bar chart of the top 10 counties
ggplot(data = top_10_counties, aes(x = County, y = Combined_Pollution, fill = Age.Adjusted.Rate)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = round(Combined_Pollution, 1)), vjust = -0.5) +
  scale_y_continuous(name = "Combined Pollution Index", limits = c(0, max(top_10_counties$Combined_Pollution) * 1.1)) +
  scale_fill_viridis_c(name = "Age-Adjusted\nMortality Rate") +
  coord_flip() +
  labs(title = "Top 10 Counties by Combined Pollution Index and CVD Mortality Rate in Texas, 2015-2019",
       y = "Combined Pollution Index",
       x = NULL,
       caption = "Source: CDC Wonder\nAuthor: Julie Gonzalez") +
  theme_minimal()
## Warning: Removed 8 rows containing missing values (`geom_col()`).

# View information for Harris County
harris_info <- merged_airpo1519_cvdmort %>% 
  filter(County == "Harris") %>% 
  select(County, State, Combined_Pollution, Age.Adjusted.Rate)

# View information for Polk County
polk_info <- merged_airpo1519_cvdmort %>% 
  filter(County == "Polk") %>% 
  select(County, State, Combined_Pollution, Age.Adjusted.Rate)
# Filter merged dataset to include only Harris and Polk counties
harris_polk_data <- merged_airpo1519_cvdmort %>%
  filter(County %in% c("Harris", "Polk"))

# Print information for Harris and Polk counties
harris_polk_data
##    County State Days Ozone Days PM2.5 Days PM10 County.Code Deaths Population
## 1  Harris Texas        126        201        16       48201  39707   23192880
## 2  Harris Texas        131        200         3       48201  39707   23192880
## 3  Harris Texas        155        181         2       48201  39707   23192880
## 4  Harris Texas        169        159         6       48201  39707   23192880
## 5  Harris Texas        155        185         3       48201  39707   23192880
## 6    Polk Texas        327          0         0       48373   1035     245434
## 7    Polk Texas        346          0         0       48373   1035     245434
## 8    Polk Texas        365          0         0       48373   1035     245434
## 9    Polk Texas        342          0         0       48373   1035     245434
## 10   Polk Texas        344          0         0       48373   1035     245434
##    Crude.Rate Age.Adjusted.Rate Combined_Pollution Pollution_Quartile
## 1       171.2             216.2                343                  1
## 2       171.2             216.2                334                  1
## 3       171.2             216.2                338                  1
## 4       171.2             216.2                334                  1
## 5       171.2             216.2                343                  1
## 6       421.7             365.2                327                  1
## 7       421.7             365.2                346                  2
## 8       421.7             365.2                365                  4
## 9       421.7             365.2                342                  1
## 10      421.7             365.2                344                  1