airpo_cvdmort20152019

# Load the necessary packages
library(readr)
library(dplyr)
library(bit)
library(haven)

#Read in Air pollution data
airpo19 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2019.csv")

airpo18 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2018.csv")

airpo17 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2017.csv")

airpo16 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2016.csv")

airpo15 <- read_csv("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Laptop_airpo_20102019/annual_aqi_by_county_2015.csv")


#Read in CVD Mortality Data 



# Read in the 2015-2019 file
tx_cvd_2015_2019 <- read.table("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/CVD 2015-2019.txt", sep="\t", header=TRUE, fill=TRUE)

tx_cvd_2015_2019 <- select(tx_cvd_2015_2019, -Notes)

#one database
airpo15_19 <- rbind(airpo19, airpo18, airpo17, airpo16, airpo15)

#Clean some stuff
#CVD Mortality Data

# Repeat for tx_cvd_2015_2019
# Convert Deaths column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths <- as.numeric(tx_cvd_2015_2019$Deaths)

## Warning: NAs introduced by coercion

# Convert Crude.Rate column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Crude.Rate <- as.numeric(tx_cvd_2015_2019$Crude.Rate)

## Warning: NAs introduced by coercion

# Convert Age.Adjusted.Rate column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Age.Adjusted.Rate <- as.numeric(tx_cvd_2015_2019$Age.Adjusted.Rate)

## Warning: NAs introduced by coercion

# Replace dots with NA in tx_cvd_2015_2019
tx_cvd_2015_2019[tx_cvd_2015_2019 == "."] <- NA

# Replace "Suppressed" values with NA in Deaths column of tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths[tx_cvd_2015_2019$Deaths == "Suppressed"] <- NA

# Convert Deaths column to numeric in tx_cvd_2015_2019
tx_cvd_2015_2019$Deaths <- as.numeric(tx_cvd_2015_2019$Deaths)

# remove ", TX" from County column
tx_cvd_2015_2019$County <- gsub(" County, TX", "", tx_cvd_2015_2019$County)

# Filter and select the data of interest
airpo15_19 <- airpo15_19 %>%
  filter(State == "Texas"
         ) %>%
  select(State, County, `Median AQI`)

# Aggregate airpo15_19 by County
airpo15_19_agg <- airpo15_19 %>%
  group_by(County) %>%
  summarize(Median_AQI = mean(`Median AQI`, na.rm = TRUE))

# Aggregate tx_cvd_2015_2019 by County
tx_cvd_2015_2019_agg <- tx_cvd_2015_2019 %>%
  group_by(County) %>%
  summarize(Crude_Rate = mean(`Crude.Rate`, na.rm = TRUE))

# Merge the aggregated datasets
airpomort_agg <- left_join(airpo15_19_agg, tx_cvd_2015_2019_agg, by = "County")

summary(airpomort_agg)

##     County            Median_AQI      Crude_Rate   
##  Length:42          Min.   :20.80   Min.   :120.2  
##  Class :character   1st Qu.:34.05   1st Qu.:195.0  
##  Mode  :character   Median :37.80   Median :251.6  
##                     Mean   :36.82   Mean   :258.6  
##                     3rd Qu.:40.65   3rd Qu.:324.2  
##                     Max.   :51.60   Max.   :511.0

library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(ggplot2)

# Read Texas county shapefile 
texas_counties <- st_read("/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp")

## Reading layer `tl_2019_us_county' from data source 
##   `/Users/juliegonzalez/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/Spring 2023 GIS/tl_2019_us_county/tl_2019_us_county.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS:  NAD83

# Filter shapefile to only include Texas counties
texas_counties <- texas_counties %>% filter(STATEFP == "48")

tx_counties_data <- left_join(texas_counties, airpomort_agg, by = c("NAME" = "County"))

ggplot() +
  geom_sf(data = tx_counties_data, aes(fill = Median_AQI), color = "black", size = 0.25) +
  scale_fill_viridis_c(option = "plasma", name = "Median AQI", na.value = "gray") +
  theme_void() +
  theme(legend.position = "left")

ggplot() +
  geom_sf(data = tx_counties_data, aes(fill = Crude_Rate), color = "black", size = 0.25) +
  scale_fill_viridis_c(option = "plasma", name = "Crude_Rate", na.value = "gray") +
  theme_void() +
  theme(legend.position = "right")

#map with more details
median_aqi_map <- ggplot() +
  geom_sf(data = tx_counties_data, aes(fill = Median_AQI), color = "black", size = 0.25) +
  scale_fill_viridis_c(option = "plasma", name = "Median AQI", na.value = "gray") +
  labs(title = "Median AQI by County in Texas (2015-2019)",
       caption = "Source: EPA Annual AQI | Author: Julie Gonzalez") +
  theme_void() +
  theme(legend.position = "right",
        plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
        plot.caption = element_text(hjust = 1, size = 10))

# Display the Median AQI map
median_aqi_map

#Map with more details, CVD mortality
crude_rate_map <- ggplot() +
  geom_sf(data = tx_counties_data, aes(fill = Crude_Rate), color = "black", size = 0.25) +
  scale_fill_viridis_c(option = "plasma", name = "Crude Rate", na.value = "gray") +
  labs(title = "Crude Rate by County in Texas (2015-2019)",
       caption = "Source: CDC Wonder | Author: Julie Gonzalez") +
  theme_void() +
  theme(legend.position = "right",
        plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
        plot.caption = element_text(hjust = 1, size = 10))

# Display the Crude Rate map
crude_rate_map

top3_median_aqi <- tx_counties_data %>% 
  arrange(desc(Median_AQI)) %>%
  head(3)

top3_crude_rate <- tx_counties_data %>% 
  arrange(desc(Crude_Rate)) %>%
  head(3)

median_aqi_map_labeled <- median_aqi_map +
  geom_sf_label(data = top3_median_aqi, aes(label = NAME),
                size = 3, fontface = "bold", check_overlap = TRUE)

## Warning in layer_sf(data = data, mapping = mapping, stat = stat, geom =
## GeomLabel, : Ignoring unknown parameters: `check_overlap`

# Display the Median AQI map with labels
median_aqi_map_labeled

## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

crude_rate_map_labeled <- crude_rate_map +
  geom_sf_label(data = top3_crude_rate, aes(label = NAME),
                size = 3, fontface = "bold", check_overlap = TRUE)

## Warning in layer_sf(data = data, mapping = mapping, stat = stat, geom =
## GeomLabel, : Ignoring unknown parameters: `check_overlap`

# Display the Crude Rate map with labels
crude_rate_map_labeled

## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

median_aqi_summary <- tx_counties_data %>%
  summarise(min = min(Median_AQI, na.rm = TRUE),
            max = max(Median_AQI, na.rm = TRUE),
            mean = mean(Median_AQI, na.rm = TRUE),
            median = median(Median_AQI, na.rm = TRUE),
            sd = sd(Median_AQI, na.rm = TRUE))

print(median_aqi_summary)

## Simple feature collection with 1 feature and 5 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -106.6456 ymin: 25.83716 xmax: -93.50804 ymax: 36.5007
## Geodetic CRS:  NAD83
##    min  max     mean median       sd                       geometry
## 1 20.8 51.6 36.81905   37.8 6.719583 POLYGON ((-98.9524 34.2125,...

crude_rate_summary <- tx_counties_data %>%
  summarise(min = min(Crude_Rate, na.rm = TRUE),
            max = max(Crude_Rate, na.rm = TRUE),
            mean = mean(Crude_Rate, na.rm = TRUE),
            median = median(Crude_Rate, na.rm = TRUE),
            sd = sd(Crude_Rate, na.rm = TRUE))

print(crude_rate_summary)

## Simple feature collection with 1 feature and 5 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -106.6456 ymin: 25.83716 xmax: -93.50804 ymax: 36.5007
## Geodetic CRS:  NAD83
##     min max     mean median       sd                       geometry
## 1 120.2 511 258.5976  251.6 88.70143 POLYGON ((-98.9524 34.2125,...

# Histogram for Median AQI
ggplot(tx_counties_data, aes(x = Median_AQI)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Median AQI",
       x = "Median AQI",
       y = "Count")

## Warning: Removed 212 rows containing non-finite values (`stat_bin()`).

# Histogram for Crude Rate
ggplot(tx_counties_data, aes(x = Crude_Rate)) +
  geom_histogram(binwidth = 5, fill = "red", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Crude Rate",
       x = "Crude Rate",
       y = "Count")

## Warning: Removed 212 rows containing non-finite values (`stat_bin()`).

ggplot(tx_counties_data, aes(x = Median_AQI, y = Crude_Rate)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "solid") +
  labs(title = "Scatterplot of Median AQI vs. Crude Rate with Regression Line",
       x = "Median AQI",
       y = "Crude Rate")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 212 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 212 rows containing missing values (`geom_point()`).

library(dplyr)
library(knitr)

median_aqi_summary <- tx_counties_data %>%
  summarise(min = min(Median_AQI, na.rm = TRUE),
            q1 = quantile(Median_AQI, 0.25, na.rm = TRUE),
            median = median(Median_AQI, na.rm = TRUE),
            mean = mean(Median_AQI, na.rm = TRUE),
            q3 = quantile(Median_AQI, 0.75, na.rm = TRUE),
            max = max(Median_AQI, na.rm = TRUE),
            sd = sd(Median_AQI, na.rm = TRUE),
            iqr = IQR(Median_AQI, na.rm = TRUE))

crude_rate_summary <- tx_counties_data %>%
  summarise(min = min(Crude_Rate, na.rm = TRUE),
            q1 = quantile(Crude_Rate, 0.25, na.rm = TRUE),
            median = median(Crude_Rate, na.rm = TRUE),
            mean = mean(Crude_Rate, na.rm = TRUE),
            q3 = quantile(Crude_Rate, 0.75, na.rm = TRUE),
            max = max(Crude_Rate, na.rm = TRUE),
            sd = sd(Crude_Rate, na.rm = TRUE),
            iqr = IQR(Crude_Rate, na.rm = TRUE))

summary_df <- data.frame(Variable = c("Median AQI", "Crude Rate"),
                         Min = c(median_aqi_summary$min, crude_rate_summary$min),
                         Q1 = c(median_aqi_summary$q1, crude_rate_summary$q1),
                         Median = c(median_aqi_summary$median, crude_rate_summary$median),
                         Mean = c(median_aqi_summary$mean, crude_rate_summary$mean),
                         Q3 = c(median_aqi_summary$q3, crude_rate_summary$q3),
                         Max = c(median_aqi_summary$max, crude_rate_summary$max),
                         SD = c(median_aqi_summary$sd, crude_rate_summary$sd),
                         IQR = c(median_aqi_summary$iqr, crude_rate_summary$iqr))

knitr::kable(summary_df, digits = 2, align = "r")

Variable	Min	Q1	Median	Mean	Q3	Max	SD	IQR
Median AQI	20.8	34.05	37.8	36.82	40.65	51.6	6.72	6.60
Crude Rate	120.2	195.00	251.6	258.60	324.18	511.0	88.70	129.18

airpo_cvdmort20152019

Jules Gonzalez

2023-04-27