Covide_CARD_project

# The statewide California data is flat: list its distinct location keys.
unique(california_as_a_whole_df[["location_key"]])

## [1] "US_CA"

The key takeaway is that our data is flat: the only location key present is the state-level `US_CA`.

We lack detail at the local (county) level and only have a state-level perspective; however, county-level data can be retrieved separately, as shown below.

# STEP 1: Load the raw FIPS-code workbook
library(readxl)
file_path <- "~/Downloads/US_FIPS_Codes.xls"
# Read the sheet, letting the first spreadsheet row supply the column names.
fips_data <- read_excel(path = file_path, col_names = TRUE)

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`

# Inspect the column names picked up from the spreadsheet
names(fips_data)

## [1] "U.S. FIPS County Codes" "...2"                   "...3"                  
## [4] "...4"

# The first data row holds the real column names. Pull it out as a plain
# character vector instead of relying on implicit coercion of a one-row tibble
# when it is assigned as the names.
colnamnes <- as.character(unlist(fips_data[1, ], use.names = FALSE))

# Drop that header row from the data
fips_data <- fips_data[-1, ]

# Apply the recovered names and confirm them
colnames(fips_data) <- colnamnes
colnames(fips_data)

## [1] "State"       "County Name" "FIPS State"  "FIPS County"

head(fips_data)

## # A tibble: 6 × 4
##   State   `County Name` `FIPS State` `FIPS County`
##   <chr>   <chr>         <chr>        <chr>        
## 1 Alabama Autauga       01           001          
## 2 Alabama Baldwin       01           003          
## 3 Alabama Barbour       01           005          
## 4 Alabama Bibb          01           007          
## 5 Alabama Blount        01           009          
## 6 Alabama Bullock       01           011

# List the state names exactly as spelled in the data; the California filter
# further down matches this spelling case-sensitively.
unique(fips_data[["State"]])

##  [1] "Alabama"              "Alaska"               "Arizona"             
##  [4] "Arkansas"             "California"           "Colorado"            
##  [7] "Connecticut"          "Delaware"             "District of Columbia"
## [10] "Florida"              "Georgia"              "Hawaii"              
## [13] "Idaho"                "Illinois"             "Indiana"             
## [16] "Iowa"                 "Kansas"               "Kentucky"            
## [19] "Louisiana"            "Maine"                "Maryland"            
## [22] "Massachusetts"        "Michigan"             "Minnesota"           
## [25] "Mississippi"          "Missouri"             "Montana"             
## [28] "Nebraska"             "Nevada"               "New Hampshire"       
## [31] "New Jersey"           "New Mexico"           "New York"            
## [34] "North Carolina"       "North Dakota"         "Ohio"                
## [37] "Oklahoma"             "Oregon"               "Pennsylvania"        
## [40] "Rhode Island"         "South Carolina"       "South Dakota"        
## [43] "Tennessee"            "Texas"                "Utah"                
## [46] "Vermont"              "Virginia"             "Washington"          
## [49] "West Virginia"        "Wisconsin"            "Wyoming"

# Keep only the California rows
california_counties_fibs_data <- filter(fips_data, State == "California")

# California has 58 counties, so this comparison should print TRUE
58L == nrow(california_counties_fibs_data)

## [1] TRUE

Because California has 58 counties, we should have exactly 58 FIPS-code observations for California in our data set.

# Keep the county name and the two FIPS parts, then build the combined county
# code by concatenating the state part and the county part.
california_counties_fibs_data <- california_counties_fibs_data %>%
  select(`County Name`, `FIPS State`, `FIPS County`) %>%
  mutate(fips_code = paste0(`FIPS State`, `FIPS County`))
head(california_counties_fibs_data)

## # A tibble: 6 × 4
##   `County Name` `FIPS State` `FIPS County` fips_code
##   <chr>         <chr>        <chr>         <chr>    
## 1 Alameda       06           001           06001    
## 2 Alpine        06           003           06003    
## 3 Amador        06           005           06005    
## 4 Butte         06           007           06007    
## 5 Calaveras     06           009           06009    
## 6 Colusa        06           011           06011

library(httr)
library(readr)
# STEP 2: Download COVID-19 data for each county

# Per-county data frames, keyed by combined FIPS code
county_data_list <- list()

# Loop over each combined FIPS code to download and read its CSV.
# A failure for one county is reported and skipped so the remaining
# counties are still processed.
for (fips_code in california_counties_fibs_data$fips_code) {

  # Construct the URL for this county's COVID-19 data
  url <- paste0("https://storage.googleapis.com/covid19-open-data/v3/location/US_CA_", fips_code, ".csv")

  # Local filename the downloaded CSV is saved under
  output_filename <- paste0("covid_data_", fips_code, ".csv")

  # Download straight to disk; a transport-level error (e.g. no network)
  # is caught here instead of aborting the whole loop.
  response <- tryCatch(
    GET(url, write_disk(output_filename, overwrite = TRUE)),
    error = function(e) {
      message("Request failed for FIPS: ", fips_code, " (", conditionMessage(e), ")")
      NULL
    }
  )

  if (is.null(response)) {
    # Remove any partially written file so it is not mistaken for data
    unlink(output_filename)
    next
  }

  if (status_code(response) != 200) {
    message(paste("Failed to download CSV for FIPS:", fips_code, "with status code:", status_code(response)))
    # write_disk() has already saved the error body under the .csv name
    unlink(output_filename)
    next
  }

  # Read the CSV file into R; report (rather than silently drop) the reason
  # when parsing fails.
  county_data <- tryCatch(
    read_csv(output_filename, show_col_types = FALSE),
    error = function(e) {
      message("Error reading CSV for FIPS: ", fips_code, " (", conditionMessage(e), ")")
      NULL
    }
  )

  # Store the county data only if it was read successfully
  if (!is.null(county_data)) {
    county_data_list[[fips_code]] <- county_data
  }
}

# Step 3: Combine all county data
# Stack the per-county data frames into one table
county_combined_data <- dplyr::bind_rows(county_data_list)

# Show the first rows of the stacked table
county_combined_data %>%
  head() %>%
  print()

## # A tibble: 6 × 472
##   location_key date       place_id       wikidata_id datacommons_id country_code
##   <chr>        <date>     <chr>          <chr>       <chr>          <chr>       
## 1 US_CA_06001  2020-01-01 ChIJWRd5NDfyj… Q107146     geoId/06001    US          
## 2 US_CA_06001  2020-01-02 ChIJWRd5NDfyj… Q107146     geoId/06001    US          
## 3 US_CA_06001  2020-01-03 ChIJWRd5NDfyj… Q107146     geoId/06001    US          
## 4 US_CA_06001  2020-01-04 ChIJWRd5NDfyj… Q107146     geoId/06001    US          
## 5 US_CA_06001  2020-01-05 ChIJWRd5NDfyj… Q107146     geoId/06001    US          
## 6 US_CA_06001  2020-01-06 ChIJWRd5NDfyj… Q107146     geoId/06001    US          
## # ℹ 466 more variables: country_name <chr>, subregion1_code <chr>,
## #   subregion1_name <chr>, subregion2_code <chr>, subregion2_name <chr>,
## #   iso_3166_1_alpha_2 <chr>, iso_3166_1_alpha_3 <chr>,
## #   aggregation_level <dbl>, new_confirmed <dbl>, new_deceased <dbl>,
## #   cumulative_confirmed <dbl>, cumulative_deceased <dbl>,
## #   new_persons_fully_vaccinated <dbl>,
## #   cumulative_persons_fully_vaccinated <dbl>, population <dbl>, …

# Report how many distinct location keys made it into the combined table
cat(
  "Number of unique county codes:",
  length(unique(county_combined_data[["location_key"]])),
  "\n"
)

## Number of unique county codes: 58

VERIFY:

# Compare the key formats of the two tables before joining them
head(county_combined_data[["location_key"]])

## [1] "US_CA_06001" "US_CA_06001" "US_CA_06001" "US_CA_06001" "US_CA_06001"
## [6] "US_CA_06001"

head(california_counties_fibs_data, n = 6L)

## # A tibble: 6 × 4
##   `County Name` `FIPS State` `FIPS County` fips_code
##   <chr>         <chr>        <chr>         <chr>    
## 1 Alameda       06           001           06001    
## 2 Alpine        06           003           06003    
## 3 Amador        06           005           06005    
## 4 Butte         06           007           06007    
## 5 Calaveras     06           009           06009    
## 6 Colusa        06           011           06011

# Derive the county FIPS code from the last five characters of `location_key`,
# then left-join the FIPS lookup table on it to attach the county name.
county_combined_data <- county_combined_data %>%
  mutate(fips_code = substring(location_key, nchar(location_key) - 4)) %>%
  left_join(california_counties_fibs_data, by = "fips_code")

We have extracted the data from 58 counties

# Count the distinct longitude values in the combined table
length(unique(county_combined_data[["longitude"]]))

## [1] 57

# Count the distinct latitude values in the combined table
length(unique(county_combined_data[["latitude"]]))

## [1] 56

# Load necessary libraries
library(ggplot2)
library(maps)
library(dplyr)

# Load the map of California
california_map <- map_data("county", region = "california")

# One row per county location: the combined table repeats each county's
# coordinates on every date, so plotting it directly would draw the same
# point over and over.
county_locations <- county_combined_data %>%
  distinct(location_key, longitude, latitude)

# Plot the county map with points representing longitude and latitude
ggplot() +
  # Draw the base map of California counties
  geom_polygon(data = california_map, aes(x = long, y = lat, group = group), fill = "white", color = "black") +
  # Add one point per county location
  geom_point(data = county_locations, aes(x = longitude, y = latitude), color = "red", size = 2) +
  # Add labels to make it more informative
  labs(title = "County Locations in California", x = "Longitude", y = "Latitude") +
  theme_minimal()

For every FIPS code, let's keep only the columns that are relevant to the analysis.

# Trim the wide table down to the analysis columns.
# First drop columns by name prefix (search trends and the FIPS / county-name
# prefixes listed below), then pick the columns to keep, in order.
reduced_col <- county_combined_data %>%
  select(
    -starts_with("search_trends_"),
    -starts_with("FIPS County."),
    -starts_with("County Name."),
    -starts_with("FIPS State")
  ) %>%
  select(
    # Identification
    date, location_key, fips_code, `County Name`,
    # Geography
    area_sq_km, longitude, latitude, elevation_m,
    # Weather
    average_temperature_celsius:snowfall_mm,
    # Mobility
    mobility_retail_and_recreation, mobility_grocery_and_pharmacy,
    mobility_parks:mobility_workplaces,
    # Demographics
    population, population_male, population_female,
    population_age_00_09:population_age_70_79, population_age_80_and_older,
    life_expectancy,
    # Epidemiology and vaccination
    new_confirmed, cumulative_confirmed,
    new_deceased, cumulative_deceased,
    new_persons_fully_vaccinated, cumulative_persons_fully_vaccinated
  )

# Report how many columns were kept
cat("Number of Features :", length(reduced_col), "\n")

## Number of Features : 39

# Preview the first rows of the reduced table
head(reduced_col, n = 6L)

## # A tibble: 6 × 39
##   date       location_key fips_code `County Name` area_sq_km longitude latitude
##   <date>     <chr>        <chr>     <chr>              <dbl>     <dbl>    <dbl>
## 1 2020-01-01 US_CA_06001  06001     Alameda             2127     -122.     37.6
## 2 2020-01-02 US_CA_06001  06001     Alameda             2127     -122.     37.6
## 3 2020-01-03 US_CA_06001  06001     Alameda             2127     -122.     37.6
## 4 2020-01-04 US_CA_06001  06001     Alameda             2127     -122.     37.6
## 5 2020-01-05 US_CA_06001  06001     Alameda             2127     -122.     37.6
## 6 2020-01-06 US_CA_06001  06001     Alameda             2127     -122.     37.6
## # ℹ 32 more variables: elevation_m <dbl>, average_temperature_celsius <dbl>,
## #   minimum_temperature_celsius <dbl>, maximum_temperature_celsius <dbl>,
## #   rainfall_mm <dbl>, dew_point <dbl>, relative_humidity <dbl>,
## #   snowfall_mm <dbl>, mobility_retail_and_recreation <dbl>,
## #   mobility_grocery_and_pharmacy <dbl>, mobility_parks <dbl>,
## #   mobility_transit_stations <dbl>, mobility_workplaces <dbl>,
## #   population <dbl>, population_male <dbl>, population_female <dbl>, …

# Distinct longitude values in the reduced table
cat("The number of Longitudes: ", length(unique(reduced_col[["longitude"]])), "\n")

## The number of Longitudes:  57

# Distinct latitude values in the reduced table
cat("The number of latitudes: ", length(unique(reduced_col[["latitude"]])), "\n")

## The number of latitudes:  56

# Count the distinct (longitude, latitude) combinations across all rows
unique_pairs <- nrow(distinct(reduced_col, longitude, latitude))

# One coordinate pair per county would give 58
cat("Number of unique longitude and latitude pairs:", unique_pairs, "\n")

## Number of unique longitude and latitude pairs: 58

# However, look at the distinct elevation values in the reduced table
unique(reduced_col[["elevation_m"]])

## [1]   NA    3 1992

# List of required packages (excluding rayshader)
packages <- c("sf", "rnaturalearth", "elevatr", "terra", "dplyr")

# Report whether each package is installed.
# requireNamespace() only checks that the package can be loaded; unlike
# require(), it does not attach the package to the search path as a side
# effect of the check.
for (pkg in packages) {
  is_installed <- requireNamespace(pkg, quietly = TRUE)
  if (is_installed) {
    message("Package ", pkg, " is installed.")
  } else {
    message("Package ", pkg, " is NOT installed.")
  }
}

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

## Package sf is installed.

## Package rnaturalearth is installed.

## elevatr v0.99.0 NOTE: Version 0.99.0 of 'elevatr' uses 'sf' and 'terra'.  Use 
## of the 'sp', 'raster', and underlying 'rgdal' packages by 'elevatr' is being 
## deprecated; however, get_elev_raster continues to return a RasterLayer.  This 
## will be dropped in future versions, so please plan accordingly.

## Package elevatr is installed.

## terra 1.7.78

## Package terra is installed.

## Package dplyr is installed.

# Attach the spatial and plotting packages used below
library(sf)
library(rnaturalearth)
library(elevatr)
library(terra)
library(dplyr)
library(ggplot2)

# California's state polygon, picked out of the US states layer
california <- filter(
  ne_states(country = "united states of america", returnclass = "sf"),
  name == "California"
)

# Bounding box of the state, and an elevation raster clipped to that box
california_bbox <- st_bbox(california)
california_elev <- get_elev_raster(california, z = 7, clip = "bbox")

## Mosaicing & Projecting

## Clipping DEM to bbox

## Note: Elevation units are in meters.

# Turn the elevation raster into a data frame of cell coordinates plus value,
# with the three columns named x, y and elevation, so ggplot2 can draw it.
california_elev_rast <- rast(california_elev)
california_df <- setNames(
  as.data.frame(california_elev_rast, xy = TRUE),
  c("x", "y", "elevation")
)


# Draw the elevation raster, filled by elevation value
ggplot(data = california_df, mapping = aes(x = x, y = y, fill = elevation)) +
  geom_raster() +
  coord_sf() +
  scale_fill_viridis_c() +
  ggtitle("California Elevation Map") +
  theme_minimal()

# Determine the latitude range covered by the COVID data
range(reduced_col[["latitude"]])

## [1] 33.02 41.74

# Determine the longitude range covered by the COVID data
range(reduced_col[["longitude"]])

## [1] -123.96 -115.35

# Compute and show the bounding box of the California polygon
california_bbox <- sf::st_bbox(california)
print(california_bbox)

##       xmin       ymin       xmax       ymax 
## -124.40920   32.53167 -114.11906   41.99954

# Load Required Libraries
library(sf)
library(rnaturalearth)
library(elevatr)
library(terra)
library(dplyr)
library(ggplot2)
library(maps)

# Get California Spatial Data from Natural Earth.
# Reuse the polygon if it has already been built in this session; otherwise
# build it here so this section still runs on its own.
if (!exists("california", inherits = FALSE)) {
  california <- ne_states(country = "united states of america", returnclass = "sf") %>%
    filter(name == "California")
}

# Bounding box of the state polygon
california_bbox <- st_bbox(california)

# Get Elevation Data Using elevatr, clipped to the bounding box.
# The raster is fetched only when it is not already available, which avoids
# repeating an identical download.
if (!exists("california_elev", inherits = FALSE)) {
  california_elev <- get_elev_raster(california, z = 7, clip = "bbox")
}

## Mosaicing & Projecting

## Clipping DEM to bbox

## Note: Elevation units are in meters.

# Turn the elevation raster into a data frame (x, y, elevation) for ggplot2
california_elev_rast <- rast(california_elev)
california_df <- setNames(
  as.data.frame(california_elev_rast, xy = TRUE),
  c("x", "y", "elevation")
)

# County outlines for California from the maps package
california_map <- map_data("county", region = "california")

# One marker position per county: the midpoint of the county outline's
# longitude range and latitude range (a bounding-box centre used as an
# approximate centroid).
california_centroids <- california_map %>%
  group_by(subregion) %>%
  summarize(
    long = mean(range(long)),
    lat = mean(range(lat))
  )

# Combined plot: elevation raster, county boundaries, and county markers
ggplot() +
  # Elevation raster as the base layer
  geom_raster(
    data = california_df,
    mapping = aes(x = x, y = y, fill = elevation)
  ) +
  # County boundaries drawn on top, unfilled
  geom_polygon(
    data = california_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = NA,
    color = "black"
  ) +
  # One red marker per county
  geom_point(
    data = california_centroids,
    mapping = aes(x = long, y = lat),
    color = "red",
    size = 2
  ) +
  scale_fill_viridis_c() +
  coord_sf() +
  labs(
    title = "California Elevation Map with County Boundaries and Centroids",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_minimal()

# Save the reduced COVID table as a CSV file
write.csv(reduced_col, file = "reduced_col.csv", row.names = FALSE)


# Save the county-name / FIPS-code lookup table as a CSV file
write.csv(
  select(california_counties_fibs_data, `County Name`, fips_code),
  file = "california_fibs.csv",
  row.names = FALSE
)

Visualize each variable on its own, and each variable together with the others:

How to visualize categorical and numerical variables.
Visualize each variable by itself and together with the variables it may have a meaningful relationship with.
- Focus on new cases

Stats Questions:

S-Q1) What is the general pattern of NAs and NULLs?

S-Q2) What do the relationships of variables look like independently and together?

Math Questions:

M-Q1) What ML algorithms have been used in the past to study epidemiology in regards to covid?

M-Q2) What is time series analysis?

Contextualization Questions:

C-Q1) What is the definition of a covid-case?

C-Q2) What is the definition of a covid-death?

Covide_CARD_project

Isaiah C. Mireles

2024-11-05