# Check which location keys occur in the statewide California data.
# A single distinct key is what these notes call the data being "flat".
unique(california_as_a_whole_df[["location_key"]])
## [1] "US_CA"
Key takeaway: our data is flat — `location_key` takes the single value "US_CA" across the whole dataset.
# STEP 1: Load and filter the FIPS data
library(readxl)
# Location of the U.S. FIPS county-code spreadsheet
file_path <- "~/Downloads/US_FIPS_Codes.xls"
# Read the sheet, taking its first line as the column header
fips_data <- read_excel(path = file_path, col_names = TRUE)
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
# Inspect the column names as read from the spreadsheet
colnames(fips_data)
## [1] "U.S. FIPS County Codes" "...2" "...3"
## [4] "...4"
# The real header ("State", "County Name", ...) sits in the first data row,
# so keep that row aside and drop it from the data.
colnamnes <- fips_data[1, ]
fips_data <- fips_data[-1, ]
# Flatten the one-row tibble to a plain character vector before assigning it,
# rather than relying on implicit coercion of a tibble to column names.
colnames(fips_data) <- as.character(unlist(colnamnes, use.names = FALSE))
colnames(fips_data)
## [1] "State" "County Name" "FIPS State" "FIPS County"
head(fips_data)
## # A tibble: 6 × 4
## State `County Name` `FIPS State` `FIPS County`
## <chr> <chr> <chr> <chr>
## 1 Alabama Autauga 01 001
## 2 Alabama Baldwin 01 003
## 3 Alabama Barbour 01 005
## 4 Alabama Bibb 01 007
## 5 Alabama Blount 01 009
## 6 Alabama Bullock 01 011
# List the distinct state names to confirm the exact spelling and
# capitalisation of "California" before filtering on it.
unique(fips_data[["State"]])
## [1] "Alabama" "Alaska" "Arizona"
## [4] "Arkansas" "California" "Colorado"
## [7] "Connecticut" "Delaware" "District of Columbia"
## [10] "Florida" "Georgia" "Hawaii"
## [13] "Idaho" "Illinois" "Indiana"
## [16] "Iowa" "Kansas" "Kentucky"
## [19] "Louisiana" "Maine" "Maryland"
## [22] "Massachusetts" "Michigan" "Minnesota"
## [25] "Mississippi" "Missouri" "Montana"
## [28] "Nebraska" "Nevada" "New Hampshire"
## [31] "New Jersey" "New Mexico" "New York"
## [34] "North Carolina" "North Dakota" "Ohio"
## [37] "Oklahoma" "Oregon" "Pennsylvania"
## [40] "Rhode Island" "South Carolina" "South Dakota"
## [43] "Tennessee" "Texas" "Utah"
## [46] "Vermont" "Virginia" "Washington"
## [49] "West Virginia" "Wisconsin" "Wyoming"
# Keep only the California rows
california_counties_fibs_data <- filter(fips_data, State == "California")
# Sanity check: California should contribute 58 county rows
nrow(california_counties_fibs_data) == 58
## [1] TRUE
# Keep the county name plus both FIPS components, and build the combined
# county code by concatenating them (e.g. "06" + "001" -> "06001").
california_counties_fibs_data <- california_counties_fibs_data %>%
  select(`County Name`, `FIPS State`, `FIPS County`) %>%
  mutate(fips_code = paste0(`FIPS State`, `FIPS County`))
head(california_counties_fibs_data)
## # A tibble: 6 × 4
## `County Name` `FIPS State` `FIPS County` fips_code
## <chr> <chr> <chr> <chr>
## 1 Alameda 06 001 06001
## 2 Alpine 06 003 06003
## 3 Amador 06 005 06005
## 4 Butte 06 007 06007
## 5 Calaveras 06 009 06009
## 6 Colusa 06 011 06011
library(httr)
library(readr)
# STEP 2: Download COVID-19 data for each county
# Collects one data frame per county, keyed by its combined FIPS code.
county_data_list <- list()
for (fips_code in california_counties_fibs_data$fips_code) {
  # Per-county CSV location and the local file it is saved to
  url <- paste0("https://storage.googleapis.com/covid19-open-data/v3/location/US_CA_", fips_code, ".csv")
  output_filename <- paste0("covid_data_", fips_code, ".csv")

  # A transport-level failure (e.g. no connection) makes GET() signal an
  # error; catch it so one unreachable county does not abort the whole loop.
  response <- tryCatch(
    GET(url, write_disk(output_filename, overwrite = TRUE)),
    error = function(e) {
      message(paste("Failed to download CSV for FIPS:", fips_code, "with error:", conditionMessage(e)))
      NULL
    }
  )

  # Anything other than HTTP 200 is treated as a failed download.
  if (is.null(response) || status_code(response) != 200) {
    if (!is.null(response)) {
      message(paste("Failed to download CSV for FIPS:", fips_code, "with status code:", status_code(response)))
    }
    # The file on disk then holds an error body or a partial transfer rather
    # than county data, so remove it instead of leaving it behind.
    unlink(output_filename)
    next
  }

  # Parse the saved CSV; report the cause and skip the county if parsing fails.
  county_data <- tryCatch(
    read_csv(output_filename, show_col_types = FALSE),
    error = function(e) {
      message("Error reading CSV for FIPS: ", fips_code, " (", conditionMessage(e), ")")
      NULL
    }
  )

  # Store the county's data only when it was read successfully
  if (!is.null(county_data)) {
    county_data_list[[fips_code]] <- county_data
  }
}
# Step 3: Combine all county data
# Stack the per-county data frames into one data frame
county_combined_data <- bind_rows(county_data_list)
# Preview the first rows of the combined result
county_combined_data %>%
  head() %>%
  print()
## # A tibble: 6 × 472
## location_key date place_id wikidata_id datacommons_id country_code
## <chr> <date> <chr> <chr> <chr> <chr>
## 1 US_CA_06001 2020-01-01 ChIJWRd5NDfyj… Q107146 geoId/06001 US
## 2 US_CA_06001 2020-01-02 ChIJWRd5NDfyj… Q107146 geoId/06001 US
## 3 US_CA_06001 2020-01-03 ChIJWRd5NDfyj… Q107146 geoId/06001 US
## 4 US_CA_06001 2020-01-04 ChIJWRd5NDfyj… Q107146 geoId/06001 US
## 5 US_CA_06001 2020-01-05 ChIJWRd5NDfyj… Q107146 geoId/06001 US
## 6 US_CA_06001 2020-01-06 ChIJWRd5NDfyj… Q107146 geoId/06001 US
## # ℹ 466 more variables: country_name <chr>, subregion1_code <chr>,
## # subregion1_name <chr>, subregion2_code <chr>, subregion2_name <chr>,
## # iso_3166_1_alpha_2 <chr>, iso_3166_1_alpha_3 <chr>,
## # aggregation_level <dbl>, new_confirmed <dbl>, new_deceased <dbl>,
## # cumulative_confirmed <dbl>, cumulative_deceased <dbl>,
## # new_persons_fully_vaccinated <dbl>,
## # cumulative_persons_fully_vaccinated <dbl>, population <dbl>, …
# Report how many distinct location keys ended up in the combined data
cat(
  "Number of unique county codes:",
  n_distinct(county_combined_data$location_key),
  "\n"
)
## Number of unique county codes: 58
# Compare the key formats on both sides before joining.
# The COVID data carries a prefixed location key:
head(county_combined_data[["location_key"]])
## [1] "US_CA_06001" "US_CA_06001" "US_CA_06001" "US_CA_06001" "US_CA_06001"
## [6] "US_CA_06001"
# The FIPS table holds the bare combined code in `fips_code`:
head(california_counties_fibs_data, n = 6)
## # A tibble: 6 × 4
## `County Name` `FIPS State` `FIPS County` fips_code
## <chr> <chr> <chr> <chr>
## 1 Alameda 06 001 06001
## 2 Alpine 06 003 06003
## 3 Amador 06 005 06005
## 4 Butte 06 007 06007
## 5 Calaveras 06 009 06009
## 6 Colusa 06 011 06011
# Derive the FIPS code from the last five characters of `location_key`,
# then attach the FIPS-table columns (including the county name) with a
# left join on that code.
county_combined_data <- county_combined_data %>%
  mutate(fips_code = substring(location_key, nchar(location_key) - 4)) %>%
  left_join(california_counties_fibs_data, by = "fips_code")
# Number of distinct longitude values in the combined data
length(unique(county_combined_data[["longitude"]]))
## [1] 57
# Number of distinct latitude values in the combined data
length(unique(county_combined_data[["latitude"]]))
## [1] 56
# Load necessary libraries
library(ggplot2)
library(maps)
library(dplyr)
# Load the county outlines for California
california_map <- map_data("county", region = "california")
# Plot the county map with one point per county location
ggplot() +
  # Base layer: county outlines
  geom_polygon(
    data = california_map,
    aes(x = long, y = lat, group = group),
    fill = "white", color = "black"
  ) +
  # The combined data has one row per county per date, so the same coordinates
  # repeat many times. Reduce to distinct locations first; otherwise every
  # dated row is drawn as its own point on top of identical ones.
  geom_point(
    data = distinct(county_combined_data, location_key, longitude, latitude),
    aes(x = longitude, y = latitude),
    color = "red", size = 2
  ) +
  # Title and axis labels
  labs(title = "County Locations in California", x = "Longitude", y = "Latitude") +
  theme_minimal()
# Build the reduced feature set.
# First drop the search-trend columns and the FIPS helper columns by prefix.
# NOTE(review): the "FIPS County." / "County Name." prefixes look like they
# target suffixed duplicates from a repeated join — confirm.
# Then keep an explicit list of identifier, geography, weather, mobility,
# population and case/death/vaccination columns, in this order.
reduced_col <- county_combined_data %>%
  select(
    -starts_with(c("search_trends_", "FIPS County.", "County Name.", "FIPS State"))
  ) %>%
  select(
    date,
    location_key,
    fips_code,
    `County Name`,
    area_sq_km,
    longitude,
    latitude,
    elevation_m,
    average_temperature_celsius:snowfall_mm,
    mobility_retail_and_recreation,
    mobility_grocery_and_pharmacy,
    mobility_parks:mobility_workplaces,
    population,
    population_male,
    population_female,
    population_age_00_09:population_age_70_79,
    population_age_80_and_older,
    life_expectancy,
    new_confirmed,
    cumulative_confirmed,
    new_deceased,
    cumulative_deceased,
    new_persons_fully_vaccinated,
    cumulative_persons_fully_vaccinated
  )
# Report how many columns were retained, then preview the reduced data
cat("Number of Features :", length(reduced_col), "\n")
## Number of Features : 39
head(reduced_col, n = 6)
## # A tibble: 6 × 39
## date location_key fips_code `County Name` area_sq_km longitude latitude
## <date> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2020-01-01 US_CA_06001 06001 Alameda 2127 -122. 37.6
## 2 2020-01-02 US_CA_06001 06001 Alameda 2127 -122. 37.6
## 3 2020-01-03 US_CA_06001 06001 Alameda 2127 -122. 37.6
## 4 2020-01-04 US_CA_06001 06001 Alameda 2127 -122. 37.6
## 5 2020-01-05 US_CA_06001 06001 Alameda 2127 -122. 37.6
## 6 2020-01-06 US_CA_06001 06001 Alameda 2127 -122. 37.6
## # ℹ 32 more variables: elevation_m <dbl>, average_temperature_celsius <dbl>,
## # minimum_temperature_celsius <dbl>, maximum_temperature_celsius <dbl>,
## # rainfall_mm <dbl>, dew_point <dbl>, relative_humidity <dbl>,
## # snowfall_mm <dbl>, mobility_retail_and_recreation <dbl>,
## # mobility_grocery_and_pharmacy <dbl>, mobility_parks <dbl>,
## # mobility_transit_stations <dbl>, mobility_workplaces <dbl>,
## # population <dbl>, population_male <dbl>, population_female <dbl>, …
# Distinct longitude and latitude values, counted separately
cat("The number of Longitudes: ", length(unique(reduced_col[["longitude"]])), "\n")
## The number of Longitudes: 57
cat("The number of latitudes: ", length(unique(reduced_col[["latitude"]])), "\n")
## The number of latitudes: 56
# Count distinct (longitude, latitude) combinations instead
unique_pairs <- nrow(distinct(reduced_col, longitude, latitude))
# We should expect 58, one pair per county:
cat("Number of unique longitude and latitude pairs:", unique_pairs, "\n")
## Number of unique longitude and latitude pairs: 58
# However, elevation shows only NA plus two distinct values:
unique(reduced_col[["elevation_m"]])
## [1] NA 3 1992
# List of required packages (excluding rayshader)
packages <- c("sf", "rnaturalearth", "elevatr", "terra", "dplyr")
# Report, per package, whether it is installed.
# requireNamespace() checks that the package can be loaded without attaching
# it to the search path, so the check itself does not change which functions
# are visible; packages are attached explicitly with library() where needed.
for (pkg in packages) {
  is_installed <- requireNamespace(pkg, quietly = TRUE)
  if (is_installed) {
    message(paste("Package", pkg, "is installed."))
  } else {
    message(paste("Package", pkg, "is NOT installed."))
  }
}
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
## Package sf is installed.
## Package rnaturalearth is installed.
## elevatr v0.99.0 NOTE: Version 0.99.0 of 'elevatr' uses 'sf' and 'terra'. Use
## of the 'sp', 'raster', and underlying 'rgdal' packages by 'elevatr' is being
## deprecated; however, get_elev_raster continues to return a RasterLayer. This
## will be dropped in future versions, so please plan accordingly.
## Package elevatr is installed.
## terra 1.7.78
## Package terra is installed.
## Package dplyr is installed.
# Load Required Libraries
library(sf)
library(rnaturalearth)
library(elevatr)
library(terra)
library(dplyr)
library(ggplot2)
# California state polygon, taken from the U.S. states layer as an sf object
california <- filter(
  ne_states(country = "united states of america", returnclass = "sf"),
  name == "California"
)
# Bounding box of the state polygon
california_bbox <- st_bbox(california)
# Elevation raster for the state at zoom level 7, clipped to its bounding box
california_elev <- get_elev_raster(locations = california, z = 7, clip = "bbox")
## Mosaicing & Projecting
## Clipping DEM to bbox
## Note: Elevation units are in meters.
# Convert the elevation raster with terra, then flatten it to a data frame
# holding cell coordinates (x, y) and the cell value
california_elev_rast <- rast(california_elev)
california_df <- as.data.frame(california_elev_rast, xy = TRUE)
colnames(california_df) <- c("x", "y", "elevation")
# Raster heat map of elevation
ggplot(data = california_df) +
  geom_raster(mapping = aes(x = x, y = y, fill = elevation)) +
  scale_fill_viridis_c() +
  coord_sf() +
  labs(title = "California Elevation Map") +
  theme_minimal()
# Coordinate extent covered by the COVID data
with(reduced_col, range(latitude))
## [1] 33.02 41.74
with(reduced_col, range(longitude))
## [1] -123.96 -115.35
# Bounding box of the California polygon, for comparison with the ranges above
california_bbox <- st_bbox(california)
print(california_bbox)
## xmin ymin xmax ymax
## -124.40920 32.53167 -114.11906 41.99954
# Load Required Libraries
library(sf)
library(rnaturalearth)
library(elevatr)
library(terra)
library(dplyr)
library(ggplot2)
library(maps)
# California state polygon from Natural Earth, as an sf object
california <- filter(
  ne_states(country = "united states of america", returnclass = "sf"),
  name == "California"
)
# Bounding box of the state polygon
california_bbox <- st_bbox(california)
# Elevation raster at zoom level 7, clipped to the polygon's bounding box
california_elev <- get_elev_raster(locations = california, z = 7, clip = "bbox")
## Mosaicing & Projecting
## Clipping DEM to bbox
## Note: Elevation units are in meters.
# Elevation raster -> data frame of cell coordinates and values for ggplot2
california_elev_rast <- rast(california_elev)
california_df <- as.data.frame(california_elev_rast, xy = TRUE)
colnames(california_df) <- c("x", "y", "elevation")
# County outlines from the maps package
california_map <- map_data("county", region = "california")
# One marker per county: the midpoint of the county's longitude and latitude
# extent (the centre of its bounding box, used here as the "centroid")
california_centroids <- california_map %>%
  group_by(subregion) %>%
  summarise(
    long = mean(range(long)),
    lat = mean(range(lat))
  )
# Layered plot: elevation raster, then county outlines, then county markers
ggplot() +
  # Elevation as the background layer
  geom_raster(
    data = california_df,
    mapping = aes(x = x, y = y, fill = elevation)
  ) +
  scale_fill_viridis_c() +
  # County boundaries, drawn unfilled so the elevation stays visible
  geom_polygon(
    data = california_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = NA, color = "black"
  ) +
  # One red marker per county
  geom_point(
    data = california_centroids,
    mapping = aes(x = long, y = lat),
    color = "red", size = 2
  ) +
  # Title and axis labels
  labs(
    title = "California Elevation Map with County Boundaries and Centroids",
    x = "Longitude",
    y = "Latitude"
  ) +
  coord_sf() +
  theme_minimal()
# Save the reduced dataset to CSV without a row-name column
write.csv(x = reduced_col, file = "reduced_col.csv", row.names = FALSE)
# Save the county-name / FIPS-code lookup table to CSV
write.csv(
  select(california_counties_fibs_data, `County Name`, "fips_code"),
  file = "california_fibs.csv",
  row.names = FALSE
)
Visualize each variable on its own and each variable paired with the others:
How to visualize categorical and numerical variables.
Visualize each variable by itself and together with the variables it may have a meaningful relationship with.
Stats Questions:
S-Q1) What is the general pattern of NAs and NULLs?
S-Q2) What do the relationships of variables look like independently and together?
Math Questions:
M-Q1) What ML algorithms have been used in the past to study the epidemiology of COVID-19?
M-Q2) What is time series analysis?
Contextualization Questions:
C-Q1) What is the definition of a COVID-19 case?
C-Q2) What is the definition of a COVID-19 death?