For my second project, I chose to work on the us_contagious_diseases.csv data set, which includes six variables (disease, state, year, weeks_reporting, count, and population) with a mix of character and integer data types. I found this data set in our class-shared Google Drive and thought the data was extremely relevant and valuable to know as someone who lives in the United States. It contains information dating back to the early 20th century and could be used to create a fantastic visual. Before creating any visual, I needed to clean the data set and make it usable for a map of the United States showing the change in “count” (the total number of cases of a specific disease) over time, and eventually compare all of the diseases. I started by filtering my original data set, cd_us, to remove any NA or missing values. Following that, I created another data set, hp_us, for just hepatitis A cases, so I could look deeper into that disease. Next, changing the year from int to numeric data type in our data sets was necessary for future use. The us_states data set, which includes data for mapping, also needed to be cleaned to match the rest of my data sets, which list states with the first letter capitalized. I used str_to_title to fix this; for example, “alabama” became “Alabama.” Furthermore, to merge/join my data sets, I needed to assign a code to each state. I created a data frame called codes that assigns each state a number from 1 to 50, then merged this data frame with us_states, hp_us, and cd_us, which gives each state an identical number in all data sets. Subsequently, I used the left_join function to merge cd_us with us_states and hp_us with us_states so that I could create maps that show the change in disease count over time. Lastly, to ensure that the merged data did not contain any NA values and to remove the unnecessary column “subregion,” I filtered the merged cd_us and hp_us data sets using !is.na and select(-subregion).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tmap)
## Warning: package 'tmap' was built under R version 4.2.3
library(tmaptools)
## Warning: package 'tmaptools' was built under R version 4.2.3
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.2.3
library(sf)
## Warning: package 'sf' was built under R version 4.2.3
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(leaflet.extras)
## Warning: package 'leaflet.extras' was built under R version 4.2.3
library(dplyr)
library(rio)
## Warning: package 'rio' was built under R version 4.2.3
library(sp)
## Warning: package 'sp' was built under R version 4.2.3
library(urbnmapr)
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.2.3
library(gifski)
## Warning: package 'gifski' was built under R version 4.2.3
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:rio':
##
## export
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(stringr)
library(DataExplorer)
library(ggplot2)
library(ggfortify)
# Load the contagious-disease data.
# The CSV is read through its full path instead of calling setwd(), so the
# session's working directory is left untouched. Point `data_dir` at the
# folder that holds the file when running on another machine.
data_dir <- "C:/Users/jakea/OneDrive/Desktop/MC 2022/DATA-110"
cd_us <- read.csv(file.path(data_dir, "us_contagious_diseases.csv"))
head(cd_us) # preview the first rows
## disease state year weeks_reporting count population
## 1 Hepatitis A Alabama 1966 50 321 3345787
## 2 Hepatitis A Alabama 1967 49 291 3364130
## 3 Hepatitis A Alabama 1968 52 314 3386068
## 4 Hepatitis A Alabama 1969 49 380 3412450
## 5 Hepatitis A Alabama 1970 51 413 3444165
## 6 Hepatitis A Alabama 1971 51 378 3481798
# Inspect the column types and the first few values of each variable
str(cd_us)
## 'data.frame': 18870 obs. of 6 variables:
## $ disease : chr "Hepatitis A" "Hepatitis A" "Hepatitis A" "Hepatitis A" ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ year : int 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 ...
## $ weeks_reporting: int 50 49 52 49 51 51 45 45 45 46 ...
## $ count : int 321 291 314 380 413 378 342 467 244 286 ...
## $ population : int 3345787 3364130 3386068 3412450 3444165 3481798 3524543 3571209 3620548 3671246 ...
# Keep only records whose disease, year, weeks_reporting and population are
# all present.
cd_us <- cd_us %>%
  filter(
    !is.na(disease),
    !is.na(year),
    !is.na(weeks_reporting),
    !is.na(population)
  )

# Hepatitis A subset. cd_us has just been NA-filtered, so the subset only
# needs the disease condition.
hp_us <- cd_us %>%
  filter(disease %in% "Hepatitis A")
# State outline data for mapping: one row per polygon vertex (long/lat),
# with `group`, `order`, a lowercase `region` name and `subregion`.
us_states <- map_data("state")
head(us_states)
## long lat group order region subregion
## 1 -87.46201 30.38968 1 1 alabama <NA>
## 2 -87.48493 30.37249 1 2 alabama <NA>
## 3 -87.52503 30.37249 1 3 alabama <NA>
## 4 -87.53076 30.33239 1 4 alabama <NA>
## 5 -87.57087 30.32665 1 5 alabama <NA>
## 6 -87.58806 30.32665 1 6 alabama <NA>
# Title-case the region names (e.g. "alabama" -> "Alabama") so they are
# spelled like the `state` column of the disease data.
us_states <- us_states %>%
  mutate(region = str_to_title(tolower(region)))
head(us_states)
## long lat group order region subregion
## 1 -87.46201 30.38968 1 1 Alabama <NA>
## 2 -87.48493 30.37249 1 2 Alabama <NA>
## 3 -87.52503 30.37249 1 3 Alabama <NA>
## 4 -87.53076 30.33239 1 4 Alabama <NA>
## 5 -87.57087 30.32665 1 5 Alabama <NA>
## 6 -87.58806 30.32665 1 6 Alabama <NA>
# Lookup table that gives each of the 50 states a numeric code (1-50).
# `state.name` is R's built-in, alphabetically ordered vector of the 50 state
# names, so the table no longer depends on a hand-typed list.
# NOTE: "District of Columbia" is not one of the 50 names and gets no code.
codes <- data.frame(state = state.name, code = seq_along(state.name))
head(codes) # preview that data set
## state code
## 1 Alabama 1
## 2 Alaska 2
## 3 Arizona 3
## 4 Arkansas 4
## 5 California 5
## 6 Colorado 6
# Attach the numeric state code to every map vertex by matching the map's
# `region` column to the lookup table's `state` column.
us_states <- us_states %>%
  left_join(codes, by = c("region" = "state"))
head(us_states)
## long lat group order region subregion code
## 1 -87.46201 30.38968 1 1 Alabama <NA> 1
## 2 -87.48493 30.37249 1 2 Alabama <NA> 1
## 3 -87.52503 30.37249 1 3 Alabama <NA> 1
## 4 -87.53076 30.33239 1 4 Alabama <NA> 1
## 5 -87.57087 30.32665 1 5 Alabama <NA> 1
## 6 -87.58806 30.32665 1 6 Alabama <NA> 1
# Add the numeric state code to the Hepatitis A records.
hp_us <- hp_us %>%
  left_join(codes, by = "state")
head(hp_us)
## disease state year weeks_reporting count population code
## 1 Hepatitis A Alabama 1966 50 321 3345787 1
## 2 Hepatitis A Alabama 1967 49 291 3364130 1
## 3 Hepatitis A Alabama 1968 52 314 3386068 1
## 4 Hepatitis A Alabama 1969 49 380 3412450 1
## 5 Hepatitis A Alabama 1970 51 413 3444165 1
## 6 Hepatitis A Alabama 1971 51 378 3481798 1
# Add the numeric state code to the full disease data set.
cd_us <- cd_us %>%
  left_join(codes, by = "state")
head(cd_us)
## disease state year weeks_reporting count population code
## 1 Hepatitis A Alabama 1966 50 321 3345787 1
## 2 Hepatitis A Alabama 1967 49 291 3364130 1
## 3 Hepatitis A Alabama 1968 52 314 3386068 1
## 4 Hepatitis A Alabama 1969 49 380 3412450 1
## 5 Hepatitis A Alabama 1970 51 413 3444165 1
## 6 Hepatitis A Alabama 1971 51 378 3481798 1
# Attach the map polygons to every disease record.
# Each state code occurs in many disease rows and in many polygon vertices,
# so multiple matches per row are expected; `multiple = "all"` declares that
# explicitly, as the dplyr 1.1.0 warning recommends.
# NOTE(review): on dplyr >= 1.1.1 the equivalent declaration is
# `relationship = "many-to-many"` - confirm against the installed version.
cd_us_merged <- left_join(cd_us, us_states, by = "code", multiple = "all")

# Drop rows with a missing value in any of the listed columns (for example
# records without a state code or without matching map vertices), then remove
# the unused `subregion` column.
cd_us_merged <- cd_us_merged %>%
  filter(!is.na(disease) & !is.na(year) & !is.na(weeks_reporting) & !is.na(count) & !is.na(population) & !is.na(code) & !is.na(long) & !is.na(lat) & !is.na(order) & !is.na(region)) %>%
  select(-subregion)
# Attach the map polygons to every Hepatitis A record.
# Each state code occurs in many yearly rows and in many polygon vertices, so
# multiple matches per row are expected; `multiple = "all"` declares that
# explicitly, as the dplyr 1.1.0 warning recommends.
# NOTE(review): on dplyr >= 1.1.1 the equivalent declaration is
# `relationship = "many-to-many"` - confirm against the installed version.
hp_us_merged <- left_join(hp_us, us_states, by = "code", multiple = "all")
# Preview the merged data: each disease record is repeated once per map vertex
head(hp_us_merged)
## disease state year weeks_reporting count population code long
## 1 Hepatitis A Alabama 1966 50 321 3345787 1 -87.46201
## 2 Hepatitis A Alabama 1966 50 321 3345787 1 -87.48493
## 3 Hepatitis A Alabama 1966 50 321 3345787 1 -87.52503
## 4 Hepatitis A Alabama 1966 50 321 3345787 1 -87.53076
## 5 Hepatitis A Alabama 1966 50 321 3345787 1 -87.57087
## 6 Hepatitis A Alabama 1966 50 321 3345787 1 -87.58806
## lat group order region subregion
## 1 30.38968 1 1 Alabama <NA>
## 2 30.37249 1 2 Alabama <NA>
## 3 30.37249 1 3 Alabama <NA>
## 4 30.33239 1 4 Alabama <NA>
## 5 30.32665 1 5 Alabama <NA>
## 6 30.32665 1 6 Alabama <NA>
# Remove the unused `subregion` column and keep only rows that are complete
# in every column needed for plotting.
hp_us_merged <- hp_us_merged %>%
  select(-subregion) %>%
  filter(
    !is.na(disease),
    !is.na(state),
    !is.na(year),
    !is.na(weeks_reporting),
    !is.na(count),
    !is.na(population),
    !is.na(code),
    !is.na(long),
    !is.na(lat),
    !is.na(order),
    !is.na(region)
  )
# Interactive scatter plot of Hepatitis A case counts against weeks reported.
hp_us_plot <- ggplot(hp_us, aes(count, weeks_reporting)) +
  # alpha is a fixed setting, so it belongs outside aes(); inside aes() it is
  # treated as a mapped variable and gets its own legend entry.
  geom_point(aes(size = count, color = state), alpha = 0.5) +
  # "none" replaces the deprecated FALSE; the colour legend is dropped
  # because one entry per state is too cluttered.
  guides(color = "none")
hp_us_plot <- ggplotly(hp_us_plot)
hp_us_plot
# Animated version of the scatter plot, one frame per year.
# NOTE(review): transition_time(year) should resolve `year` from the plot
# data, so this global copy looks redundant; it is kept in case later code
# relies on it.
year <- hp_us$year
hp_us_plot <- ggplot(hp_us, aes(count, weeks_reporting)) +
  # alpha is a fixed setting, so it belongs outside aes()
  geom_point(aes(size = count, color = state), alpha = 0.5) +
  guides(color = "none") + # removes the colour legend ("none" replaces FALSE)
  labs(title = 'Year: {frame_time}', x = '#Of Infections', y = 'Weeks Reported') +
  transition_time(year) +
  ease_aes('linear')
hp_us_plot
# Distribution of yearly Hepatitis A counts per state (first draft).
# Plotted from hp_us (one row per state and year) rather than hp_us_merged,
# which repeats every record once per map vertex and only contains states
# that have map polygons.
ggplot(hp_us, aes(x = state, y = count)) +
  geom_boxplot()
# Styled, horizontal version of the per-state box plot.
# Plotted from hp_us (one row per state and year) rather than hp_us_merged,
# which repeats every record once per map vertex.
ggplot(hp_us, aes(x = state, y = count, color = state)) +
  geom_boxplot() +
  guides(color = "none") + # "none" replaces the deprecated FALSE
  labs(title = "Distribution of Hepatitis A Case Counts by State", y = "Count", x = "States") +
  coord_flip() + # state names read more easily on the vertical axis
  theme(axis.text.y = element_text(size = 7))
# Animated per-state box plot, one frame per year.
# Plotted from hp_us (one row per state and year) rather than hp_us_merged,
# which repeats every record once per map vertex.
ggplot(hp_us, aes(x = state, y = count, color = state)) +
  geom_boxplot() +
  guides(color = "none") + # "none" replaces the deprecated FALSE
  labs(title = "Distribution of Hepatitis A Case Counts by State (Year: {frame_time})", y = "Count", x = "States") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 7)) +
  transition_time(year) +
  ease_aes('linear')
# Animated bar chart of Hepatitis A counts per state, one frame per year.
# geom_col() stacks every row it receives, so the bars must be built from
# hp_us (one row per state and year). hp_us_merged repeats each record once
# per map vertex, which would multiply every bar by the state's vertex count.
ggplot(hp_us, aes(x = state, y = count, fill = state)) +
  geom_col() +
  guides(fill = "none") + # "none" replaces the deprecated FALSE
  labs(title = "Distribution of Hepatitis A Case Counts by State (Year: {frame_time})", y = "Count", x = "States") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 7)) +
  transition_time(year) +
  ease_aes('linear')
# Interactive scatter plot of case counts by year, coloured by state, with
# linear trend lines drawn in red.
cl_plot <- ggplot(cd_us, aes(x = year, y = count, color = state)) +
  guides(color = "none", fill = "none") + # "none" replaces the deprecated FALSE
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x, color = "red") +
  labs(title = "Count of Diseases in Each Year") +
  xlab("Year") +
  ylab("Count") +
  theme_minimal()
cl_plot <- ggplotly(cl_plot)
cl_plot
# Same count-by-year scatter plot, split into one panel per state.
cl_plot <- ggplot(cd_us, aes(x = year, y = count, color = state)) +
  guides(color = "none", fill = "none") + # "none" replaces the deprecated FALSE
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x, color = "red") +
  labs(title = "Count of Diseases in Each Year") +
  xlab("Year") +
  ylab("Count") +
  theme_minimal() +
  facet_wrap(~ state)
cl_plot
# Correlation heat map of the variables in cd_us (DataExplorer); `state` is
# skipped because it has more than 20 categories, as the message below reports
plot_correlation(cd_us)
## 1 features with more than 20 categories ignored!
## state: 51 categories
## Warning: Removed 22 rows containing missing values (`geom_text()`).
# Summary statistics for every column of the cleaned data set
summary(cd_us)
## disease state year weeks_reporting
## Length:18666 Length:18666 Min. :1928 Min. : 0.0
## Class :character Class :character 1st Qu.:1957 1st Qu.:16.0
## Mode :character Mode :character Median :1977 Median :44.0
## Mean :1974 Mean :33.6
## 3rd Qu.:1992 3rd Qu.:50.0
## Max. :2011 Max. :52.0
##
## count population code
## Min. : 0 Min. : 86853 Min. : 1.00
## 1st Qu.: 2 1st Qu.: 1046542 1st Qu.:13.00
## Median : 49 Median : 2824918 Median :26.00
## Mean : 1381 Mean : 4242911 Mean :25.71
## 3rd Qu.: 453 3rd Qu.: 5153640 3rd Qu.:38.00
## Max. :132342 Max. :37607525 Max. :50.00
## NA's :370
# Linear model of case count on population and weeks of reporting
cd_us_fit <- lm(count ~ population + weeks_reporting, data = cd_us)
# Coefficients, residual spread and R-squared of the fit
summary(cd_us_fit)
##
## Call:
## lm(formula = count ~ population + weeks_reporting, data = cd_us)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5630 -1787 -1071 439 129385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.529e+02 8.234e+01 -10.36 <2e-16 ***
## population 1.076e-04 8.269e-06 13.02 <2e-16 ***
## weeks_reporting 5.288e+01 1.992e+00 26.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5398 on 18663 degrees of freedom
## Multiple R-squared: 0.04878, Adjusted R-squared: 0.04868
## F-statistic: 478.5 on 2 and 18663 DF, p-value: < 2.2e-16
# Diagnostic plots 1 to 4 for the fitted linear model (ggfortify)
cd_us_plots <- autoplot(cd_us_fit, which = 1:4)
cd_us_plots
# Interactive base map of the states, each region filled with its own colour.
us_map <- ggplot(
  data = us_states,
  mapping = aes(x = long, y = lat, group = group, fill = region)
) +
  geom_polygon() +
  guides(fill = "none") # "none" replaces the deprecated FALSE
us_map <- ggplotly(us_map)
us_map
# First draft of the Hepatitis A map.
# Polygons are grouped by the map data's own `group` column rather than by
# state code: a state can be made of several separate outlines, and grouping
# by `code` joins them into one path with stray connecting lines.
# NOTE(review): hp_us_merged holds every year for each vertex and a polygon
# takes a single fill value, so this static map does not summarise the whole
# period - consider filtering to one year or aggregating first.
ggplot(data = hp_us_merged, mapping = aes(x = long, y = lat, group = group, fill = count, color = region)) +
  geom_polygon()
# Static Hepatitis A map with a title and the outline legend removed.
# Polygons are grouped by the map data's own `group` column rather than by
# state code: a state can be made of several separate outlines, and grouping
# by `code` joins them into one path with stray connecting lines.
# NOTE(review): hp_us_merged holds every year for each vertex and a polygon
# takes a single fill value, so this static map does not summarise the whole
# period - consider filtering to one year or aggregating first.
ggplot(data = hp_us_merged, mapping = aes(x = long, y = lat, group = group, fill = count, color = region)) +
  geom_polygon() +
  guides(color = "none") + # "none" replaces the deprecated FALSE
  labs(title = "Count of Hepatitis A in the United States from 1966-2011") +
  theme_classic()
# Animated Hepatitis A map, one frame per year.
# Polygons are grouped by the map data's own `group` column rather than by
# state code: a state can be made of several separate outlines, and grouping
# by `code` joins them into one path with stray connecting lines.
ggplot(data = hp_us_merged, mapping = aes(x = long, y = lat, group = group, fill = count, color = region)) +
  geom_polygon() +
  guides(color = "none") + # "none" replaces the deprecated FALSE
  labs(title = "Count of Hepatitis A in the United States from 1966-2011 (Year: {frame_time})") +
  theme_classic() +
  transition_time(year) +
  ease_aes('linear')
# Animated Hepatitis A map with a white-to-red fill scale, one frame per year.
# Polygons are grouped by the map data's own `group` column rather than by
# state code: a state can be made of several separate outlines, and grouping
# by `code` joins them into one path with stray connecting lines.
ggplot(data = hp_us_merged, mapping = aes(x = long, y = lat, group = group, fill = count, color = region)) +
  geom_polygon() +
  scale_fill_gradient(name = "Disease Count", low = "white", high = "red") +
  guides(color = "none") + # "none" replaces the deprecated FALSE
  labs(title = "Count of Hepatitis A in the United States from 1966-2011 (Year: {frame_time})") +
  theme_classic() +
  transition_time(year) +
  ease_aes('linear')
# Animated map of every disease in the data set: one panel per disease and
# one frame per year.
# Polygons are grouped by the map data's own `group` column rather than by
# state code: a state can be made of several separate outlines, and grouping
# by `code` joins them into one path with stray connecting lines.
ggplot(data = cd_us_merged, mapping = aes(x = long, y = lat, group = group, fill = count, color = region)) +
  geom_polygon() +
  scale_fill_gradient(name = "Disease Count", low = "white", high = "red") +
  guides(color = "none") + # "none" replaces the deprecated FALSE
  labs(title = "Count of Different Contagious Diseases in the United States (Year: {frame_time})") +
  theme_classic() +
  transition_time(year) +
  ease_aes('linear') +
  facet_wrap(~ disease, scales = "free")