DS Labs Assignment

Author

Emrick Dallo

# install.packages("dslabs") # these are data science labs
library("dslabs")
Warning: package 'dslabs' was built under R version 4.3.3
# List the datasets bundled with the dslabs package
data(package = "dslabs")
# Show the files in the package's installed "script" directory
system.file("script", package = "dslabs") |>
  list.files()
 [1] "make-admissions.R"                   
 [2] "make-brca.R"                         
 [3] "make-brexit_polls.R"                 
 [4] "make-calificaciones.R"               
 [5] "make-death_prob.R"                   
 [6] "make-divorce_margarine.R"            
 [7] "make-gapminder-rdas.R"               
 [8] "make-greenhouse_gases.R"             
 [9] "make-historic_co2.R"                 
[10] "make-mice_weights.R"                 
[11] "make-mnist_127.R"                    
[12] "make-mnist_27.R"                     
[13] "make-movielens.R"                    
[14] "make-murders-rda.R"                  
[15] "make-na_example-rda.R"               
[16] "make-nyc_regents_scores.R"           
[17] "make-olive.R"                        
[18] "make-outlier_example.R"              
[19] "make-polls_2008.R"                   
[20] "make-polls_us_election_2016.R"       
[21] "make-pr_death_counts.R"              
[22] "make-reported_heights-rda.R"         
[23] "make-research_funding_rates.R"       
[24] "make-stars.R"                        
[25] "make-temp_carbon.R"                  
[26] "make-tissue-gene-expression.R"       
[27] "make-trump_tweets.R"                 
[28] "make-weekly_us_contagious_diseases.R"
[29] "save-gapminder-example-csv.R"        
data("us_contagious_diseases")
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
Warning: package 'ggthemes' was built under R version 4.3.3
library(ggrepel)
Warning: package 'ggrepel' was built under R version 4.3.3
library(ggplot2)
#view(us_contagious_diseases)
#write_csv(us_contagious_diseases, "us_contagious_diseases.csv", na="")

I installed a package called reshape2 to help visualize the filtered data better. It helped by converting the data from long format to wide format. The dcast function is what rearranged the data.

library(dslabs)
library(dplyr)
library(ggplot2)
library(reshape2)
Warning: package 'reshape2' was built under R version 4.3.3

Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
# Make the contagious-disease table available in the workspace
data("us_contagious_diseases")
# Build the heatmap input: keep only the Hepatitis A rows, keep just the
# year/state/count columns, then spread the states into columns so there is
# one row per year. `fill = 0` is passed to dcast to fill cells that have no
# matching year/state row.
hepatitis_data <- reshape2::dcast(
  dplyr::select(
    dplyr::filter(us_contagious_diseases, disease == "Hepatitis A"),
    year, state, count
  ),
  formula = year ~ state,
  value.var = "count",
  fill = 0
)

Making a heatmap for hepatitis A cases over the years by state.

 # Melt the wide table back to long form (year, variable = state,
 # value = count) and draw one tile per state/year, shaded white -> red.
 hepatitis_data |>
  melt(id.vars = "year") |>
  ggplot(mapping = aes(x = variable, y = year, fill = value)) +
  geom_tile(color = "lightblue") +
  scale_fill_gradient(low = "white", high = "red") +
  theme_clean() +
  # Applied after theme_clean() so the rotated state labels are not overridden
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(
    title = "Hepatitis A Cases over the Years by State",
    x = "State",
    y = "Year",
    fill = "Hepatitis A Cases"
  )

Conclusion

For this assignment, I used the US contagious diseases data set. The data set had a lot of variables. I chose to stick with one disease and compare its effect across the fifty states. I filtered the data to keep only hepatitis A and used only the three columns I needed, which were state, count, and year.