HW 8, Michael Simms

HW 8, Michael Simms

Loading the Libraries and Dataset

# Attach dslabs, list the datasets it provides, and show the scripts that
# were used to build them.
library(dslabs)
data(package = "dslabs")
system.file("script", package = "dslabs") |>
  list.files()
 [1] "make-admissions.R"                   
 [2] "make-brca.R"                         
 [3] "make-brexit_polls.R"                 
 [4] "make-death_prob.R"                   
 [5] "make-divorce_margarine.R"            
 [6] "make-gapminder-rdas.R"               
 [7] "make-greenhouse_gases.R"             
 [8] "make-historic_co2.R"                 
 [9] "make-mice_weights.R"                 
[10] "make-mnist_27.R"                     
[11] "make-movielens.R"                    
[12] "make-murders-rda.R"                  
[13] "make-na_example-rda.R"               
[14] "make-nyc_regents_scores.R"           
[15] "make-olive.R"                        
[16] "make-outlier_example.R"              
[17] "make-polls_2008.R"                   
[18] "make-polls_us_election_2016.R"       
[19] "make-reported_heights-rda.R"         
[20] "make-research_funding_rates.R"       
[21] "make-stars.R"                        
[22] "make-temp_carbon.R"                  
[23] "make-tissue-gene-expression.R"       
[24] "make-trump_tweets.R"                 
[25] "make-weekly_us_contagious_diseases.R"
[26] "save-gapminder-example-csv.R"        
# Load the disease dataset into the workspace, then attach the tidyverse
# (dplyr, ggplot2, readr, ...) used by the rest of the analysis.
data(us_contagious_diseases)
library("tidyverse")
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# NOTE(review): no ggthemes/ggrepel functions are called in the code shown
# here -- confirm these two packages are still needed.
library("ggthemes")
library("ggrepel")

# Save a CSV copy in the working directory (missing values written as empty
# strings), then inspect the column types.
us_contagious_diseases |>
  write_csv(file = "us_contagious_diseases.csv", na = "")
us_contagious_diseases |>
  str()
'data.frame':   16065 obs. of  6 variables:
 $ disease        : Factor w/ 7 levels "Hepatitis A",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ state          : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ year           : num  1966 1967 1968 1969 1970 ...
 $ weeks_reporting: num  50 49 52 49 51 51 45 45 45 46 ...
 $ count          : num  321 291 314 380 413 378 342 467 244 286 ...
 $ population     : num  3345787 3364130 3386068 3412450 3444165 ...
# I chose this dataset out of an interest in the occurrence of disease; I'm curious to identify trends in it.

Cleaning and Exploring the Data Variables

# Preview the first rows of the full dataset.
us_contagious_diseases |>
  head()
      disease   state year weeks_reporting count population
1 Hepatitis A Alabama 1966              50   321    3345787
2 Hepatitis A Alabama 1967              49   291    3364130
3 Hepatitis A Alabama 1968              52   314    3386068
4 Hepatitis A Alabama 1969              49   380    3412450
5 Hepatitis A Alabama 1970              51   413    3444165
6 Hepatitis A Alabama 1971              51   378    3481798
# List the distinct diseases recorded in the dataset.
unique(us_contagious_diseases[["disease"]])
[1] Hepatitis A Measles     Mumps       Pertussis   Polio       Rubella    
[7] Smallpox   
Levels: Hepatitis A Measles Mumps Pertussis Polio Rubella Smallpox
# Here we can see the seven diseases included in this dataset.
# Keep only the polio records and preview the result.
us_contagious_diseases2 <- filter(us_contagious_diseases, disease == "Polio")
us_contagious_diseases2 |>
  head()
  disease   state year weeks_reporting count population
1   Polio Alabama 1928              52    62    2589923
2   Polio Alabama 1929              52    58    2619131
3   Polio Alabama 1930              51    67    2646248
4   Polio Alabama 1931              52    54    2670818
5   Polio Alabama 1932              52    35    2693027
6   Polio Alabama 1933              52    28    2713243
# Check the size and column types of the polio-only data frame.
us_contagious_diseases2 |>
  str()
'data.frame':   2091 obs. of  6 variables:
 $ disease        : Factor w/ 7 levels "Hepatitis A",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ state          : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ year           : num  1928 1929 1930 1931 1932 ...
 $ weeks_reporting: num  52 52 51 52 52 52 52 52 52 52 ...
 $ count          : num  62 58 67 54 35 28 50 61 420 84 ...
 $ population     : num  2589923 2619131 2646248 2670818 2693027 ...
# 2,091 observations is still quite unwieldy, so I will narrow the data to a more manageable size.
# List the distinct state names present in the polio data.
unique(us_contagious_diseases2[["state"]])
 [1] Alabama              Alaska               Arizona             
 [4] Arkansas             California           Colorado            
 [7] Connecticut          Delaware             District Of Columbia
[10] Florida              Georgia              Hawaii              
[13] Idaho                Illinois             Indiana             
[16] Iowa                 Kansas               Kentucky            
[19] Louisiana            Maine                Maryland            
[22] Massachusetts        Michigan             Minnesota           
[25] Mississippi          Missouri             Montana             
[28] Nebraska             Nevada               New Hampshire       
[31] New Jersey           New Mexico           New York            
[34] North Carolina       North Dakota         Ohio                
[37] Oklahoma             Oregon               Pennsylvania        
[40] Rhode Island         South Carolina       South Dakota        
[43] Tennessee            Texas                Utah                
[46] Vermont              Virginia             Washington          
[49] West Virginia        Wisconsin            Wyoming             
51 Levels: Alabama Alaska Arizona Arkansas California Colorado ... Wyoming
# I also want to check exactly how the names are spelled in the state variable (note the uppercase "O" in "District Of Columbia"), since the filter below must match them exactly.
# Restrict the polio data to D.C., Maryland, and Virginia, then preview it.
# The spelling "District Of Columbia" must match the factor level exactly.
us_contagious_diseases3 <- us_contagious_diseases2 |>
  filter(state %in% c("District Of Columbia", "Maryland", "Virginia"))
us_contagious_diseases3 |>
  head()
  disease                state year weeks_reporting count population
1   Polio District Of Columbia 1928              41    33     472771
2   Polio District Of Columbia 1929              52     6     478871
3   Polio District Of Columbia 1930              51     9     486869
4   Polio District Of Columbia 1931              52    15     497179
5   Polio District Of Columbia 1932              52    34     509735
6   Polio District Of Columbia 1933              52     7     524346
# Here I create a new data frame containing only the District of Columbia, Maryland, and Virginia, as a state-level stand-in for the Washington Metropolitan Area.
# Check the size and column types of the three-jurisdiction data frame.
us_contagious_diseases3 |>
  str()
'data.frame':   123 obs. of  6 variables:
 $ disease        : Factor w/ 7 levels "Hepatitis A",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ state          : Factor w/ 51 levels "Alabama","Alaska",..: 9 9 9 9 9 9 9 9 9 9 ...
 $ year           : num  1928 1929 1930 1931 1932 ...
 $ weeks_reporting: num  41 52 51 52 52 52 52 52 51 52 ...
 $ count          : num  33 6 9 15 34 7 9 86 6 30 ...
 $ population     : num  472771 478871 486869 497179 509735 ...
# 123 observations (3 jurisdictions x 41 years) -- a much more workable size.
# List the distinct years covered by the filtered data.
unique(us_contagious_diseases3[["year"]])
 [1] 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942
[16] 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957
[31] 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968
# Knowing the year range (1928-1968) is important before creating the visualization.

Creating the Visualization

# Line-and-point chart of yearly reported polio cases for D.C., Maryland,
# and Virginia.
#
# Color is mapped once in ggplot() so that both layers inherit it: the lines
# AND the points are colored by state and share a single legend. (Mapping
# color only inside geom_line() leaves the points uncolored, so a point
# cannot be tied to its jurisdiction where the series overlap.)
#
# NOTE(review): `count` is the raw number of reported cases; it is not
# adjusted for `population` or `weeks_reporting`, so levels are not directly
# comparable across jurisdictions -- consider plotting a rate if that matters.
p1 <- ggplot(
  data = us_contagious_diseases3,
  mapping = aes(x = year, y = count, color = state)
) +
  geom_line() +
  # Points are drawn after the lines so the yearly observations stay visible.
  geom_point() +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Polio Cases in the Washington Metropolitan Area, 1928-1968",
    x = "Year",
    y = "Number of Reported Polio Cases",
    color = "State"
  ) +
  theme_minimal(base_size = 12)
p1

# This chunk plots the polio cases per year in each jurisdiction (state) as points connected by lines, with axis labels and a title, an adjusted base font size, and a custom color palette. I find the plot most informative in showing that the number of cases rose and fell sharply around mid-century.