DATA607 Final Project Code

Load libraries

options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("httr")

## Installing package into 'C:/Users/PATELM70/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'httr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\PATELM70\AppData\Local\Temp\RtmpC4Yojn\downloaded_packages

install.packages("jsonlite")

## Installing package into 'C:/Users/PATELM70/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'jsonlite' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'jsonlite'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\PATELM70\AppData\Local\R\win-library\4.4\00LOCK\jsonlite\libs\x64\jsonlite.dll
## to
## C:\Users\PATELM70\AppData\Local\R\win-library\4.4\jsonlite\libs\x64\jsonlite.dll:
## Permission denied

## Warning: restored 'jsonlite'

## 
## The downloaded binary packages are in
##  C:\Users\PATELM70\AppData\Local\Temp\RtmpC4Yojn\downloaded_packages

install.packages("tidyverse")

## Installing package into 'C:/Users/PATELM70/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\PATELM70\AppData\Local\Temp\RtmpC4Yojn\downloaded_packages

install.packages("ggplot2")

## Installing package into 'C:/Users/PATELM70/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\PATELM70\AppData\Local\Temp\RtmpC4Yojn\downloaded_packages

library(httr)

## Warning: package 'httr' was built under R version 4.4.3

library(jsonlite)

## Warning: package 'jsonlite' was built under R version 4.4.3

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

Data set 1

# Download the CDC "Vaccination Coverage among Young Children (0-35 Months)"
# data set as JSON and parse it into a nested R list.
url <- "https://data.cdc.gov/api/views/fhky-rtsk/rows.json?accessType=DOWNLOAD"
response <- GET(url)

# Fail fast on an HTTP error status; otherwise the body of an error page would
# be handed to the JSON parser and fail with a misleading message.
stop_for_status(response)

# State the encoding explicitly so decoding the body does not depend on what
# the response headers declare.
json_data <- content(response, as = "text", encoding = "UTF-8")
vaccine_data_list <- fromJSON(json_data, flatten = TRUE)

# Inspect the structure of the resulting list
str(vaccine_data_list)

## List of 2
##  $ meta:List of 1
##   ..$ view:List of 40
##   .. ..$ id                      : chr "fhky-rtsk"
##   .. ..$ name                    : chr "Vaccination Coverage among Young Children (0-35 Months)"
##   .. ..$ assetType               : chr "dataset"
##   .. ..$ attribution             : chr "National Center for Immunization and Respiratory Diseases (NCIRD)"
##   .. ..$ averageRating           : int 0
##   .. ..$ category                : chr "Child Vaccinations"
##   .. ..$ createdAt               : int 1620841972
##   .. ..$ description             : chr "Vaccination Coverage among Young Children (0-35 Months)\n\n• National, regional, state, and selected local area"| __truncated__
##   .. ..$ diciBackend             : logi FALSE
##   .. ..$ displayType             : chr "table"
##   .. ..$ downloadCount           : int 4862
##   .. ..$ hideFromCatalog         : logi FALSE
##   .. ..$ hideFromDataJson        : logi FALSE
##   .. ..$ locked                  : logi FALSE
##   .. ..$ newBackend              : logi TRUE
##   .. ..$ numberOfComments        : int 0
##   .. ..$ oid                     : int 37173135
##   .. ..$ provenance              : chr "official"
##   .. ..$ publicationAppendEnabled: logi FALSE
##   .. ..$ publicationDate         : int 1620874584
##   .. ..$ publicationGroup        : int 18303992
##   .. ..$ publicationStage        : chr "published"
##   .. ..$ rowsUpdatedAt           : int 1727374116
##   .. ..$ rowsUpdatedBy           : chr "b3ca-i9my"
##   .. ..$ tableId                 : int 18303992
##   .. ..$ totalTimesRated         : int 0
##   .. ..$ viewCount               : int 21704
##   .. ..$ viewLastModified        : int 1644612339
##   .. ..$ viewType                : chr "tabular"
##   .. ..$ approvals               :'data.frame':  1 obs. of  16 variables:
##   .. .. ..$ reviewedAt                               : int 1644522373
##   .. .. ..$ reviewedAutomatically                    : logi TRUE
##   .. .. ..$ state                                    : chr "approved"
##   .. .. ..$ submissionId                             : int 4427834
##   .. .. ..$ submissionObject                         : chr "public_audience_request"
##   .. .. ..$ submissionOutcome                        : chr "change_audience"
##   .. .. ..$ submittedAt                              : int 1644522373
##   .. .. ..$ targetAudience                           : chr "public"
##   .. .. ..$ workflowId                               : int 2100
##   .. .. ..$ submissionDetails.permissionType         : chr "READ"
##   .. .. ..$ submissionOutcomeApplication.endedAt     : int 1644522373
##   .. .. ..$ submissionOutcomeApplication.failureCount: int 0
##   .. .. ..$ submissionOutcomeApplication.startedAt   : int 1644522373
##   .. .. ..$ submissionOutcomeApplication.status      : chr "success"
##   .. .. ..$ submitter.id                             : chr "x3nx-2a2w"
##   .. .. ..$ submitter.displayName                    : chr "Julia A. Falvey"
##   .. ..$ clientContext           :List of 2
##   .. .. ..$ clientContextVariables: list()
##   .. .. ..$ inheritedVariables    : Named list()
##   .. ..$ columns                 :'data.frame':  18 obs. of  16 variables:
##   .. .. ..$ id                        : int [1:18] -1 -1 -1 -1 -1 -1 -1 -1 540437842 540437841 ...
##   .. .. ..$ name                      : chr [1:18] "sid" "id" "position" "created_at" ...
##   .. .. ..$ dataTypeName              : chr [1:18] "meta_data" "meta_data" "meta_data" "meta_data" ...
##   .. .. ..$ fieldName                 : chr [1:18] ":sid" ":id" ":position" ":created_at" ...
##   .. .. ..$ position                  : int [1:18] 0 0 0 0 0 0 0 0 2 3 ...
##   .. .. ..$ renderTypeName            : chr [1:18] "meta_data" "meta_data" "meta_data" "meta_data" ...
##   .. .. ..$ flags                     :List of 18
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : chr "hidden"
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. ..$ description               : chr [1:18] NA NA NA NA ...
##   .. .. ..$ tableColumnId             : int [1:18] NA NA NA NA NA NA NA NA 137964304 137964303 ...
##   .. .. ..$ cachedContents.non_null   : chr [1:18] NA NA NA NA ...
##   .. .. ..$ cachedContents.largest    : chr [1:18] NA NA NA NA ...
##   .. .. ..$ cachedContents.null       : chr [1:18] NA NA NA NA ...
##   .. .. ..$ cachedContents.top        :List of 18
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ :'data.frame':    11 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:11] "Hib" "Hep B" "PCV" "DTaP" ...
##   .. .. .. .. ..$ count: chr [1:11] "25377" "20094" "17274" "17273" ...
##   .. .. .. ..$ :'data.frame':    10 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:10] "≥3 Doses" "" "≥2 Doses" "≥1 Dose" ...
##   .. .. .. .. ..$ count: chr [1:10] "38729" "25061" "21186" "12826" ...
##   .. .. .. ..$ :'data.frame':    2 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:2] "States/Local Areas" "HHS Regions/National"
##   .. .. .. .. ..$ count: chr [1:2] "112272" "15916"
##   .. .. .. ..$ :'data.frame':    20 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:20] "Georgia" "Texas" "Pennsylvania" "United States" ...
##   .. .. .. .. ..$ count: chr [1:20] "1856" "1856" "1856" "1856" ...
##   .. .. .. ..$ :'data.frame':    20 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:20] "2016-2019" "2014-2017" "2016-2017" "2016" ...
##   .. .. .. .. ..$ count: chr [1:20] "12450" "12435" "5624" "5623" ...
##   .. .. .. ..$ :'data.frame':    6 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:6] "Age" "Race and Ethnicity" "Poverty" "Insurance Coverage" ...
##   .. .. .. .. ..$ count: chr [1:6] "103303" "6946" "5655" "5339" ...
##   .. .. .. ..$ :'data.frame':    20 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:20] "24 Months" "35 Months" "19 Months" "13 Months" ...
##   .. .. .. .. ..$ count: chr [1:20] "20917" "19527" "18153" "13958" ...
##   .. .. .. ..$ :'data.frame':    20 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:20] "92.6" "91.7" "92.0" "92.8" ...
##   .. .. .. .. ..$ count: chr [1:20] "800" "790" "787" "775" ...
##   .. .. .. ..$ :'data.frame':    20 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:20] "91.4 to 96.9" "91.0 to 96.7" "89.0 to 95.5" "90.1 to 97.4" ...
##   .. .. .. .. ..$ count: chr [1:20] "32" "31" "25" "24" ...
##   .. .. .. ..$ :'data.frame':    20 obs. of  2 variables:
##   .. .. .. .. ..$ item : chr [1:20] "293" "271" "243" "248" ...
##   .. .. .. .. ..$ count: chr [1:20] "859" "681" "680" "652" ...
##   .. .. ..$ cachedContents.smallest   : chr [1:18] NA NA NA NA ...
##   .. .. ..$ cachedContents.count      : chr [1:18] NA NA NA NA ...
##   .. .. ..$ cachedContents.cardinality: chr [1:18] NA NA NA NA ...
##   .. ..$ grants                  :'data.frame':  1 obs. of  3 variables:
##   .. .. ..$ inherited: logi FALSE
##   .. .. ..$ type     : chr "viewer"
##   .. .. ..$ flags    :List of 1
##   .. .. .. ..$ : chr "public"
##   .. ..$ metadata                :List of 2
##   .. .. ..$ custom_fields        :List of 1
##   .. .. .. ..$ Common Core:List of 5
##   .. .. .. .. ..$ Contact Email: chr "VaxView@cdc.gov"
##   .. .. .. .. ..$ Homepage     : chr "https://www.cdc.gov/vaccines/imz-managers/coverage/childvaxview/index.html"
##   .. .. .. .. ..$ Contact Name : chr " "
##   .. .. .. .. ..$ Program Code : chr "009:020"
##   .. .. .. .. ..$ Bureau Code  : chr "009:20"
##   .. .. ..$ availableDisplayTypes: chr [1:3] "table" "fatrow" "page"
##   .. ..$ owner                   :List of 5
##   .. .. ..$ id         : chr "knbc-mfp8"
##   .. .. ..$ displayName: chr "NCIRD"
##   .. .. ..$ screenName : chr "NCIRD"
##   .. .. ..$ type       : chr "interactive"
##   .. .. ..$ flags      : chr [1:2] "acceptedEula" "mayBeStoriesCoOwner"
##   .. ..$ query                   : Named list()
##   .. ..$ rights                  : chr "read"
##   .. ..$ tableAuthor             :List of 5
##   .. .. ..$ id         : chr "knbc-mfp8"
##   .. .. ..$ displayName: chr "NCIRD"
##   .. .. ..$ screenName : chr "NCIRD"
##   .. .. ..$ type       : chr "interactive"
##   .. .. ..$ flags      : chr [1:2] "acceptedEula" "mayBeStoriesCoOwner"
##   .. ..$ tags                    : chr [1:18] "vaxviews" "vaccination" "immunization" "vaccination coverage" ...
##   .. ..$ flags                   : chr [1:4] "default" "ownerMayBeContacted" "restorable" "restorePossibleForType"
##  $ data: chr [1:128188, 1:18] "row-s9mi-m2cf_n7rp" "row-3eg9_5em9.j2vi" "row-cm7i_br7b_5rn8" "row-iikw.rebt_d3tm" ...

# Extract the row matrix stored under "data" and convert it to a data frame.
# Every later step needs `vaccine_data`, so stop with a clear error when the
# key is missing instead of printing a note and failing further down with an
# unrelated "object 'vaccine_data' not found" error.
if (!"data" %in% names(vaccine_data_list)) {
  stop(
    "The key containing the data was not found in the JSON structure",
    call. = FALSE
  )
}

vaccine_data <- as.data.frame(vaccine_data_list$data, stringsAsFactors = FALSE)

# The parsed matrix carries no column names, so as.data.frame() assigns the
# default names V1, V2, ...
column_names <- colnames(vaccine_data)

# Print the column names
print(column_names)

##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11" "V12"
## [13] "V13" "V14" "V15" "V16" "V17" "V18"

Clean Data set 1

# Keep only the columns needed for the analysis and give them readable names.
vaccine_data_filtered <- vaccine_data[, c("V9", "V12", "V13", "V14", "V15", "V16")]
new_column_names <- c(
  "Vaccine Type", "Geography", "Birth.Year.Birth.Cohort",
  "Dimension.Type", "Dimension", "Estimate"
)
colnames(vaccine_data_filtered) <- new_column_names

# Collapse cohort ranges such as "2016-2019" to their first year; single years
# are left untouched. sub() is already vectorised, so it is applied to the
# whole column directly: the previous sapply() wrapper attached the input
# strings as names and would return list() for a zero-length column.
vaccine_data_filtered$Birth.Year.Birth.Cohort <- sub(
  "^(\\d{4})-\\d{4}$", "\\1",
  vaccine_data_filtered$Birth.Year.Birth.Cohort
)

# Shorten selected vaccine labels for the analysis.
vaccine_type <- vaccine_data_filtered$`Vaccine Type`
vaccine_type[vaccine_type == "≥1 Dose Varicella"] <- "Varicella"
vaccine_type[vaccine_type == "≥1 Dose MMR"] <- "Measles, Mumps, Rubella"
vaccine_type[vaccine_type == "Combined 7 Series"] <- "Combined Series"
vaccine_data_filtered$`Vaccine Type` <- vaccine_type

# Fold local areas into their state: any geography whose label starts with a
# state abbreviation (TX, IL, NY, PA) is replaced by the full state name.
geography <- vaccine_data_filtered$Geography
geography[grepl("^TX", geography)] <- "Texas"
geography[grepl("^IL", geography)] <- "Illinois"
geography[grepl("^NY", geography)] <- "New York"
geography[grepl("^PA", geography)] <- "Pennsylvania"
vaccine_data_filtered$Geography <- geography

# Build the age-level summary: one mean Estimate per vaccine, geography,
# birth cohort and age group.
clean_vaccine_data <- vaccine_data_filtered %>%
  filter(Dimension.Type == "Age") %>%
  mutate(
    # Shorten the age labels, e.g. "19 Months" -> "19m". The unit is stripped
    # the same way for days and months so that a range label such as
    # "0-3 Days" keeps its hyphen ("0-3d"); deleting every non-digit would
    # turn it into the misleading "03d".
    Age_Value = case_when(
      grepl("Months", Dimension) ~ paste0(gsub("\\s*Months.*", "", Dimension), "m"),
      grepl("Days", Dimension) ~ paste0(gsub("\\s*Days.*", "", Dimension), "d"),
      TRUE ~ Dimension
    ),
    # Entries that are not numbers become NA here (with a coercion warning)
    # and are dropped by the filter below.
    Estimate = as.numeric(as.character(Estimate))
  ) %>%
  filter(!is.na(Estimate)) %>%
  group_by(`Vaccine Type`, Geography, Birth.Year.Birth.Cohort, Age_Value) %>%
  summarize(Estimate = mean(Estimate, na.rm = TRUE), .groups = "drop")

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Estimate = as.numeric(as.character(Estimate))`.
## Caused by warning:
## ! NAs introduced by coercion

# Reshape to one row per vaccine / geography / birth cohort, with a separate
# column of mean estimates for each age group.
clean_vaccine_data_wide <- pivot_wider(
  data = clean_vaccine_data,
  id_cols = all_of(c("Vaccine Type", "Geography", "Birth.Year.Birth.Cohort")),
  names_from = Age_Value,
  values_from = Estimate
)

Data set 2

# Read the second data set (a CSV file hosted on GitHub) into a data frame.
vaccine_data_2 <- read.csv(
  file = "https://raw.githubusercontent.com/mirajpatel289/Data607/refs/heads/main/Vaccination%20Coverage%2019-35month%20.csv"
)

Clean Data set 2

# Keep the five columns used in the analysis.
vaccine_data_2_filtered <- vaccine_data_2 %>%
  select(Year, Vaccination, Category, Group, Estimate)

# Map the long vaccination labels onto short names; labels not listed here
# are kept as they are.
clean_vaccine_data_2 <- mutate(
  vaccine_data_2_filtered,
  Vaccination = case_when(
    Vaccination %in% "PCV (4 doses or more)" ~ "PCV",
    Vaccination %in% "Hepatitis B (3 doses or more)" ~ "Hep B",
    Vaccination %in% c(
      "Hib (3 doses or more)",
      "Hib (primary series plus booster dose)"
    ) ~ "Hib",
    Vaccination %in% "Polio (3 doses or more)" ~ "Polio",
    Vaccination %in% "DTP/DT/DTaP (4 doses or more)" ~ "DTaP",
    Vaccination %in% c(
      "Combined series (4:3:1:4:3:1:4)",
      "Combined series (4:3:1:3:3:1:4)",
      "Combined series (4:3:1:3:3:1)"
    ) ~ "Combined Series",
    TRUE ~ Vaccination
  )
)

Graphs

# Data set 1: average estimate per vaccine type and birth year/cohort.
summarized_data_1 <- clean_vaccine_data %>%
  group_by(across(all_of(c("Vaccine Type", "Birth.Year.Birth.Cohort")))) %>%
  summarise(
    Average_Estimate = mean(Estimate, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Tag the rows so they can be told apart once both data sets are combined.
  mutate(Dataset = "Dataset 1")

# Dodged bar chart: one bar per vaccine type within each birth year/cohort.
ggplot(
  data = summarized_data_1,
  mapping = aes(
    x = Birth.Year.Birth.Cohort,
    y = Average_Estimate,
    fill = `Vaccine Type`
  )
) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Average Vaccination Estimate by Birth Year/Cohort",
    x = "Birth Year/Cohort",
    y = "Average Estimate",
    fill = "Vaccine Type"
  )

# Heat map of the same summary; the fill colour encodes the average estimate,
# which is easier to compare than bar heights.
ggplot(
  data = summarized_data_1,
  mapping = aes(
    x = Birth.Year.Birth.Cohort,
    y = `Vaccine Type`,
    fill = Average_Estimate
  )
) +
  geom_tile() +
  theme_minimal() +
  labs(
    title = "Heatmap of Vaccination Estimates",
    x = "Birth Year/Cohort",
    y = "Vaccine Type",
    fill = "Average Estimate"
  )

# Data set 2: average estimate per year and vaccination.
summarized_data_2 <- clean_vaccine_data_2 %>%
  group_by(across(all_of(c("Year", "Vaccination")))) %>%
  summarise(
    Average_Estimate = mean(Estimate, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Add the tag and the column names used by data set 1 so the two summaries
  # can be combined later.
  mutate(
    Dataset = "Dataset 2",
    Birth.Year.Birth.Cohort = Year,
    `Vaccine Type` = Vaccination
  )

# Dodged bar chart: one bar per vaccination within each year.
ggplot(
  data = summarized_data_2,
  mapping = aes(x = Year, y = Average_Estimate, fill = Vaccination)
) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Average Vaccination Estimate by Year",
    x = "Year",
    y = "Average Estimate",
    fill = "Vaccination"
  )

# Heat map of average estimates by year and vaccination.
ggplot(
  data = summarized_data_2,
  mapping = aes(x = Year, y = Vaccination, fill = Average_Estimate)
) +
  geom_tile() +
  theme_minimal() +
  labs(
    title = "Heatmap of Vaccination Estimates",
    x = "Year",
    y = "Vaccination",
    fill = "Average Estimate"
  )

### Combine two datasets

# Store the cohort/year as character in both summaries so the shared column
# has a single type when the rows are stacked.
summarized_data_1$Birth.Year.Birth.Cohort <-
  as.character(summarized_data_1$Birth.Year.Birth.Cohort)

# Data set 2 takes its cohort/year and vaccine type from Year and Vaccination.
summarized_data_2$Birth.Year.Birth.Cohort <- as.character(summarized_data_2$Year)
summarized_data_2$`Vaccine Type` <- summarized_data_2$Vaccination

# Stack both summaries. Dataset 2 is placed first in the factor levels so its
# panel comes before Dataset 1 and the years read in order.
combined_data <- bind_rows(summarized_data_1, summarized_data_2) %>%
  mutate(Dataset = factor(Dataset, levels = c("Dataset 2", "Dataset 1")))

# Combined bar chart: one panel per data set, each with its own x axis.
ggplot(
  data = combined_data,
  mapping = aes(
    x = Birth.Year.Birth.Cohort,
    y = Average_Estimate,
    fill = `Vaccine Type`
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Dataset), scales = "free_x") +
  labs(
    title = "Average Vaccination Estimate by Year/Cohort",
    x = "Year or Birth Year/Cohort",
    y = "Average Estimate",
    fill = "Vaccine Type"
  ) +
  theme_minimal() +
  # Rotate the x-axis labels so they stay readable.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Combined heat map: one panel per data set, each with its own x axis.
ggplot(
  data = combined_data,
  mapping = aes(
    x = Birth.Year.Birth.Cohort,
    y = `Vaccine Type`,
    fill = Average_Estimate
  )
) +
  geom_tile() +
  facet_wrap(vars(Dataset), scales = "free_x") +
  labs(
    title = "Heatmap of Vaccination Estimates",
    x = "Year or Birth Year/Cohort",
    y = "Vaccine Type",
    fill = "Average Estimate"
  ) +
  theme_minimal() +
  # Rotate the x-axis labels so they stay readable.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Statistical Analysis

# Correlation for Dataset 2: test the correlation between the vaccination
# estimate and the year (cor.test() runs a Pearson test by default).
# The printed "data:" line echoes the two argument expressions, so rewording
# them changes the printed report.
correlation_dataset_2 <- cor.test(as.numeric(clean_vaccine_data_2$Estimate), as.numeric(clean_vaccine_data_2$Year))
print(correlation_dataset_2)

## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(clean_vaccine_data_2$Estimate) and as.numeric(clean_vaccine_data_2$Year)
## t = -5.568, df = 1010, p-value = 3.303e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2317320 -0.1121404
## sample estimates:
##        cor 
## -0.1725721

# Correlation for Dataset 1: test the correlation between the vaccination
# estimate and the birth year/cohort (cor.test() runs a Pearson test by
# default). The cohort column is converted to numeric for the test.
# The printed "data:" line echoes the two argument expressions, so rewording
# them changes the printed report.
correlation_dataset_1 <- cor.test(as.numeric(clean_vaccine_data$Estimate), as.numeric(clean_vaccine_data$`Birth.Year.Birth.Cohort`))
print(correlation_dataset_1)

## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(clean_vaccine_data$Estimate) and as.numeric(clean_vaccine_data$Birth.Year.Birth.Cohort)
## t = 6.8852, df = 36981, p-value = 5.861e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02559843 0.04595588
## sample estimates:
##        cor 
## 0.03578087

# Scatter plot of estimate against year for data set 2, with a fitted linear
# trend line and the correlation coefficient shown in the caption.
ggplot(
  data = clean_vaccine_data_2,
  mapping = aes(x = Year, y = Estimate)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  theme_minimal() +
  labs(
    title = "Vaccination Estimate vs. Year (Dataset 2)",
    x = "Year",
    y = "Estimate",
    caption = paste(
      "Correlation Coefficient:",
      round(correlation_dataset_2[["estimate"]], 2)
    )
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 104 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 104 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Convert the birth year/cohort to numeric so the linear trend line can be
# fitted against it.
clean_vaccine_data <- clean_vaccine_data %>%
  mutate(Birth.Year.Birth.Cohort = as.numeric(Birth.Year.Birth.Cohort))

# Scatter plot of estimate against birth year/cohort for data set 1, with a
# fitted linear trend line and the correlation coefficient in the caption.
ggplot(
  data = clean_vaccine_data,
  mapping = aes(x = Birth.Year.Birth.Cohort, y = Estimate)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  theme_minimal() +
  labs(
    title = "Vaccination Estimate vs. Birth Year/Cohort (Dataset 1)",
    x = "Birth Year/Cohort",
    y = "Estimate",
    caption = paste(
      "Correlation Coefficient:",
      round(correlation_dataset_1[["estimate"]], 3)
    )
  )

## `geom_smooth()` using formula = 'y ~ x'

Data set 1: There is a statistically significant but very weak positive correlation (r ≈ 0.04) between vaccination estimates and birth year/birth cohort from 2011 to 2021. This suggests that vaccination coverage increased slightly with each successive birth year, although the association is very weak.

Data set 2: There is a statistically significant but weak negative correlation (r ≈ -0.17) between vaccination estimates and year from 1995 to 2009. This suggests that vaccination coverage may have decreased slightly with each successive year, although the association is quite weak.

DATA607 Final Project Code

Miraj Patel

2025-05-11