Based off your respective group work, create a visualization that tells me the number of cases by case status, (ethnic group, case identification, or village) and sex.
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
dir <- "C:\\Users\\micha\\OneDrive\\Documents\\Rscript\\MA500 Homework\\cajigalm_hw4"
clean_vill <- read_csv(paste0(dir, "\\cleaned_cdc_village.csv"))
## Rows: 19668 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Village, new_vill
## dbl (1): id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dphss_covid <- read_csv(paste0(dir, "\\dphss_covid.csv"))
## Rows: 19668 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, Case.Status, Travel.location, Symptomatic?, PCR.Lab.Name
## dbl (9): age_calc, Community, Household, Healthcare, Workplace, Travel.14.d...
## date (4): Symptom.Onset.Date, Specimen.Collected.Date, PCR.Lab.Reported.Date...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Merging datasets by id.
dphss_covid_clean_new_vill <- left_join(dphss_covid, clean_vill, by = "id")
bar1 <- ggplot(dphss_covid_clean_new_vill, aes(x = new_vill, fill = Sex)) +
geom_bar(position = "dodge") +
labs(x = "Gender",
y = "Count",
title = "Number of Cases by Village and Sex")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(bar1)
# Faceted by sex to see the distribution better.
bar1_alt <- ggplot(dphss_covid_clean_new_vill, aes(x = new_vill, fill = Sex)) +
geom_bar() +
labs(x = "Village",
y = "Count",
title = "Number of Cases by Village and Sex") +
facet_wrap(~ Sex, ncol = 1) + # Create one facet for each gender
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(bar1_alt)
Merge the cleaned columns for ethnicity, village, and case identification from our previous class exercise with the dphss_covid.csv dataset, and then summarize the data.
# Setting directory and reading in all clean data sets.
dir <- "C:\\Users\\micha\\OneDrive\\Documents\\Rscript\\MA500 Homework\\cajigalm_hw4"
clean_vill <- read_csv(paste0(dir, "\\cleaned_cdc_village.csv"))
## Rows: 19668 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Village, new_vill
## dbl (1): id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clean_ethnic <- read_csv(paste0(dir, "\\cdc_ethnicity_clean.csv"))
## Rows: 19668 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Race.Ethnicity, Sex, Case.Status, Ethnicity
## dbl (1): id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clean_ident <- read_csv(paste0(dir, "\\case_ident_clean.csv"))
## Rows: 19668 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id by, clean_id
## dbl (1): id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dphss_covid <- read_csv(paste0(dir, "\\dphss_covid.csv"))
## Rows: 19668 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, Case.Status, Travel.location, Symptomatic?, PCR.Lab.Name
## dbl (9): age_calc, Community, Household, Healthcare, Workplace, Travel.14.d...
## date (4): Symptom.Onset.Date, Specimen.Collected.Date, PCR.Lab.Reported.Date...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Merging datasets by id.
dphss_covid_clean <- left_join(dphss_covid, clean_vill, by = "id")
dphss_covid_clean <- left_join(dphss_covid_clean, clean_ethnic, by = "id")
dphss_covid_clean <- left_join(dphss_covid_clean, clean_ident, by = "id")
# Removing duplicate columns as a result of merging data sets.
dphss_covid_clean <- subset(dphss_covid_clean, select = -Sex.y)
dphss_covid_clean <- subset(dphss_covid_clean, select = -Case.Status.y)
# Changing duplicate column names to their original names.
colnames(dphss_covid_clean)[2] <- "Sex"
colnames(dphss_covid_clean)[3] <- "Case.Status"
# Factoring categorical variables to summarize frequencies for each group.
dphss_covid_clean[, c("Sex", "Case.Status", "Travel.location", "Symptomatic?", "PCR.Lab.Name", "new_vill", "Ethnicity", "clean_id")] <- lapply(dphss_covid_clean[, c("Sex", "Case.Status", "Travel.location", "Symptomatic?", "PCR.Lab.Name", "new_vill","Ethnicity", "clean_id")], factor)
# Summarizing data set.
summary(dphss_covid_clean)
## age_calc Sex Case.Status Community
## Min. : 0.00 Female: 8853 Confirmed:15393 Min. :0.000
## 1st Qu.:21.00 Male :10815 Probable : 4275 1st Qu.:0.000
## Median :33.00 Median :0.000
## Mean :34.49 Mean :0.346
## 3rd Qu.:48.00 3rd Qu.:1.000
## Max. :98.00 Max. :1.000
## NA's :14117
## Household Healthcare Workplace Travel.location
## Min. :0.00 Min. :0.000 Min. :0.000 Continental US: 546
## 1st Qu.:0.00 1st Qu.:0.000 1st Qu.:0.000 HAWAII : 83
## Median :1.00 Median :0.000 Median :0.000 PHILIPPINES : 73
## Mean :0.62 Mean :0.023 Mean :0.209 Hawaii : 19
## 3rd Qu.:1.00 3rd Qu.:0.000 3rd Qu.:0.000 JAPAN : 12
## Max. :1.00 Max. :1.000 Max. :1.000 (Other) : 45
## NA's :11454 NA's :15091 NA's :9195 NA's :18890
## Travel.14.days.before.onset? Symptomatic? Symptom.Onset.Date
## Min. :0.00000 No :8422 Min. :2020-03-01
## 1st Qu.:0.00000 Unknown: 238 1st Qu.:2020-10-13
## Median :0.00000 Yes :8992 Median :2021-08-02
## Mean :0.03956 NA's :2016 Mean :2021-04-11
## 3rd Qu.:0.00000 3rd Qu.:2021-09-22
## Max. :1.00000 Max. :2022-07-25
## NA's :10676
## Specimen.Collected.Date PCR.Lab.Name
## Min. :2020-03-14 GPHL :10196
## 1st Qu.:2020-11-04 DLS : 1725
## Median :2021-08-25 USNH : 1595
## Mean :2021-05-08 Guam Radiology Consultants: 1389
## 3rd Qu.:2021-09-28 GRMC : 1003
## Max. :2021-12-31 GMHA : 973
## NA's :95 (Other) : 2787
## PCR.Lab.Reported.Date Death Recovered Date.of.recovery
## Min. :2020-03-15 Min. :0.00000 Min. :1 Min. :2020-03-27
## 1st Qu.:2020-11-04 1st Qu.:0.00000 1st Qu.:1 1st Qu.:2020-11-20
## Median :2021-08-25 Median :0.00000 Median :1 Median :2021-08-31
## Mean :2021-05-07 Mean :0.01413 Mean :1 Mean :2021-05-19
## 3rd Qu.:2021-09-28 3rd Qu.:0.00000 3rd Qu.:1 3rd Qu.:2021-10-07
## Max. :2021-12-31 Max. :1.00000 Max. :1 Max. :2022-07-30
## NA's :278 NA's :278
## id Village new_vill
## Min. : 20201 Length:19668 DEDEDO :5416
## 1st Qu.: 20204918 Class :character YIGO :2738
## Median : 20209834 Mode :character TAMUNING-TUMON-HARMON:2336
## Mean :108652015 MANGILAO :1819
## 3rd Qu.:202014751 BARRIGADA :1235
## Max. :202055406 UNKNOWN : 996
## (Other) :5128
## Race.Ethnicity Ethnicity id by
## Length:19668 Chamorro :6809 Length:19668
## Class :character Filipino :4560 Class :character
## Mode :character Chuukese :1660 Mode :character
## Other Pacific Islander:1441
## Other :1394
## Mixed :1304
## (Other) :2500
## clean_id
## Clinical evaluation : 1998
## Contact tracing : 6032
## Multiple identifications: 13
## Other : 86
## Routine surveillance :10266
## Unknown : 1273
## Calculate the average number of days between symptom onset (Symptom Onset Date) and a positive test result (PCR Lab Reported Date) using the difftime() function.
# Using difftime() to get the difference in days between symptom date and reported positive test date.
day_diff_symptom_positive <- difftime(dphss_covid_clean$PCR.Lab.Reported.Date,
dphss_covid_clean$Symptom.Onset.Date,
units = "days")
# print(dayday_diff_symptom_positive)
# Gets the average number of days between symptom data and reported positive test date.
mean(day_diff_symptom_positive, na.rm = TRUE)
## Time difference of 3.138234 days
• Convert the resulting column into a numeric variable with the as.numeric() function.
dphss_covid_clean$day_diff_symptom_positive <- as.numeric(day_diff_symptom_positive)
• Examine this new variable for any outliers or missing values. Assess whether any data points should be excluded from further analysis.
# Doing different stuff to see if there are any outliers:
# Running summary of difference in days between symptom date and reported positive test date.
summary(dphss_covid_clean$day_diff_symptom_positive)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -360.000 1.000 2.000 3.138 4.000 367.000 10676
# Checing histogram between symptom date and reported positive test date.
histogram_day_diff <- ggplot(dphss_covid_clean, aes(x=dphss_covid_clean$day_diff_symptom_positive)) + geom_histogram()
print(histogram_day_diff)
## Warning: Use of `dphss_covid_clean$day_diff_symptom_positive` is discouraged.
## ℹ Use `day_diff_symptom_positive` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 10676 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Checking boxplot between symptom date and reported positive test date.
boxplot(dphss_covid_clean$day_diff_symptom_positive)
# detect_and_remove_outliers <- function(df) {
#
# # Create a new column to mark outliers
# df$is_outlier <- FALSE
#
# # Loop through each numeric column to detect outliers
# for (col in colnames(df)) {
# if (is.numeric(df[[col]])) {
# # Calculate IQR for the column
# Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
# Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
# IQR_value <- Q3 - Q1
#
# lower_bound <- Q1 - 1.5 * IQR_value
# upper_bound <- Q3 + 1.5 * IQR_value
#
# # Identify outliers
# df$is_outlier <- df$is_outlier | (df[[col]] <= lower_bound |
# df[[col]] >= upper_bound)
# }
# }
#
# # Remove rows that are outliers
# df_removed <- df[!df$is_outlier, ]
#
# # Drop the is_outlier column
# df_removed$is_outlier <- NULL
#
# return(df_removed)
# }
#
# # percentile way - may be useful for highly skewed data as
# # IQR might miss values on one tail better for large
# # datasets flexible thresholds
# detect_and_remove_outliers_percentile <- function(df) {
#
# # Create a new column to mark outliers
# df$is_outlier <- FALSE
#
# # Loop through each numeric column to detect outliers
# for (col in colnames(df)) {
# if (is.numeric(df[[col]])) {
# # Calculate IQR for the column
# lower_bound <- quantile(df[[col]], 0.05, na.rm = TRUE)
# upper_bound <- quantile(df[[col]], 0.95, na.rm = TRUE)
#
# # Identify outliers
# df$is_outlier <- df$is_outlier | (df[[col]] <= lower_bound |
# df[[col]] >= upper_bound)
# }
# }
#
# # Remove rows that are outliers
# df_removed <- df[!df$is_outlier, ]
#
# # Drop the is_outlier column
# df_removed$is_outlier <- NULL
#
# return(df_removed)
# }
# # Alternate IQR-based outlliers function that doesnt remove rows but replaces it with NA:
# I went for the alternative of replacing values with NA as oppose to removing the values due to having errors when trying to clean data set as a result of mismatch number of rows.
# detect_and_replace_outliers <- function(df) {
#
# # Loop through each numeric column to detect outliers
# for (col in colnames(df)) {
# if (is.numeric(df[[col]])) {
# # Calculate IQR for the column
# Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
# Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
# IQR_value <- Q3 - Q1
#
# lower_bound <- Q1 - 1.5 * IQR_value
# upper_bound <- Q3 + 1.5 * IQR_value
#
# # Replace outliers with NA
# df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
# }
# }
#
# return(df)
# }
# I went with the percentile-based outlier detection as IQR may leave out some outliers.
# Alternatively, for percentile-based outlier detection:
detect_and_replace_outliers_percentile <- function(df) {
# Loop through each numeric column to detect outliers
for (col in colnames(df)) {
if (is.numeric(df[[col]])) {
# Calculate the 5th and 95th percentile bounds
lower_bound <- quantile(df[[col]], 0.05, na.rm = TRUE)
upper_bound <- quantile(df[[col]], 0.95, na.rm = TRUE)
# Replace outliers with NA
df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
}
}
return(df)
}
# Concerting the difference in days between symptom date and report of positive test result date into dataframe to use outlier function.
free_day_diff <- data.frame(dphss_covid_clean$day_diff_symptom_positive)
# Used detect_and_replace_outliers_percentile() function to detect and replace outliers with NA based off percentile.
free_day_diff <- detect_and_replace_outliers_percentile(free_day_diff)
# Running summary once again to see significant changes in mean and values.
summary(free_day_diff)
## dphss_covid_clean.day_diff_symptom_positive
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 2.000
## Mean : 2.713
## 3rd Qu.: 4.000
## Max. :10.000
## NA's :11371
# Adding this new column into original clean data set.
dphss_covid_clean$free_day_diff <- free_day_diff$dphss_covid_clean.day_diff_symptom_positive
# Questions I have: 1) Is it possible for symptom onset date to come after a positive test result? Hence, this would explain the negative values? 2) Should exclude all extreme values as it may be impractical to have waited 300 days to get a positive result?
• Create a histogram to visualize the distribution of these days. Does the average number of days vary between months?
# Questions I have: 1) Are we creating the visualization for the original diff. in days or the one where we left out any outliers? 2) How do we get month?
# Creates basic histogram to visualize the distribution of these days.
hist1 <- ggplot(data = dphss_covid_clean, aes(x = free_day_diff)) +
geom_histogram(, binwidth = 1, color = "black", fill = "white") +
labs(title = "Distribution on Days between Symptom Onset and a Positive Test Result",
x = "Number of Days",
y = "Frequency")
print(hist1)
## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Convert to date format in order to get month.
dphss_covid_clean$month_full <- as.Date(dphss_covid_clean$PCR.Lab.Reported.Date, format = "%Y-%m-%d")
dphss_covid_clean$month_full <- format(dphss_covid_clean$month_full, "%B")
# Factoring month by correct order for visualizations
dphss_covid_clean$month_full <- factor(dphss_covid_clean$month_full, levels = c("January", "February", "March", "April", "May", "June","July", "August", "September", "October", "November", "December"))
# Created a plot density to further investigate if the average number of days vary between months.
free_dens1 <- ggplot(data = dphss_covid_clean, aes(x = free_day_diff)) +
geom_histogram(aes(y = after_stat(density)), binwidth = 2,
color = "black", fill = "white") + geom_density(aes(color = month_full,
fill = month_full), alpha = 0.2)
print(free_dens1)
## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_density()`).
# It was a bit difficult to see the variance within the plot density, so I have created boxplots across the different months to view distribution of data which gives insight on mean
box1 <- ggplot(dphss_covid_clean, aes(x = month_full, y = free_day_diff)) +
geom_boxplot(aes(fill = factor(month_full))) +
labs(title = "Box Plot of Days between Symptom Onset and a Positive Test Result",
x = "Number of Cylinders", y = "Miles Per Gallon",
fill = "Number of Cylinders") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(box1)
## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Still wasn't sure about the average number of days varying between months in previous graphs, so I just made a bar graph of the average days per month by first getting the average days per month and found this was the best way to see if the avergae number of days vary between months.
avg_days_per_month <- dphss_covid_clean %>%
group_by(month_full) %>%
summarize(free_day_diff = mean(free_day_diff, na.rm = TRUE))
barplot_gg1 <- ggplot(avg_days_per_month, aes(x = month_full, y = free_day_diff, fill = month_full)) +
geom_bar(stat = "identity") +
labs(title = "Average Days between Symptom Onset and a Positive Test Result by Month",
x = "Month",
y = "Average Days",
fill = "Month") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
print(barplot_gg1)
# # I am further going to see if the average number of days between symptom onset date and positive test date varies using an ANOVA test which will give me further insight if the difference in average days within a month is significant.
# sum(is.na(avg_days_per_month$free_day_diff))
# sum(is.na(avg_days_per_month$month_full))
#
# avg_days_per_month$month_full <- as.factor(avg_days_per_month$month_full)
# avg_days_per_month$free_day_diff <- as.numeric(avg_days_per_month$free_day_diff)
#
# # Running the ANOVA test.
# aov(free_day_diff ~ month_full, data = avg_days_per_month)
# # Unfortunately it doesnt give the p-value.
Calculate the average number of days for patients to recover from COVID-19 after receiving a positive result.
# Using difftime() to get the difference in days between reported positive test date and recovery date.
day_diff_positive_recover <- difftime(dphss_covid_clean$Date.of.recovery, dphss_covid_clean$PCR.Lab.Reported.Date, units = "days")
# print(dayday_diff_symptom_positive)
# Gets the average number of days between positive test result and recovery date.
mean(day_diff_positive_recover, na.rm = TRUE)
## Time difference of 11.14007 days
• Convert the resulting column into a numeric variable with the as.numeric() function.
dphss_covid_clean$day_diff_positive_recover <- as.numeric(day_diff_positive_recover)
• Examine this new variable for any outliers or missing values. Assess
whether any data points should be excluded from further analysis.
summary(dphss_covid_clean$day_diff_positive_recover)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -355.00 10.00 10.00 11.14 12.00 365.00 278
histogram_day_diff_positive_recover <- ggplot(dphss_covid_clean, aes(x=dphss_covid_clean$day_diff_positive_recover)) + geom_histogram()
print(histogram_day_diff_positive_recover)
## Warning: Use of `dphss_covid_clean$day_diff_positive_recover` is discouraged.
## ℹ Use `day_diff_positive_recover` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_bin()`).
boxplot(dphss_covid_clean$day_diff_positive_recover)
# # Alternate outlliers function that doesnt remove rows but replaces it with NA:
# detect_and_replace_outliers <- function(df) {
#
# # Loop through each numeric column to detect outliers
# for (col in colnames(df)) {
# if (is.numeric(df[[col]])) {
# # Calculate IQR for the column
# Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
# Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
# IQR_value <- Q3 - Q1
#
# lower_bound <- Q1 - 1.5 * IQR_value
# upper_bound <- Q3 + 1.5 * IQR_value
#
# # Replace outliers with NA
# df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
# }
# }
#
# return(df)
# }
# I actually used the percentile-based outlier detection.
# Alternatively, for percentile-based outlier detection:
detect_and_replace_outliers_percentile <- function(df) {
# Loop through each numeric column to detect outliers
for (col in colnames(df)) {
if (is.numeric(df[[col]])) {
# Calculate the 5th and 95th percentile bounds
lower_bound <- quantile(df[[col]], 0.05, na.rm = TRUE)
upper_bound <- quantile(df[[col]], 0.95, na.rm = TRUE)
# Replace outliers with NA
df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
}
}
return(df)
}
free_day_diff_positive_recover <- data.frame(dphss_covid_clean$day_diff_positive_recover)
free_day_diff_positive_recover <- detect_and_replace_outliers_percentile(free_day_diff_positive_recover)
summary(free_day_diff_positive_recover)
## dphss_covid_clean.day_diff_positive_recover
## Min. : 0.00
## 1st Qu.:10.00
## Median :10.00
## Mean :10.23
## 3rd Qu.:12.00
## Max. :19.00
## NA's :1229
dphss_covid_clean$free_day_diff_positive_recover <- free_day_diff_positive_recover$dphss_covid_clean.day_diff_positive_recover
• Plot the distribution of these recovery times and create a visualization comparing the average recovery time across different ethnicities.
# Creates basic histogram to visualize the distribution of these days.
hist2 <- ggplot(data = dphss_covid_clean, aes(x = free_day_diff_positive_recover)) +
geom_histogram(aes(y = after_stat(density)), binwidth = 1, color = "black", fill = "white")+
labs(title = "Distribution on Days between a Positive Test Result and Recovery",
x = "Number of Days",
y = "Frequency")
print(hist2)
## Warning: Removed 1229 rows containing non-finite outside the scale range
## (`stat_bin()`).
#
avg_days_per_ethnic <- dphss_covid_clean %>%
group_by(Ethnicity) %>%
summarize(free_day_diff_positive_recover = mean(free_day_diff_positive_recover, na.rm = TRUE))
barplot_gg2 <- ggplot(avg_days_per_ethnic, aes(x = Ethnicity, y = free_day_diff_positive_recover, fill = Ethnicity)) +
geom_bar(stat = "identity") +
labs(title = "Average Days between a Positive Test Result and Recovery Date by Month",
x = "Month",
y = "Average Days",
fill = "Ethncicities") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5))
print(barplot_gg2)
Plot the number of case identifications over time, segmented by month-year intervals. Analyze and discuss any patterns or trends you observe.
# Convert to date format in order to get month and year.
dphss_covid_clean$month_year_full <- as.Date(dphss_covid_clean$PCR.Lab.Reported.Date, format = "%Y-%m-%d")
# Aggregating to get the count of cases for each unique month-year and case id.
case_counts <- dphss_covid_clean %>%
group_by(month_year_full, clean_id) %>%
summarize(count = n(), .groups = 'drop')
# Putting the date in order for graph to be depicted correctly.
case_counts <- case_counts %>%
arrange(month_year_full)
bar_plot <- ggplot(case_counts, aes(format(month_year_full, "%B %Y"), y = count, fill = clean_id)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Month-Year", y = "Count of Case Identifications",
title = "Counts of Case Identifications by Month-Year") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_discrete(name = "Case Identification") +
scale_x_discrete(limits = unique(format(case_counts$month_year_full, "%B %Y")))
print(bar_plot)
One-hot encode the variables for sex, ethnicity (using the cleaned ethnicity data from our previous group work), and case identification. This will involve creating a binary column for each unique value in these categories, where each row indicates whether it matches that specific value. Here’s a visual
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# First I go the new dataframe with just thedesired variables to one-hot encode.
dphss_covid_clean_vill_sex_ci <- dphss_covid_clean %>%
select(id, new_vill, Ethnicity, clean_id)
dphss_covid_clean_vill_sex_ci
## # A tibble: 19,668 × 4
## id new_vill Ethnicity clean_id
## <dbl> <fct> <fct> <fct>
## 1 20201 DEDEDO Filipino Clinical evaluation
## 2 20202 DEDEDO Filipino Clinical evaluation
## 3 20203 CHALAN PAGO-ORDOT Chamorro Clinical evaluation
## 4 20204 DEDEDO Filipino Contact tracing
## 5 20205 TALOFOFO Chamorro Routine surveillance
## 6 20206 CHALAN PAGO-ORDOT Chamorro Contact tracing
## 7 20207 DEDEDO Filipino Routine surveillance
## 8 20208 DEDEDO Filipino Clinical evaluation
## 9 20209 TAMUNING-TUMON-HARMON Filipino Contact tracing
## 10 202010 TALOFOFO Chamorro Contact tracing
## # ℹ 19,658 more rows
# I used the caret function to one-hot encode all desired variables (which were the data the class cleaned).
dummy_var <- dummyVars(" ~ .", data = dphss_covid_clean_vill_sex_ci)
one_hot_encoded_dphss_covid_clean_vill_sex_ci <- data.frame(predict(dummy_var, newdata = dphss_covid_clean_vill_sex_ci))
# one_hot_encoded_dphss_covid_clean_vill_sex_ciConduct a Chi-square Goodness-of-Fit test to determine whether the distribution of COVID-19 cases by village in 2020 is statistically significant. The observed values will be the number of cases by village from the dataset, and the expected values should be calculated using village populations (village population/total population * total COVID-19 cases in 2020). You can find the 2020 population data here: Census of Guam 2020. https://bsp.guam.gov/census-of-guam/ To perform a chi-squared test in R, you can use the chisq.test() function. For a goodness of fit test, you can specify into the functions your observed and expected values. Interpret and discuss your findings.
# Getting the counts for each village using the cleaned data.
vill_count <- table(dphss_covid_clean$new_vill)
print(vill_count)
##
## AGANA HEIGHTS AGAT ASAN-MAINA
## 228 397 136
## BARRIGADA CHALAN PAGO-ORDOT DEDEDO
## 1235 558 5416
## HAGATNA INARAJAN MANGILAO
## 279 225 1819
## MERIZO MONGMONG-TOTO-MAITE OOJ
## 106 654 115
## PITI SANTA RITA-SUMAI SINAJANA
## 179 866 285
## TALOFOFO TAMUNING-TUMON-HARMON UMATAC
## 350 2336 83
## UNKNOWN YIGO YONA
## 996 2738 667
# Making a dataframe out of the total counts and villages.
vill_count_df <- as.data.frame(vill_count)
colnames(vill_count_df) <- c("village", "count")
print(vill_count_df)
## village count
## 1 AGANA HEIGHTS 228
## 2 AGAT 397
## 3 ASAN-MAINA 136
## 4 BARRIGADA 1235
## 5 CHALAN PAGO-ORDOT 558
## 6 DEDEDO 5416
## 7 HAGATNA 279
## 8 INARAJAN 225
## 9 MANGILAO 1819
## 10 MERIZO 106
## 11 MONGMONG-TOTO-MAITE 654
## 12 OOJ 115
## 13 PITI 179
## 14 SANTA RITA-SUMAI 866
## 15 SINAJANA 285
## 16 TALOFOFO 350
## 17 TAMUNING-TUMON-HARMON 2336
## 18 UMATAC 83
## 19 UNKNOWN 996
## 20 YIGO 2738
## 21 YONA 667
# Removing OOJ and UNKNOWN rows to avoid row mismatch for calculations and chi square.
vill_count_df <- subset(vill_count_df, !(village %in% c("OOJ", "UNKNOWN")))
print(vill_count_df)
## village count
## 1 AGANA HEIGHTS 228
## 2 AGAT 397
## 3 ASAN-MAINA 136
## 4 BARRIGADA 1235
## 5 CHALAN PAGO-ORDOT 558
## 6 DEDEDO 5416
## 7 HAGATNA 279
## 8 INARAJAN 225
## 9 MANGILAO 1819
## 10 MERIZO 106
## 11 MONGMONG-TOTO-MAITE 654
## 13 PITI 179
## 14 SANTA RITA-SUMAI 866
## 15 SINAJANA 285
## 16 TALOFOFO 350
## 17 TAMUNING-TUMON-HARMON 2336
## 18 UMATAC 83
## 20 YIGO 2738
## 21 YONA 667
vill_pop_2020 <- c(3673, 4515, 2011, 7956, 7064, 44908, 943, 2317, 13476, 1604, 6380, 1585, 6470, 2611, 3550, 18489, 647, 19339, 6298)
# Calculating expected values.
vill_pop_2020_exp <- (vill_pop_2020/sum(vill_pop_2020))* sum(vill_count_df$count)
print(vill_pop_2020_exp)
## [1] 443.06834 544.63750 242.58384 959.72004 852.11945 5417.18295
## [7] 113.75264 279.49615 1625.58915 193.48805 769.60958 191.19611
## [13] 780.46615 314.96091 428.23104 2230.29962 78.04661 2332.83382
## [19] 759.71805
chi_square <- chisq.test(x = vill_count_df$count, p = (vill_pop_2020_exp/sum(vill_pop_2020_exp)))
print(chi_square)
##
## Chi-squared test for given probabilities
##
## data: vill_count_df$count
## X-squared = 816.64, df = 18, p-value < 2.2e-16