Homework 4:

Problems

Based off your respective group work, create a visualization that tells me the number of cases by case status, (ethnic group, case identification, or village) and sex.

library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(psych)

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

dir <- "C:\\Users\\micha\\OneDrive\\Documents\\Rscript\\MA500 Homework\\cajigalm_hw4"
clean_vill <- read_csv(paste0(dir, "\\cleaned_cdc_village.csv"))

## Rows: 19668 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Village, new_vill
## dbl (1): id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dphss_covid <- read_csv(paste0(dir, "\\dphss_covid.csv"))

## Rows: 19668 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Sex, Case.Status, Travel.location, Symptomatic?, PCR.Lab.Name
## dbl  (9): age_calc, Community, Household, Healthcare, Workplace, Travel.14.d...
## date (4): Symptom.Onset.Date, Specimen.Collected.Date, PCR.Lab.Reported.Date...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Merging datasets by id.
dphss_covid_clean_new_vill <- left_join(dphss_covid, clean_vill, by = "id")

bar1 <- ggplot(dphss_covid_clean_new_vill, aes(x = new_vill, fill = Sex)) + 
  geom_bar(position = "dodge") +
  labs(x = "Gender", 
       y = "Count",
       title =  "Number of Cases by Village and Sex")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(bar1)

# Faceted by sex to see the distribution better.
bar1_alt <- ggplot(dphss_covid_clean_new_vill, aes(x = new_vill, fill = Sex)) + 
  geom_bar() +
  labs(x = "Village", 
       y = "Count",
       title = "Number of Cases by Village and Sex") +
  facet_wrap(~ Sex, ncol = 1) +  # Create one facet for each gender
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(bar1_alt)

Merge the cleaned columns for ethnicity, village, and case identification from our previous class exercise with the dphss_covid.csv dataset, and then summarize the data.

# Setting directory and reading in all clean data sets.
dir <- "C:\\Users\\micha\\OneDrive\\Documents\\Rscript\\MA500 Homework\\cajigalm_hw4"
clean_vill <- read_csv(paste0(dir, "\\cleaned_cdc_village.csv"))

## Rows: 19668 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Village, new_vill
## dbl (1): id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

clean_ethnic <- read_csv(paste0(dir, "\\cdc_ethnicity_clean.csv"))

## Rows: 19668 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Race.Ethnicity, Sex, Case.Status, Ethnicity
## dbl (1): id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

clean_ident <- read_csv(paste0(dir, "\\case_ident_clean.csv"))

## Rows: 19668 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id by, clean_id
## dbl (1): id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dphss_covid <- read_csv(paste0(dir, "\\dphss_covid.csv"))

## Rows: 19668 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Sex, Case.Status, Travel.location, Symptomatic?, PCR.Lab.Name
## dbl  (9): age_calc, Community, Household, Healthcare, Workplace, Travel.14.d...
## date (4): Symptom.Onset.Date, Specimen.Collected.Date, PCR.Lab.Reported.Date...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Merging datasets by id.
dphss_covid_clean <- left_join(dphss_covid, clean_vill, by = "id")
dphss_covid_clean <- left_join(dphss_covid_clean, clean_ethnic, by = "id")
dphss_covid_clean <- left_join(dphss_covid_clean, clean_ident, by = "id")

# Removing duplicate columns as a result of merging data sets.
dphss_covid_clean <- subset(dphss_covid_clean, select = -Sex.y)
dphss_covid_clean <- subset(dphss_covid_clean, select = -Case.Status.y)

# Changing duplicate column names to their original names.
colnames(dphss_covid_clean)[2] <- "Sex"
colnames(dphss_covid_clean)[3] <- "Case.Status"

# Factoring categorical variables to summarize frequencies for each group.
dphss_covid_clean[, c("Sex", "Case.Status", "Travel.location", "Symptomatic?", "PCR.Lab.Name", "new_vill", "Ethnicity", "clean_id")] <- lapply(dphss_covid_clean[, c("Sex", "Case.Status", "Travel.location", "Symptomatic?", "PCR.Lab.Name", "new_vill","Ethnicity", "clean_id")], factor) 

# Summarizing data set.
summary(dphss_covid_clean)

##     age_calc         Sex           Case.Status      Community    
##  Min.   : 0.00   Female: 8853   Confirmed:15393   Min.   :0.000  
##  1st Qu.:21.00   Male  :10815   Probable : 4275   1st Qu.:0.000  
##  Median :33.00                                    Median :0.000  
##  Mean   :34.49                                    Mean   :0.346  
##  3rd Qu.:48.00                                    3rd Qu.:1.000  
##  Max.   :98.00                                    Max.   :1.000  
##                                                   NA's   :14117  
##    Household       Healthcare      Workplace           Travel.location 
##  Min.   :0.00    Min.   :0.000   Min.   :0.000   Continental US:  546  
##  1st Qu.:0.00    1st Qu.:0.000   1st Qu.:0.000   HAWAII        :   83  
##  Median :1.00    Median :0.000   Median :0.000   PHILIPPINES   :   73  
##  Mean   :0.62    Mean   :0.023   Mean   :0.209   Hawaii        :   19  
##  3rd Qu.:1.00    3rd Qu.:0.000   3rd Qu.:0.000   JAPAN         :   12  
##  Max.   :1.00    Max.   :1.000   Max.   :1.000   (Other)       :   45  
##  NA's   :11454   NA's   :15091   NA's   :9195    NA's          :18890  
##  Travel.14.days.before.onset?  Symptomatic?  Symptom.Onset.Date  
##  Min.   :0.00000              No     :8422   Min.   :2020-03-01  
##  1st Qu.:0.00000              Unknown: 238   1st Qu.:2020-10-13  
##  Median :0.00000              Yes    :8992   Median :2021-08-02  
##  Mean   :0.03956              NA's   :2016   Mean   :2021-04-11  
##  3rd Qu.:0.00000                             3rd Qu.:2021-09-22  
##  Max.   :1.00000                             Max.   :2022-07-25  
##                                              NA's   :10676       
##  Specimen.Collected.Date                     PCR.Lab.Name  
##  Min.   :2020-03-14      GPHL                      :10196  
##  1st Qu.:2020-11-04      DLS                       : 1725  
##  Median :2021-08-25      USNH                      : 1595  
##  Mean   :2021-05-08      Guam Radiology Consultants: 1389  
##  3rd Qu.:2021-09-28      GRMC                      : 1003  
##  Max.   :2021-12-31      GMHA                      :  973  
##  NA's   :95              (Other)                   : 2787  
##  PCR.Lab.Reported.Date     Death           Recovered   Date.of.recovery    
##  Min.   :2020-03-15    Min.   :0.00000   Min.   :1     Min.   :2020-03-27  
##  1st Qu.:2020-11-04    1st Qu.:0.00000   1st Qu.:1     1st Qu.:2020-11-20  
##  Median :2021-08-25    Median :0.00000   Median :1     Median :2021-08-31  
##  Mean   :2021-05-07    Mean   :0.01413   Mean   :1     Mean   :2021-05-19  
##  3rd Qu.:2021-09-28    3rd Qu.:0.00000   3rd Qu.:1     3rd Qu.:2021-10-07  
##  Max.   :2021-12-31    Max.   :1.00000   Max.   :1     Max.   :2022-07-30  
##                                          NA's   :278   NA's   :278         
##        id              Village                           new_vill   
##  Min.   :    20201   Length:19668       DEDEDO               :5416  
##  1st Qu.: 20204918   Class :character   YIGO                 :2738  
##  Median : 20209834   Mode  :character   TAMUNING-TUMON-HARMON:2336  
##  Mean   :108652015                      MANGILAO             :1819  
##  3rd Qu.:202014751                      BARRIGADA            :1235  
##  Max.   :202055406                      UNKNOWN              : 996  
##                                         (Other)              :5128  
##  Race.Ethnicity                      Ethnicity       id by          
##  Length:19668       Chamorro              :6809   Length:19668      
##  Class :character   Filipino              :4560   Class :character  
##  Mode  :character   Chuukese              :1660   Mode  :character  
##                     Other Pacific Islander:1441                     
##                     Other                 :1394                     
##                     Mixed                 :1304                     
##                     (Other)               :2500                     
##                      clean_id    
##  Clinical evaluation     : 1998  
##  Contact tracing         : 6032  
##  Multiple identifications:   13  
##  Other                   :   86  
##  Routine surveillance    :10266  
##  Unknown                 : 1273  
##

Calculate the average number of days between symptom onset (Symptom Onset Date) and a positive test result (PCR Lab Reported Date) using the difftime() function.

# Using difftime() to get the difference in days between symptom date and reported positive test date.
day_diff_symptom_positive <- difftime(dphss_covid_clean$PCR.Lab.Reported.Date, 
                                      dphss_covid_clean$Symptom.Onset.Date, 
                                      units = "days")

# print(dayday_diff_symptom_positive)

# Gets the average number of days between symptom data and reported positive test date.
mean(day_diff_symptom_positive, na.rm = TRUE)

## Time difference of 3.138234 days

• Convert the resulting column into a numeric variable with the as.numeric() function.

dphss_covid_clean$day_diff_symptom_positive <- as.numeric(day_diff_symptom_positive)

• Examine this new variable for any outliers or missing values. Assess whether any data points should be excluded from further analysis.

# Doing different stuff to see if there are any outliers:

# Running summary of difference in days between symptom date and reported positive test date.
summary(dphss_covid_clean$day_diff_symptom_positive)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -360.000    1.000    2.000    3.138    4.000  367.000    10676

# Checing histogram between symptom date and reported positive test date.
histogram_day_diff <- ggplot(dphss_covid_clean, aes(x=dphss_covid_clean$day_diff_symptom_positive)) + geom_histogram()
print(histogram_day_diff)

## Warning: Use of `dphss_covid_clean$day_diff_symptom_positive` is discouraged.
## ℹ Use `day_diff_symptom_positive` instead.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 10676 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Checking boxplot between symptom date and reported positive test date.
boxplot(dphss_covid_clean$day_diff_symptom_positive)

From the brief analyses above, there are a couple outliers to asses. First there exist extreme values on either ends of the distribution. From the summary, there is a max value of 367 days and a min value of -360 days which seem impractical for days to be between symptom data and reported positive test date. So, I have used the detect and remove outliers function from class to handle the outliers.

# detect_and_remove_outliers <- function(df) {
# 
#     # Create a new column to mark outliers
#     df$is_outlier <- FALSE
# 
#     # Loop through each numeric column to detect outliers
#     for (col in colnames(df)) {
#         if (is.numeric(df[[col]])) {
#             # Calculate IQR for the column
#             Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
#             Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
#             IQR_value <- Q3 - Q1
# 
#             lower_bound <- Q1 - 1.5 * IQR_value
#             upper_bound <- Q3 + 1.5 * IQR_value
# 
#             # Identify outliers
#             df$is_outlier <- df$is_outlier | (df[[col]] <= lower_bound |
#                 df[[col]] >= upper_bound)
#         }
#     }
# 
#     # Remove rows that are outliers
#     df_removed <- df[!df$is_outlier, ]
# 
#     # Drop the is_outlier column
#     df_removed$is_outlier <- NULL
# 
#     return(df_removed)
# }
# 
# # percentile way - may be useful for highly skewed data as
# # IQR might miss values on one tail better for large
# # datasets flexible thresholds
# detect_and_remove_outliers_percentile <- function(df) {
# 
#     # Create a new column to mark outliers
#     df$is_outlier <- FALSE
# 
#     # Loop through each numeric column to detect outliers
#     for (col in colnames(df)) {
#         if (is.numeric(df[[col]])) {
#             # Calculate IQR for the column
#             lower_bound <- quantile(df[[col]], 0.05, na.rm = TRUE)
#             upper_bound <- quantile(df[[col]], 0.95, na.rm = TRUE)
# 
#             # Identify outliers
#             df$is_outlier <- df$is_outlier | (df[[col]] <= lower_bound |
#                 df[[col]] >= upper_bound)
#         }
#     }
# 
#     # Remove rows that are outliers
#     df_removed <- df[!df$is_outlier, ]
# 
#     # Drop the is_outlier column
#     df_removed$is_outlier <- NULL
# 
#     return(df_removed)
# }



# # Alternate IQR-based outlliers function that doesnt remove rows but replaces it with NA: 
# I went for the alternative of replacing values with NA as oppose to removing the values due to having errors when trying to clean data set as a result of mismatch number of rows.
# detect_and_replace_outliers <- function(df) {
# 
#     # Loop through each numeric column to detect outliers
#     for (col in colnames(df)) {
#         if (is.numeric(df[[col]])) {
#             # Calculate IQR for the column
#             Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
#             Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
#             IQR_value <- Q3 - Q1
# 
#             lower_bound <- Q1 - 1.5 * IQR_value
#             upper_bound <- Q3 + 1.5 * IQR_value
# 
#             # Replace outliers with NA
#             df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
#         }
#     }
# 
#     return(df)
# }

# I went with the percentile-based outlier detection as IQR may leave out some outliers.
# Alternatively, for percentile-based outlier detection:
detect_and_replace_outliers_percentile <- function(df) {

    # Loop through each numeric column to detect outliers
    for (col in colnames(df)) {
        if (is.numeric(df[[col]])) {
            # Calculate the 5th and 95th percentile bounds
            lower_bound <- quantile(df[[col]], 0.05, na.rm = TRUE)
            upper_bound <- quantile(df[[col]], 0.95, na.rm = TRUE)

            # Replace outliers with NA
            df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
        }
    }

    return(df)
}


# Concerting the difference in days between symptom date and report of positive test result date into dataframe to use outlier function.
free_day_diff <- data.frame(dphss_covid_clean$day_diff_symptom_positive)

# Used detect_and_replace_outliers_percentile() function to detect and replace outliers with NA based off percentile.
free_day_diff <- detect_and_replace_outliers_percentile(free_day_diff)

# Running summary once again to see significant changes in mean and values.
summary(free_day_diff)

##  dphss_covid_clean.day_diff_symptom_positive
##  Min.   : 0.000                             
##  1st Qu.: 1.000                             
##  Median : 2.000                             
##  Mean   : 2.713                             
##  3rd Qu.: 4.000                             
##  Max.   :10.000                             
##  NA's   :11371

# Adding this new column into original clean data set.
dphss_covid_clean$free_day_diff <- free_day_diff$dphss_covid_clean.day_diff_symptom_positive

# Questions I have: 1) Is it possible for symptom onset date to come after a positive test result? Hence, this would explain the negative values? 2) Should exclude all extreme values as it may be impractical to have waited 300 days to get a positive result?

Note that I have used an alternative function to the detect_and_remove_outliers() function as a result of different number of rows after removing outlier rows. Therefore, instead of removing the rows, I treated the outliers as NA values by replacing their values with NAs. After removing outliers and running the summary once more, we can see the mean changed from the previous summary that was ran with the outliers. It had went down from 3.138 days to 2.713 days, not a seemingly significant difference, but still probably an important change due to extreme outliers. Also, the max value is now 10 days and the minimum value is now 0 days which then begs the question: Is it plausible for symptom onset date to come after a positive test result ? (i.e. can you be diagnosed with Covid and then experience symptoms?) Hence, this would explain the negative values if plausible.

• Create a histogram to visualize the distribution of these days. Does the average number of days vary between months?

# Questions I have: 1) Are we creating the visualization for the original diff. in days or the one where we left out any outliers? 2) How do we get month?

# Creates basic histogram to visualize the distribution of these days.
hist1 <- ggplot(data = dphss_covid_clean, aes(x = free_day_diff)) + 
  geom_histogram(, binwidth = 1, color = "black", fill = "white") +
  labs(title = "Distribution on Days between Symptom Onset and a Positive Test Result", 
       x = "Number of Days", 
       y = "Frequency")
print(hist1)

## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Convert to date format in order to get month.
dphss_covid_clean$month_full <- as.Date(dphss_covid_clean$PCR.Lab.Reported.Date, format = "%Y-%m-%d")
dphss_covid_clean$month_full <- format(dphss_covid_clean$month_full, "%B")

# Factoring month by correct order for visualizations
dphss_covid_clean$month_full <- factor(dphss_covid_clean$month_full, levels = c("January", "February", "March", "April", "May", "June","July", "August", "September", "October", "November", "December"))

# Created a plot density to further investigate if the average number of days vary between months.
free_dens1 <- ggplot(data = dphss_covid_clean, aes(x = free_day_diff)) +
    geom_histogram(aes(y = after_stat(density)), binwidth = 2,
        color = "black", fill = "white") + geom_density(aes(color = month_full,
    fill = month_full), alpha = 0.2)
print(free_dens1)

## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_density()`).

# It was a bit difficult to see the variance within the plot density, so I have created boxplots across the different months to view distribution of data which gives insight on mean
box1 <- ggplot(dphss_covid_clean, aes(x = month_full, y = free_day_diff)) + 
  geom_boxplot(aes(fill = factor(month_full))) + 
  labs(title = "Box Plot of Days between Symptom Onset and a Positive Test Result", 
       x = "Number of Cylinders", y = "Miles Per Gallon", 
       fill = "Number of Cylinders") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(box1)

## Warning: Removed 11371 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Still wasn't sure about the average number of days varying between months in previous graphs, so I just made a bar graph of the average days per month by first getting the average days per month and found this was the best way to see if the avergae number of days vary between months.
avg_days_per_month <- dphss_covid_clean %>%
  group_by(month_full) %>%
  summarize(free_day_diff = mean(free_day_diff, na.rm = TRUE))

barplot_gg1 <- ggplot(avg_days_per_month, aes(x = month_full, y = free_day_diff, fill = month_full)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Days between Symptom Onset and a Positive Test Result by Month", 
       x = "Month", 
       y = "Average Days",
       fill = "Month") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
print(barplot_gg1)

# # I am further going to see if the average number of days between symptom onset date and positive test date varies using an ANOVA test which will give me further insight if the difference in average days within a month is significant. 
# sum(is.na(avg_days_per_month$free_day_diff))
# sum(is.na(avg_days_per_month$month_full))
# 
# avg_days_per_month$month_full <- as.factor(avg_days_per_month$month_full)
# avg_days_per_month$free_day_diff <- as.numeric(avg_days_per_month$free_day_diff)
# 
# # Running the ANOVA test. 
# aov(free_day_diff ~ month_full, data = avg_days_per_month)
# # Unfortunately it doesnt give the p-value.

On average, the number of days between symptom date and report of positive test result date are all ranging from around 2.5 to 4.25 days. Seemingly, there are no significance in variation of average days between symptom date and report of positive test result date and that they are all about the same with March having the highest average at around 4.25 days.

Calculate the average number of days for patients to recover from COVID-19 after receiving a positive result.

# Using difftime() to get the difference in days between reported positive test date and recovery date.
day_diff_positive_recover <- difftime(dphss_covid_clean$Date.of.recovery, dphss_covid_clean$PCR.Lab.Reported.Date, units = "days")
# print(dayday_diff_symptom_positive)

# Gets the average number of days between positive test result and recovery date.
mean(day_diff_positive_recover, na.rm = TRUE)

## Time difference of 11.14007 days

• Convert the resulting column into a numeric variable with the as.numeric() function.

dphss_covid_clean$day_diff_positive_recover <- as.numeric(day_diff_positive_recover)

• Examine this new variable for any outliers or missing values. Assess whether any data points should be excluded from further analysis.

summary(dphss_covid_clean$day_diff_positive_recover)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -355.00   10.00   10.00   11.14   12.00  365.00     278

histogram_day_diff_positive_recover <- ggplot(dphss_covid_clean, aes(x=dphss_covid_clean$day_diff_positive_recover)) + geom_histogram()
print(histogram_day_diff_positive_recover)

## Warning: Use of `dphss_covid_clean$day_diff_positive_recover` is discouraged.
## ℹ Use `day_diff_positive_recover` instead.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_bin()`).

boxplot(dphss_covid_clean$day_diff_positive_recover)

From the brief analyses above, there are a couple outliers to asses. First there exist extreme values on either ends of the distribution. From the summary, there is a max value of 365 days and a min value of -355 days which seem impractical for days to be between reported positive test date and recovery date. So, I have used the detect and remove outliers function from class to handle the outliers.

# # Alternate outlliers function that doesnt remove rows but replaces it with NA:
# detect_and_replace_outliers <- function(df) {
# 
#     # Loop through each numeric column to detect outliers
#     for (col in colnames(df)) {
#         if (is.numeric(df[[col]])) {
#             # Calculate IQR for the column
#             Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
#             Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
#             IQR_value <- Q3 - Q1
# 
#             lower_bound <- Q1 - 1.5 * IQR_value
#             upper_bound <- Q3 + 1.5 * IQR_value
# 
#             # Replace outliers with NA
#             df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
#         }
#     }
# 
#     return(df)
# }

# I actually used the percentile-based outlier detection.
# Alternatively, for percentile-based outlier detection:
detect_and_replace_outliers_percentile <- function(df) {

    # Loop through each numeric column to detect outliers
    for (col in colnames(df)) {
        if (is.numeric(df[[col]])) {
            # Calculate the 5th and 95th percentile bounds
            lower_bound <- quantile(df[[col]], 0.05, na.rm = TRUE)
            upper_bound <- quantile(df[[col]], 0.95, na.rm = TRUE)

            # Replace outliers with NA
            df[[col]][df[[col]] < lower_bound | df[[col]] > upper_bound] <- NA
        }
    }

    return(df)
}



free_day_diff_positive_recover <- data.frame(dphss_covid_clean$day_diff_positive_recover)
free_day_diff_positive_recover <- detect_and_replace_outliers_percentile(free_day_diff_positive_recover)

summary(free_day_diff_positive_recover)

##  dphss_covid_clean.day_diff_positive_recover
##  Min.   : 0.00                              
##  1st Qu.:10.00                              
##  Median :10.00                              
##  Mean   :10.23                              
##  3rd Qu.:12.00                              
##  Max.   :19.00                              
##  NA's   :1229

dphss_covid_clean$free_day_diff_positive_recover <- free_day_diff_positive_recover$dphss_covid_clean.day_diff_positive_recover

Note that I have used an alternative function to the detect_and_remove_outliers() function as a result of different number of rows after removing outlier rows. Therefore, instead of removing the rows, I treated the outliers as NA values by replacing their values with NAs. After removing outliers and running the summary once more, we can see the mean changed from the previous summary that was ran with the outliers. It had went down from 11.14 days to 10.23 days, not a seemingly significant difference, but still probably an important change due to extreme outliers. Also, the max value is now 19 days and the minimum value is now 0 days.

• Plot the distribution of these recovery times and create a visualization comparing the average recovery time across different ethnicities.

# Creates basic histogram to visualize the distribution of these days.
hist2 <- ggplot(data = dphss_covid_clean, aes(x = free_day_diff_positive_recover)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 1, color = "black", fill = "white")+
  labs(title = "Distribution on Days between a Positive Test Result and Recovery", 
       x = "Number of Days", 
       y = "Frequency")
print(hist2)

## Warning: Removed 1229 rows containing non-finite outside the scale range
## (`stat_bin()`).

# 
avg_days_per_ethnic <- dphss_covid_clean %>%
  group_by(Ethnicity) %>%
  summarize(free_day_diff_positive_recover = mean(free_day_diff_positive_recover, na.rm = TRUE))

barplot_gg2 <- ggplot(avg_days_per_ethnic, aes(x = Ethnicity, y = free_day_diff_positive_recover, fill = Ethnicity)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Days between a Positive Test Result and Recovery Date by Month",
       x = "Month", 
       y = "Average Days",
       fill = "Ethncicities") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5))
print(barplot_gg2)

On average, the number of days between report of positive test result date and recovery date are all ranging from around 9 to 11 days across different ethnicities. Seemingly, there are no significance in variation of average days between sreport of positive test result date and recovery date and that they are are all about the same with those who are “Mixed” ethnicities having the lowest average at around 9 days.

Plot the number of case identifications over time, segmented by month-year intervals. Analyze and discuss any patterns or trends you observe.
```
# Convert to date format in order to get month and year.
dphss_covid_clean$month_year_full <- as.Date(dphss_covid_clean$PCR.Lab.Reported.Date, format = "%Y-%m-%d")

# Aggregating to get the count of cases for each unique month-year and case id.
case_counts <- dphss_covid_clean %>%
  group_by(month_year_full, clean_id) %>%
  summarize(count = n(), .groups = 'drop')

# Putting the date in order for graph to be depicted correctly.
case_counts <- case_counts %>%
  arrange(month_year_full)

bar_plot <- ggplot(case_counts, aes(format(month_year_full, "%B %Y"), y = count, fill = clean_id)) +
  geom_bar(stat = "identity", position = "dodge") +  
  labs(x = "Month-Year", y = "Count of Case Identifications", 
       title = "Counts of Case Identifications by Month-Year") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_discrete(name = "Case Identification") +
  scale_x_discrete(limits = unique(format(case_counts$month_year_full, "%B %Y")))

print(bar_plot)
```
- In November 2020, there was a spike in contact tracing upon case identification. I think during this time of the year was when COVID 19 was spreading quickly with more and more new cases. Because of these high new cases, there was a need for more contact tracing that created this sort of cause and effect loop (i.e. because there were more new cases, there was more new contact tracing, and through these new contact tracings there have been more new founded COVID19 cases and as a result more and more new contact tracing has been conducted - perhaps). I also think at this point, a lot of people were seeing more and more other people at gatherings due to fatigueness of having been stuck indoors for months since March of 2020. I am also thinking this was about the time Governor Lou declared less limitations to lock down allowing residents to start going out again and as result COVID had spread again and contact tracing was the main solution for quickly identifying new cases. Later into the timeline approaching the end of the year 2021, we can see multiple spikes in routine surveillance upon case identifications, indicating a significant event pertaining to COVID 19. This was around the time a new variant of COVID 19, Omicron, was founded. As a result of the new variant, routine surveillance was greatly implemented in the monitoring of the new variant.

One-hot encode the variables for sex, ethnicity (using the cleaned ethnicity data from our previous group work), and case identification. This will involve creating a binary column for each unique value in these categories, where each row indicates whether it matches that specific value. Here’s a visual

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# First I go the new dataframe with just thedesired variables to one-hot encode.
dphss_covid_clean_vill_sex_ci <- dphss_covid_clean %>% 
  select(id, new_vill, Ethnicity, clean_id)

dphss_covid_clean_vill_sex_ci

## # A tibble: 19,668 × 4
##        id new_vill              Ethnicity clean_id            
##     <dbl> <fct>                 <fct>     <fct>               
##  1  20201 DEDEDO                Filipino  Clinical evaluation 
##  2  20202 DEDEDO                Filipino  Clinical evaluation 
##  3  20203 CHALAN PAGO-ORDOT     Chamorro  Clinical evaluation 
##  4  20204 DEDEDO                Filipino  Contact tracing     
##  5  20205 TALOFOFO              Chamorro  Routine surveillance
##  6  20206 CHALAN PAGO-ORDOT     Chamorro  Contact tracing     
##  7  20207 DEDEDO                Filipino  Routine surveillance
##  8  20208 DEDEDO                Filipino  Clinical evaluation 
##  9  20209 TAMUNING-TUMON-HARMON Filipino  Contact tracing     
## 10 202010 TALOFOFO              Chamorro  Contact tracing     
## # ℹ 19,658 more rows

# I used the caret function to one-hot encode all desired variables (which were the data the class cleaned).
dummy_var <- dummyVars(" ~ .", data = dphss_covid_clean_vill_sex_ci)
one_hot_encoded_dphss_covid_clean_vill_sex_ci <- data.frame(predict(dummy_var, newdata = dphss_covid_clean_vill_sex_ci))

# one_hot_encoded_dphss_covid_clean_vill_sex_ci

Conduct a Chi-square Goodness-of-Fit test to determine whether the distribution of COVID-19 cases by village in 2020 is statistically significant. The observed values will be the number of cases by village from the dataset, and the expected values should be calculated using village populations (village population/total population * total COVID-19 cases in 2020). You can find the 2020 population data here: Census of Guam 2020. https://bsp.guam.gov/census-of-guam/ To perform a chi-squared test in R, you can use the chisq.test() function. For a goodness of fit test, you can specify into the functions your observed and expected values. Interpret and discuss your findings.

# Getting the counts for each village using the cleaned data.
vill_count <- table(dphss_covid_clean$new_vill)
print(vill_count)

## 
##         AGANA HEIGHTS                  AGAT            ASAN-MAINA 
##                   228                   397                   136 
##             BARRIGADA     CHALAN PAGO-ORDOT                DEDEDO 
##                  1235                   558                  5416 
##               HAGATNA              INARAJAN              MANGILAO 
##                   279                   225                  1819 
##                MERIZO   MONGMONG-TOTO-MAITE                   OOJ 
##                   106                   654                   115 
##                  PITI      SANTA RITA-SUMAI              SINAJANA 
##                   179                   866                   285 
##              TALOFOFO TAMUNING-TUMON-HARMON                UMATAC 
##                   350                  2336                    83 
##               UNKNOWN                  YIGO                  YONA 
##                   996                  2738                   667

# Making a dataframe out of the total counts and villages.
vill_count_df <- as.data.frame(vill_count)
colnames(vill_count_df) <- c("village", "count")

print(vill_count_df)

##                  village count
## 1          AGANA HEIGHTS   228
## 2                   AGAT   397
## 3             ASAN-MAINA   136
## 4              BARRIGADA  1235
## 5      CHALAN PAGO-ORDOT   558
## 6                 DEDEDO  5416
## 7                HAGATNA   279
## 8               INARAJAN   225
## 9               MANGILAO  1819
## 10                MERIZO   106
## 11   MONGMONG-TOTO-MAITE   654
## 12                   OOJ   115
## 13                  PITI   179
## 14      SANTA RITA-SUMAI   866
## 15              SINAJANA   285
## 16              TALOFOFO   350
## 17 TAMUNING-TUMON-HARMON  2336
## 18                UMATAC    83
## 19               UNKNOWN   996
## 20                  YIGO  2738
## 21                  YONA   667

# Removing OOJ and UNKNOWN rows to avoid row mismatch for calculations and chi square.
vill_count_df <- subset(vill_count_df, !(village %in% c("OOJ", "UNKNOWN")))

print(vill_count_df)

##                  village count
## 1          AGANA HEIGHTS   228
## 2                   AGAT   397
## 3             ASAN-MAINA   136
## 4              BARRIGADA  1235
## 5      CHALAN PAGO-ORDOT   558
## 6                 DEDEDO  5416
## 7                HAGATNA   279
## 8               INARAJAN   225
## 9               MANGILAO  1819
## 10                MERIZO   106
## 11   MONGMONG-TOTO-MAITE   654
## 13                  PITI   179
## 14      SANTA RITA-SUMAI   866
## 15              SINAJANA   285
## 16              TALOFOFO   350
## 17 TAMUNING-TUMON-HARMON  2336
## 18                UMATAC    83
## 20                  YIGO  2738
## 21                  YONA   667

vill_pop_2020 <-  c(3673, 4515, 2011, 7956, 7064, 44908, 943, 2317, 13476, 1604, 6380, 1585, 6470, 2611, 3550, 18489, 647, 19339, 6298)

# Calculating expected values.
vill_pop_2020_exp <- (vill_pop_2020/sum(vill_pop_2020))* sum(vill_count_df$count)
print(vill_pop_2020_exp)

##  [1]  443.06834  544.63750  242.58384  959.72004  852.11945 5417.18295
##  [7]  113.75264  279.49615 1625.58915  193.48805  769.60958  191.19611
## [13]  780.46615  314.96091  428.23104 2230.29962   78.04661 2332.83382
## [19]  759.71805

chi_square <- chisq.test(x = vill_count_df$count, p = (vill_pop_2020_exp/sum(vill_pop_2020_exp)))

print(chi_square)

## 
##  Chi-squared test for given probabilities
## 
## data:  vill_count_df$count
## X-squared = 816.64, df = 18, p-value < 2.2e-16

Assuming the null hypothesis is that the expected cases are the same as the observed cases, the p-value of < 2.2 * 10^(-16) indicates that the result is highly statistically significant and we reject the null. Therefore, there is strong evidence to suggest that the distribution of COVID-19 cases by village in 2020 is not consistent with the expected distribution. And thus, this implies that the distribution of cases is significantly different from what was expected.

Homework 4:

Michael Cajigal

2024-09-22

Problems