Coding exercise presentation

Amin Sherzad

2022-06-13

Content

  • Business task
  • Data overview
  • R packages
  • Data cleaning & manipulation
  • Descriptive analysis
  • Visualizations or Outputs
  • Q & A
  • Extra descriptive analysis

Business task

Discovering insight from Mexico COVID dataset by doing some descriptive analysis

Data overview

  • Dataset is provided by Palladium, it’s expected to be credible for this case study
  • Dataset is from Mexico COVID records from January to June 2020
  • Dataset contains information about COVID patients along some demographic and comorbidities

R packages

  • ggplot2 for visualization
  • dplyr for data munging or wrangling
  • tidyr for tidying data
  • readxl for importing the dataset
library(ggplot2)
library(dplyr)
library(tidyr)
library(readxl)

Data cleaning and manipulation

  • Missing values; particularly for comorbidities, only “Yes” is considered
  • Outlier check on age variable, remove outliers
boxplot(covid_df$age)

#remove outliers, make clean dataset
covid_df_cln <- covid_df %>% 
  filter(age > 0 & age < 107)

Data cleaning and manipulation (Cont)

  • Inconsistency check on sex variable, resolve inconsistencies
#checking responses case sensitivity
table(covid_df_cln$sex)

#recoding to proper spelling
covid_df_cln <- covid_df_cln %>% mutate(
  sex = case_when(
    sex == "Femalee" ~ "Female",
    sex == "Males" ~ "Male",
    TRUE ~ sex
  )
)

Data cleaning and manipulation (Cont)

  • Binarize date_died variable to ease dealing with it in further steps
#adding column to indicate deid or not died(this helps the coding to be clear and readable in later steps)
covid_df_cln <- covid_df_cln %>% mutate(
  died_yn = case_when(
    !is.na(date_died) ~ "Yes",
    TRUE ~ "No"
  )
)

Descriptive analysis

  • Summary of dataset
  • Question 1, calculating Mean, Median, Mode, Range of age of died patients
#summary of variables
summary(covid_df_cln)

## Answering to question 1
#calculate the summary of people who died
died_smry <- covid_df_cln %>% 
  filter(!is.na(date_died)) %>%
  summarise(
    died_mean = mean(age, na.rm = T),
    died_median = median(age, na.rm = T),
    died_mode = getMode(age),
    died_range = max(age, na.rm = T) - min(age, na.rm = T)
  )

#getMode function
getMode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

Descriptive analysis (Cont)

  • Question 2, Relationship between ICU admission and Sex
## Answer to question 2
#finding any relationship between ICU admission and sex of patient
icu_sex_rel <- covid_df_cln %>% 
  filter(!is.na(icu)) %>% 
  group_by(sex) %>% 
  summarise(
    n_of_patients = n()
  )

Descriptive analysis (Cont)

  • Question 3, Associated comorbidities with ICU and Dying
## Answer to question 3
#calculating comorbidities for died
dying_comorbidities <- covid_df_cln %>% group_by(ICU_died = died_yn) %>%
  filter(ICU_died == "Yes") %>% 
  summarise(
    n_of_patients = n(),
    diabetes = length(diabetes[diabetes == "Yes"]),
    copd = length(asthma[asthma == "Yes"]),
    asthma = length(asthma[asthma == "Yes"]),
    inmsupr = length(inmsupr[inmsupr == "Yes"]),
    hypertension = length(hypertension[hypertension == "Yes"]),
    cardiovascular = length(cardiovascular[cardiovascular == "Yes"]),
    tobacco = length(tobacco[tobacco == "Yes"]),
    renal_chronic = length(renal_chronic[renal_chronic == "Yes"]),
    obesity = length(obesity[obesity == "Yes"]),
  ) %>% mutate(
    ICU_died = case_when(
      ICU_died == "Yes" ~ "Died", TRUE ~ ICU_died
      )
    )

#calculating comorbidities for ICU
icu_comorbidities <- covid_df_cln %>% group_by(ICU_died = icu) %>%
  filter(ICU_died == "Yes") %>% 
  summarise(
    n_of_patients = n(),
    diabetes = length(diabetes[diabetes == "Yes"]),
    copd = length(asthma[asthma == "Yes"]),
    asthma = length(asthma[asthma == "Yes"]),
    inmsupr = length(inmsupr[inmsupr == "Yes"]),
    hypertension = length(hypertension[hypertension == "Yes"]),
    cardiovascular = length(cardiovascular[cardiovascular == "Yes"]),
    tobacco = length(tobacco[tobacco == "Yes"]),
    renal_chronic = length(renal_chronic[renal_chronic == "Yes"]),
    obesity = length(obesity[obesity == "Yes"]),
  ) %>% mutate(
    ICU_died = case_when(
      ICU_died == "Yes" ~ "ICU", TRUE ~ ICU_died
    )
  )

#Merging both ICU and Died into one table
icu_died_associates <- bind_rows(dying_comorbidities, icu_comorbidities)

#Reshaping the data to long format, to make more suitable for visualization
icu_died_associates_lng <- icu_died_associates %>% select(-n_of_patients) %>% 
  gather(key = "Associates", value = "Patients", -ICU_died)

Descriptive analysis (Cont)

  • Question 4, Relationship between Sex and COVID positive result
#Answer to Question 4, Relationship between sex and COVID positive result
sex_res_rel <- covid_df_cln %>% group_by(sex, covid_res) %>% 
  filter(covid_res %in% c("Positive")) %>% 
  summarise(
    n_of_patients = n()
  )

Visualizations and Outputs

Mean, Median, Mode and Range of age of patients who died

died_smry
## # A tibble: 1 x 4
##   died_mean died_median died_mode died_range
##       <dbl>       <dbl>     <dbl>      <dbl>
## 1      61.2          63        65         99

Visualizations and Outputs (Cont)

ICU admission and Sex relationship

icu_sex_rel %>% ggplot(aes(x = sex, y = n_of_patients, fill = sex))+
  geom_bar(stat = "identity", position = "dodge")+
  labs(title = "ICU and Sex relationship", x = "Sex", y = "# of patients in ICU", fill = "Sex")

image

Visualizations and Outputs (Cont)

Associated comorbidities with ICU and Dying

icu_died_associates_lng %>% ggplot(aes(x = Associates, y = Patients, fill = ICU_died))+
  geom_bar(stat = "identity", position = "dodge")+
  labs(title = "ICU or Died associates", y = "# of patients", fill = "ICU or Died")+
  theme(axis.text.x = element_text(angle = 90))

image

Visualizations and Outputs (Cont)

Relationship between sex and COVID positive result

sex_res_rel %>% ggplot(aes(x = sex, y = n_of_patients, fill = sex))+
  geom_bar(stat = "identity", position = "dodge")+
  labs(title = "Sex and Positive result relationship", x = "Sex", y = "# of patient", fill = "Sex")

image

Entry admission after symptoms

covid_df_cln$symptoms_month <- month(covid_df_cln$date_symptoms, label = T)
covid_df_cln %>% ggplot(aes(x = symptoms_month, y = as.numeric(delayed_cure), fill = symptoms_month))+
  geom_bar(stat = "identity", position = "dodge")+
  labs(title = "Entry admission after symptoms", x = "Months", y = "Days after symptoms", fill = "Months")

image

Relation between COVID result and COVID contact

covid_cont_covid_res <- covid_df_cln %>% filter(covid_res %in% c("Positive", "Negative")) %>% 
  group_by(contact_other_covid, covid_res) %>% 
  summarise(
    n_value = n(),
  ) %>% filter(contact_other_covid == "Yes")

covid_cont_covid_res %>% ggplot(aes(x = covid_res, y = n_value, fill = covid_res))+
  geom_bar(stat = "identity", position = "dodge")+
  labs(title = "COVID result by COVID contact", x = "COVID Result", y = "# of patient", fill = "COVID Result")

image

Thank you for your time :)