1. Introduction

This analysis examines the distribution of asthma status across key sociodemographic variables, including gender, education level, and income. The data are drawn from a population-based survey, and the analysis aims to explore potential associations between these variables and lifetime asthma diagnosis.

library(ggplot2)
library(dplyr)

3. Load and Prepare Data

# Import the survey extract from disk
data <- read.csv("/Users/sakibyasar/Library/CloudStorage/Dropbox/Queens MPH 2nd year 1st Semester/EPID 824/All files Assingment 1/Assignment 1/question 2c.csv")

# Recode the three sociodemographic columns from numeric codes to labelled
# factors in a single pass. Any value not listed in `levels` becomes NA.
data <- data %>%
  mutate(
    # Gender: 1 = Male, 2 = Female
    SDC_GENDER = factor(
      SDC_GENDER,
      levels = 1:2,
      labels = c("Male", "Female")
    ),
    # Highest education attained: codes 0 through 7
    SDC_EDU_LEVEL = factor(
      SDC_EDU_LEVEL,
      levels = 0:7,
      labels = c(
        "None",
        "Elementary school",
        "High school",
        "Trade, technical or vocational school",
        "Diploma from a community college",
        "University certificate below bachelor's level",
        "Bachelor's degree",
        "Graduate degree"
      )
    ),
    # Income bracket: codes 1 through 8
    SDC_INCOME = factor(
      SDC_INCOME,
      levels = 1:8,
      labels = c(
        "Less than 10 000 $",
        "10 000 $ - 24 999 $",
        "25 000 $ - 49 999 $",
        "50 000 $ - 74 999 $",
        "75 000 $ - 99 999 $",
        "100 000 $ - 149 999 $",
        "150 000 $ - 199 999 $",
        "200 000 $ or more"
      )
    )
  )

# Show the first six rows to confirm the recoding
head(data, 6)
##          ID DIS_ASTHMA_EVER SDC_GENDER                         SDC_EDU_LEVEL
## 1 SYN_58621               0     Female      Diploma from a community college
## 2 SYN_58622               0     Female      Diploma from a community college
## 3 SYN_58623               2     Female      Diploma from a community college
## 4 SYN_58624               0     Female Trade, technical or vocational school
## 5 SYN_58625               1     Female                     Bachelor's degree
## 6 SYN_58626               2     Female      Diploma from a community college
##              SDC_INCOME PM_STAND_HEIGHT_SR_AVG PM_WEIGHT_SR_AVG PM_HIP_SR_AVG
## 1 100 000 $ - 149 999 $                     NA               NA            NA
## 2 100 000 $ - 149 999 $                 157.48         69.96662       107.950
## 3   50 000 $ - 74 999 $                 167.64         78.69828        94.615
## 4   25 000 $ - 49 999 $                 154.94         71.66759       104.775
## 5                  <NA>                 157.48         80.78480       109.220
## 6   50 000 $ - 74 999 $                     NA               NA            NA
##   PM_WAIST_SR_AVG PM_WAIST_HIP_RATIO_SR PM_BMI_SR
## 1              NA                    NA        NA
## 2         106.045             0.9651163  28.32615
## 3          96.520             0.9642857  25.50819
## 4          72.390             0.6807229  22.45917
## 5         161.290             1.0545829  44.78292
## 6              NA                    NA        NA

4. Data Summary

# Compact column-by-column summary: name, type, and leading values
data %>% glimpse()
## Rows: 41,187
## Columns: 11
## $ ID                     <chr> "SYN_58621", "SYN_58622", "SYN_58623", "SYN_586…
## $ DIS_ASTHMA_EVER        <int> 0, 0, 2, 0, 1, 2, 0, NA, 0, 0, NA, 0, 0, 0, 0, …
## $ SDC_GENDER             <fct> Female, Female, Female, Female, Female, Female,…
## $ SDC_EDU_LEVEL          <fct> "Diploma from a community college", "Diploma fr…
## $ SDC_INCOME             <fct> 100 000 $ - 149 999 $, 100 000 $ - 149 999 $, 5…
## $ PM_STAND_HEIGHT_SR_AVG <dbl> NA, 157.48, 167.64, 154.94, 157.48, NA, 162.56,…
## $ PM_WEIGHT_SR_AVG       <dbl> NA, 69.96662, 78.69828, 71.66759, 80.78480, NA,…
## $ PM_HIP_SR_AVG          <dbl> NA, 107.9500, 94.6150, 104.7750, 109.2200, NA, …
## $ PM_WAIST_SR_AVG        <dbl> NA, 106.0450, 96.5200, 72.3900, 161.2900, NA, 8…
## $ PM_WAIST_HIP_RATIO_SR  <dbl> NA, 0.9651163, 0.9642857, 0.6807229, 1.0545829,…
## $ PM_BMI_SR              <dbl> NA, 28.32615, 25.50819, 22.45917, 44.78292, NA,…
# Count the NA entries in every column
writeLines("Missing values per column:")
## Missing values per column:
data %>%
  is.na() %>%
  colSums()
##                     ID        DIS_ASTHMA_EVER             SDC_GENDER 
##                      0                   1228                      0 
##          SDC_EDU_LEVEL             SDC_INCOME PM_STAND_HEIGHT_SR_AVG 
##                    875                   5934                   9801 
##       PM_WEIGHT_SR_AVG          PM_HIP_SR_AVG        PM_WAIST_SR_AVG 
##                  11707                  17349                  16400 
##  PM_WAIST_HIP_RATIO_SR              PM_BMI_SR 
##                  17734                  11976

5. Visualizations

The following bar plots show the distribution of asthma status across gender, education level, and income groups.

# Draw one dodged bar chart per sociodemographic variable. Bars are split by
# asthma status (DIS_ASTHMA_EVER) and each bar is annotated with its count.
categorical_vars <- c("SDC_GENDER", "SDC_EDU_LEVEL", "SDC_INCOME")
titles_cat <- c("Gender", "Education Level", "Income")

# Tie every DIS_ASTHMA_EVER code to its label and colour explicitly. Unnamed
# `values`/`labels` in scale_fill_manual() are matched by position only, so a
# status that happened to be absent from the data would shift the remaining
# labels and colours onto the wrong groups.
asthma_codes <- c(0, 1, 2)
asthma_labels <- c("Never had asthma",
                   "Ever had asthma",
                   "Presumed - Never had asthma")
asthma_colours <- c("Never had asthma" = "skyblue",
                    "Ever had asthma" = "salmon",
                    "Presumed - Never had asthma" = "lightgreen")

# seq_along() yields an empty sequence for an empty vector, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(categorical_vars)) {
  # `.data[[name]]` replaces the deprecated aes_string() for selecting a
  # column by its name held in a string.
  plot <- ggplot(data, aes(x = .data[[categorical_vars[i]]],
                           fill = factor(DIS_ASTHMA_EVER,
                                         levels = asthma_codes,
                                         labels = asthma_labels))) +
    geom_bar(position = "dodge") +
    # after_stat(count) replaces the deprecated `..count..` notation; the
    # dodge width of 0.9 matches geom_bar()'s default so labels sit on bars.
    geom_text(aes(label = after_stat(count)),
              stat = "count",
              position = position_dodge(width = 0.9),
              vjust = -0.5,
              size = 3) +
    labs(title = paste("Distribution of Asthma Status by", titles_cat[i]),
         x = titles_cat[i],
         y = "Count",
         fill = "Asthma Status") +
    # Named values are matched to factor levels by name, not position.
    # Rows with a missing asthma status keep the scale's default NA colour.
    scale_fill_manual(values = asthma_colours) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(face = "bold", size = 13),
          legend.position = "top")

  # Plots built inside a for loop are not auto-printed; print explicitly.
  print(plot)
}