This analysis examines the distribution of asthma status across key sociodemographic variables: gender, education level, and income. The data are drawn from a population-based survey, and the analysis explores potential associations between these variables and lifetime asthma diagnosis.
library(ggplot2)
library(dplyr)
# Load the survey extract into a data frame.
# NOTE(review): the path is absolute and specific to one machine; anyone else
# running this script has to point it at their own copy of the CSV.
data <- read.csv(
  file = "/Users/sakibyasar/Library/CloudStorage/Dropbox/Queens MPH 2nd year 1st Semester/EPID 824/All files Assingment 1/Assignment 1/question 2c.csv"
)
# Recode gender from the codes 1 and 2 to a labelled factor.
# Any value other than 1 or 2 becomes NA.
data[["SDC_GENDER"]] <- factor(
  data[["SDC_GENDER"]],
  levels = 1:2,
  labels = c("Male", "Female")
)
# Recode education level from the codes 0-7 to a labelled factor.
# Labels are listed in the same order as the codes; any other value becomes NA.
data[["SDC_EDU_LEVEL"]] <- factor(
  data[["SDC_EDU_LEVEL"]],
  levels = 0:7,
  labels = c(
    "None",
    "Elementary school",
    "High school",
    "Trade, technical or vocational school",
    "Diploma from a community college",
    "University certificate below bachelor's level",
    "Bachelor's degree",
    "Graduate degree"
  )
)
# Recode income bracket from the codes 1-8 to a labelled factor.
# Labels are listed in the same order as the codes; any other value becomes NA.
data[["SDC_INCOME"]] <- factor(
  data[["SDC_INCOME"]],
  levels = 1:8,
  labels = c(
    "Less than 10 000 $",
    "10 000 $ - 24 999 $",
    "25 000 $ - 49 999 $",
    "50 000 $ - 74 999 $",
    "75 000 $ - 99 999 $",
    "100 000 $ - 149 999 $",
    "150 000 $ - 199 999 $",
    "200 000 $ or more"
  )
)
# Show the first six rows to confirm the recoded factors look right
head(data, n = 6L)
## ID DIS_ASTHMA_EVER SDC_GENDER SDC_EDU_LEVEL
## 1 SYN_58621 0 Female Diploma from a community college
## 2 SYN_58622 0 Female Diploma from a community college
## 3 SYN_58623 2 Female Diploma from a community college
## 4 SYN_58624 0 Female Trade, technical or vocational school
## 5 SYN_58625 1 Female Bachelor's degree
## 6 SYN_58626 2 Female Diploma from a community college
## SDC_INCOME PM_STAND_HEIGHT_SR_AVG PM_WEIGHT_SR_AVG PM_HIP_SR_AVG
## 1 100 000 $ - 149 999 $ NA NA NA
## 2 100 000 $ - 149 999 $ 157.48 69.96662 107.950
## 3 50 000 $ - 74 999 $ 167.64 78.69828 94.615
## 4 25 000 $ - 49 999 $ 154.94 71.66759 104.775
## 5 <NA> 157.48 80.78480 109.220
## 6 50 000 $ - 74 999 $ NA NA NA
## PM_WAIST_SR_AVG PM_WAIST_HIP_RATIO_SR PM_BMI_SR
## 1 NA NA NA
## 2 106.045 0.9651163 28.32615
## 3 96.520 0.9642857 25.50819
## 4 72.390 0.6807229 22.45917
## 5 161.290 1.0545829 44.78292
## 6 NA NA NA
# Compact summary of the data frame: dimensions, column types, leading values
dplyr::glimpse(data)
## Rows: 41,187
## Columns: 11
## $ ID <chr> "SYN_58621", "SYN_58622", "SYN_58623", "SYN_586…
## $ DIS_ASTHMA_EVER <int> 0, 0, 2, 0, 1, 2, 0, NA, 0, 0, NA, 0, 0, 0, 0, …
## $ SDC_GENDER <fct> Female, Female, Female, Female, Female, Female,…
## $ SDC_EDU_LEVEL <fct> "Diploma from a community college", "Diploma fr…
## $ SDC_INCOME <fct> 100 000 $ - 149 999 $, 100 000 $ - 149 999 $, 5…
## $ PM_STAND_HEIGHT_SR_AVG <dbl> NA, 157.48, 167.64, 154.94, 157.48, NA, 162.56,…
## $ PM_WEIGHT_SR_AVG <dbl> NA, 69.96662, 78.69828, 71.66759, 80.78480, NA,…
## $ PM_HIP_SR_AVG <dbl> NA, 107.9500, 94.6150, 104.7750, 109.2200, NA, …
## $ PM_WAIST_SR_AVG <dbl> NA, 106.0450, 96.5200, 72.3900, 161.2900, NA, 8…
## $ PM_WAIST_HIP_RATIO_SR <dbl> NA, 0.9651163, 0.9642857, 0.6807229, 1.0545829,…
## $ PM_BMI_SR <dbl> NA, 28.32615, 25.50819, 22.45917, 44.78292, NA,…
# Count the NA entries in each column, printed under a heading line
cat("Missing values per column:\n")
## Missing values per column:
vapply(data, function(column) sum(is.na(column)), numeric(1))
## ID DIS_ASTHMA_EVER SDC_GENDER
## 0 1228 0
## SDC_EDU_LEVEL SDC_INCOME PM_STAND_HEIGHT_SR_AVG
## 875 5934 9801
## PM_WEIGHT_SR_AVG PM_HIP_SR_AVG PM_WAIST_SR_AVG
## 11707 17349 16400
## PM_WAIST_HIP_RATIO_SR PM_BMI_SR
## 17734 11976
The following bar plots show the distribution of asthma status across gender, education level, and income groups.
# Draw one dodged bar chart per sociodemographic variable, with bars split by
# asthma status and the count printed above each bar.
categorical_vars <- c("SDC_GENDER", "SDC_EDU_LEVEL", "SDC_INCOME")
titles_cat <- c("Gender", "Education Level", "Income")

# Asthma status codes, their legend labels and their fill colours, listed in
# matching order. Colours are keyed by label so each status keeps its own
# colour and label even when a status has no observations in the data.
asthma_codes <- c(0, 1, 2)
asthma_labels <- c("Never had asthma",
                   "Ever had asthma",
                   "Presumed - Never had asthma")
asthma_colours <- c("skyblue", "salmon", "lightgreen")
names(asthma_colours) <- asthma_labels

# seq_along() gives an empty sequence for an empty vector, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(categorical_vars)) {
  # aes() with the .data pronoun replaces the deprecated aes_string();
  # the column to plot on the x axis is looked up by name.
  plot <- ggplot(data,
                 aes(x = .data[[categorical_vars[i]]],
                     fill = factor(DIS_ASTHMA_EVER,
                                   levels = asthma_codes,
                                   labels = asthma_labels))) +
    geom_bar(position = "dodge") +
    # after_stat(count) replaces the deprecated ..count.. notation; the same
    # dodge width as the bars keeps each label above its own bar.
    geom_text(aes(label = after_stat(count)),
              stat = "count",
              position = position_dodge(width = 0.9),
              vjust = -0.5,
              size = 3) +
    labs(title = paste("Distribution of Asthma Status by", titles_cat[i]),
         x = titles_cat[i],
         y = "Count",
         fill = "Asthma Status") +
    # drop = FALSE keeps all three statuses in the legend of every plot
    scale_fill_manual(values = asthma_colours, drop = FALSE) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(face = "bold", size = 13),
          legend.position = "top")
  # Explicit print() is required for ggplot objects created inside a loop
  print(plot)
}