Uploading Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flextable)
## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
library(table1)
## 
## Attaching package: 'table1'
## 
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Uploading Data

data = readRDS(file = "nhanesMetS_version 2 (1).rds")

Question 1

Part (a)

histogram =
  data %>% 
  ggplot(aes(x = sys)) + 
  geom_histogram(aes(y = ..density..),
                 fill = "red", color = "black", binwidth = 2) + 
  geom_density(color = "blue", linetype = 2)

histogram

The plot conveys a bell-shaped distribution with a very high peak.

Part (b)

density1 = 
  data %>% 
  ggplot(aes(x = sys)) + 
  geom_density(aes(fill = mets.ind, linetype = mets.ind),
               color = "black",
               linewidth = 1,
               alpha = 0.1) + 
  xlab("Systolic Blood Pressure") + 
  ylab(NULL) + 
  scale_x_continuous(trans = "log",
                     breaks = c(10, 20, 30, 40, 50)) + 
  scale_fill_manual(values = c("lightblue", "lightpink")) + 
  theme_classic()

density1

Based on the plot, stratified by MetS status, shows that individuals without MetS are on average have lower systolic blood pressure although the difference is not too significant.

Part (c)

density2 = 
data %>% 
  group_by(mets.ind) %>% 
  mutate(m = mean(sys, na.rm = T)) %>% 
  ungroup %>%
  ggplot(aes(x = sys)) + 
  geom_density(aes(fill = mets.ind, linetype = mets.ind),
               color = "black",
               linewidth = 1,
               alpha = 0.1) + 
  geom_vline(aes(xintercept = m, color = mets.ind),
             linetype = 2, 
             size =2) + 
  xlab("Systolic Blood Pressure") + 
  ylab(NULL) + 
  scale_x_continuous(trans = "log",
                     breaks = c(10, 20, 30, 40, 50)) + 
  scale_fill_manual(values = c("lightblue", "lightpink")) + 
  theme_classic()

density2

As seen in the visualization, the mean for MetS- and MetS+ are right around the peaks of each density plot.

Part (d)

combined_viz = 
  grid.arrange(histogram,
               arrangeGrob(density1, 
                           density2, 
                           ncol = 2),
               heights = c(2, 1))

Question 2

Computing the proportion of MetS+ for each value of sleep and its standard error (SE).

mets_sleep = 
  data %>% 
  group_by(sleep) %>%
  summarise(
    total_count = n(),
    mets_count = sum(mets.ind == "MetS+"),
    proportion = mets_count / total_count,
    standard_error = (proportion * (1 - proportion) / total_count))

mets_sleep
## # A tibble: 25 × 5
##    sleep total_count mets_count proportion standard_error
##    <dbl>       <int>      <int>      <dbl>          <dbl>
##  1  5            108         34      0.315       0.00200 
##  2  5.25          19          5      0.263       0.0102  
##  3  5.5           90         25      0.278       0.00223 
##  4  5.75          27          6      0.222       0.00640 
##  5  6            215         43      0.2         0.000744
##  6  6.25          50          9      0.18        0.00295 
##  7  6.5          267         62      0.232       0.000668
##  8  6.75         111         24      0.216       0.00153 
##  9  7            533        115      0.216       0.000317
## 10  7.25         144         39      0.271       0.00137 
## # ℹ 15 more rows

Part (a)

metssleep_viz = 
mets_sleep %>% 
  ggplot(aes(x = sleep,
             y = proportion)) +
  geom_point() + 
  geom_errorbar(aes(ymin = proportion - standard_error,
                    ymax = proportion + standard_error),
                width = 0.1) + 
  labs(x = "Sleep Duration (hrs)",
       y = "Proportion of MetS+",
       title = "Relationship between MetS+ and Sleep Duration") + 
  theme_classic()

metssleep_viz

Based on the visualization, there seems to be a downward trend between the proportions of MetS+ and sleep duration.

Part (b)

data %>% 
  group_by(sleep) %>%
  mutate(
    total_count = n(),
    mets_count = sum(mets.ind == "MetS+"),
    proportion = mets_count / total_count,
    standard_error = (proportion * (1 - proportion) / total_count)) %>% 
  ggplot(aes(x = sleep,
             y = proportion,
             color = gender)) + 
  geom_point() + 
  geom_errorbar(aes(ymin = proportion - standard_error,
                    ymax = proportion + standard_error),
                width = 0.1,
                position = position_dodge(width = 0.2)) + 
  labs(x = "Sleep Duration (hrs)", 
       y = "Proportion of MetS+",
       title = "Relationship between MetS+ and Sleep Duration, by gender",
       color = "gender") + 
  theme_classic()

Based on the plot, it seems as if females get more sleep on average and are less likely to be MetS+.

Part (c)

data %>% 
  group_by(sleep) %>%
  mutate(
    total_count = n(),
    mets_count = sum(mets.ind == "MetS+"),
    proportion = mets_count / total_count,
    standard_error = (proportion * (1 - proportion) / total_count)) %>% 
  ggplot(aes(x = sleep,
             y = proportion)) + 
  geom_point() + 
  geom_errorbar(aes(ymin = proportion - standard_error,
                    ymax = proportion + standard_error),
                width = 0.1) + 
  labs(x = "Sleep Duration (hrs)", 
       y = "Proportion of MetS+",
       title = "Relationship between MetS+ and Sleep Duration, by sex") + 
  facet_wrap(~ gender, ncol = 1) + 
  theme_classic()

Question 3

Reshaping the data

pivot_data = 
  data %>% 
  pivot_longer(cols = c("crit1", "crit2", "crit3", "crit4", "crit5"),
               names_to = "criterion",
               values_to = "met_criterion")

Calculating Proportions and SEs

criterion_data = 
pivot_data %>% 
  group_by(criterion, met_criterion) %>% 
  summarise(
    total_count = n(),
    mets_count = sum(mets.ind == "MetS+"),
    proportion = mets_count / total_count,
    standard_error = (proportion * (1 - proportion) / total_count),
    mean_age = mean(age, na.rm = T),
    sys = mean(sys)) %>% 
  filter(met_criterion == 1) %>% 
  select(-met_criterion)
## `summarise()` has grouped output by 'criterion'. You can override using the
## `.groups` argument.
criterion_data
## # A tibble: 5 × 7
## # Groups:   criterion [5]
##   criterion total_count mets_count proportion standard_error mean_age   sys
##   <chr>           <int>      <int>      <dbl>          <dbl>    <dbl> <dbl>
## 1 crit1            2147        976      0.455       0.000115     53.3  130.
## 2 crit2             450        322      0.716       0.000452     52.7  131.
## 3 crit3            1405        872      0.621       0.000168     51.1  129.
## 4 crit4            1973        772      0.391       0.000121     59.9  143.
## 5 crit5            1503        682      0.454       0.000165     55.4  131.
# I was struggling to find a way to incorporate MetS+ and MetS-, 
# so for this last part of the assignment, I took those that had met the criterion

Plotting

ggplot(criterion_data, aes(x = criterion, 
                           y = proportion)) +
  geom_point(aes(size = mean_age, 
                 color = sys)) +
  geom_errorbar(aes(ymin = proportion - standard_error, 
                    ymax = proportion + standard_error, 
                    color = sys), 
                width = 0.2) +
  geom_hline(yintercept = mean(criterion_data$proportion), 
             linetype = "dotted",
             color = "black") +
  scale_x_discrete(labels = c("Criterion 1", 
                              "Criterion 2", 
                              "Criterion 3", 
                              "Criterion 4", 
                              "Criterion 5")) +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1)) +
  xlab("Criterion") +
  ylab("Proportion") +
  theme_classic() +
  scale_color_gradient(low = "lightblue", high = "lightpink") +
  scale_size_continuous(range = c(3, 8)) +
  guides(size = FALSE)