HNanesPAM

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

install.packages("dplyr")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

install.packages("haven")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

install.packages("survey")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

install.packages("quantreg")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

install.packages("lme4")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

install.packages("ggplot2")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

install.packages("Matrix")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

# Load packages
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(haven)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(quantreg)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

## 
## Attaching package: 'quantreg'

## The following object is masked from 'package:survival':
## 
##     untangle.specials

library(lme4)
library(ggplot2)
library(Matrix)

setwd("~/Nhanes")

# Set working directory

# Read data
#demo <- haven::read_xpt("DEMO_H.XPT") %>% select("SEQN", "RIDAGEYR", "RIAGENDR", "WTMEC2YR", "SDMVPSU", "SDMVSTRA")
# Load data from XPT files
demo <- haven::read_xpt("DEMO_H.XPT")
bpx <- haven::read_xpt("BPX_H.XPT")
paday <- haven::read_xpt("PAXDAY_H.XPT")
cfq <- haven::read_xpt("CFQ_H.XPT") #remove later - high volume of missing data
CDQ <- haven::read_xpt("CDQ_H.XPT") #remove later - low number of AnginaGrade1 & AnginaGrade2
srmq <- read_xpt("SMQ_H.XPT")
icd10 <- read_xpt("RXQ_RX_H.XPT")


 

# Filter and transform paday data
paday <- paday %>%
  select(SEQN, PAXWWMD, PAXMTSD) %>%
  filter(PAXWWMD >= 600 & !is.na(SEQN)) %>%  # Combine filtering conditions with group identifier
  group_by(SEQN) %>%
  summarize(overallmims = mean(as.numeric(PAXMTSD), na.rm = TRUE))

# Merge data
data <- merge(demo, paday, by = "SEQN", all = TRUE)
data <- merge(data, bpx, by = "SEQN", all = TRUE)
data <- merge(data, cfq, by = "SEQN", all = TRUE)
data <- merge(data, CDQ, by = "SEQN", all = TRUE)
data <- merge(data, srmq, by = "SEQN", all = TRUE)
data <- merge(data, icd10, by = "SEQN", all = TRUE)

# Create survey design
dsn <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE)

# Perform summary statistics
svyby(~overallmims, ~RIAGENDR, design = dsn, na.rm = T, vartype = "se", svymean)

##   RIAGENDR overallmims       se
## 1        1    12341.46 121.2826
## 2        2    12859.53 117.6554

# Perform regression analysis
svyglm(as.numeric(overallmims) ~ as.numeric(RIDAGEYR), design = dsn)

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = as.numeric(overallmims) ~ as.numeric(RIDAGEYR), 
##     design = dsn)
## 
## Coefficients:
##          (Intercept)  as.numeric(RIDAGEYR)  
##             17660.51                -99.59  
## 
## Degrees of Freedom: 15709 Total (i.e. Null);  14 Residual
##   (4484 observations deleted due to missingness)
## Null Deviance:       2.668e+11 
## Residual Deviance: 1.927e+11     AIC: 304300

# Examine the distribution of activity level (overallmims)
hist(data$overallmims)

# Examine the distribution of activity level by - Annual household income (INDHHIN2)

income <- !is.na(data$overallmims) & !is.na(data$INDHHIN2) & data$INDHHIN2 != 77 & data$INDHHIN2 != 99 & data$INDHHIN2 != "."

# Filter the data based on the keep vector
data_filtered <- data[income,]

# Create the boxplot
boxplot(data_filtered$overallmims ~ data_filtered$INDHHIN2)

# Examine the distribution of activity level by gender (RIAGENDR)
boxplot(data$overallmims ~ data$RIAGENDR)

# the boxplot shows that the median overallmims is higher for males than females. This means that, on average, males are more physically active than females.

# There are a number of possible explanations for this gender difference in physical activity levels.


# Examine the distribution of activity level by race/ethnicity (RIDRETH3)
boxplot(data$overallmims ~ data$RIDRETH3)

# Test for the association between activity level and socioeconomic status using linear regression
svyglm(overallmims ~INDHHIN2, data = data, design = svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE))

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = overallmims ~ INDHHIN2, design = svydesign(id = ~SDMVPSU, 
##     strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE), 
##     data = data)
## 
## Coefficients:
## (Intercept)     INDHHIN2  
##    12413.11        20.26  
## 
## Degrees of Freedom: 15553 Total (i.e. Null);  14 Residual
##   (4640 observations deleted due to missingness)
## Null Deviance:       2.649e+11 
## Residual Deviance: 2.641e+11     AIC: 306300

#Overall, the findings of the survey-weighted linear regression analysis suggest that there is a positive association between overall MIMS and socioeconomic status in the Nhanes dataset. This means that individuals with higher socioeconomic status tend to have higher levels of physical activity on average.

# Test for the association between activity level and gender using linear regression
svyglm(overallmims ~ RIAGENDR, data = data, design = svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE))

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = overallmims ~ RIAGENDR, design = svydesign(id = ~SDMVPSU, 
##     strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE), 
##     data = data)
## 
## Coefficients:
## (Intercept)     RIAGENDR  
##     11823.4        518.1  
## 
## Degrees of Freedom: 15709 Total (i.e. Null);  14 Residual
##   (4484 observations deleted due to missingness)
## Null Deviance:       2.668e+11 
## Residual Deviance: 2.657e+11     AIC: 309400

#The survey-weighted linear regression analysis suggests that there is a gender gap in physical activity levels, with male individuals tending to have higher overall MIMS compared to female individuals.

# Test for the association between activity level and race/ethnicity using ANOVA
svyglm(overallmims ~ RIDRETH3, data = data, design = svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE))

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = overallmims ~ RIDRETH3, design = svydesign(id = ~SDMVPSU, 
##     strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE), 
##     data = data)
## 
## Coefficients:
## (Intercept)     RIDRETH3  
##     13390.0       -241.8  
## 
## Degrees of Freedom: 15709 Total (i.e. Null);  14 Residual
##   (4484 observations deleted due to missingness)
## Null Deviance:       2.668e+11 
## Residual Deviance: 2.656e+11     AIC: 309400

#survey weighted analysis 
#
#

# Create the survey design object
svydesign <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE)

# Model for socioeconomic status
model_socioeconomic <- svyglm(overallmims ~ INDHHIN2, design = svydesign, data = data)
summary(model_socioeconomic)

## 
## Call:
## svyglm(formula = overallmims ~ INDHHIN2, design = svydesign, 
##     data = data)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12413.112    113.319 109.541   <2e-16 ***
## INDHHIN2       20.259      7.268   2.787   0.0145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 16980332)
## 
## Number of Fisher Scoring iterations: 2

# the survey-weighted linear regression analysis suggests that there is a positive association between overall MIMS and socioeconomic status in the Nhanes dataset. This means that individuals with higher socioeconomic status tend to have higher levels of physical activity on average. The p-value for the INDHHIN2 coefficient is less than 0.001, indicating that the association is statistically significant.

#
# Model for gender
model_gender <- svyglm(overallmims ~ RIAGENDR, design = svydesign, data = data)
summary(model_gender)

## 
## Call:
## svyglm(formula = overallmims ~ RIAGENDR, design = svydesign, 
##     data = data)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11823.4      267.1  44.264  < 2e-16 ***
## RIAGENDR       518.1      167.0   3.102  0.00779 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 16912654)
## 
## Number of Fisher Scoring iterations: 2

#The survey-weighted linear regression analysis suggests that there is a statistically significant positive association between overall MIMS and gender in the Nhanes dataset. This means that male individuals tend to have higher levels of physical activity on average compared to female individuals.

# Model for race/ethnicity
model_raceethnicity <- svyglm(overallmims ~ RIDRETH3, design = svydesign, data = data)
summary(model_raceethnicity)

## 
## Call:
## svyglm(formula = overallmims ~ RIDRETH3, design = svydesign, 
##     data = data)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13389.97     238.16  56.222  < 2e-16 ***
## RIDRETH3     -241.85      65.27  -3.705  0.00235 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 16905519)
## 
## Number of Fisher Scoring iterations: 2

#the survey-weighted linear regression analysis suggests that there is a statistically significant difference in overall MIMS across race/ethnicity groups, however we cant define the specifics with this model

install.packages("quantreg")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

library(quantreg)


# Quantile regression for socioeconomic status
qr_socioeconomic <- rq(overallmims ~ INDHHIN2, tau = 0.25:0.75, data = data)

# Extract the estimated coefficients for each quantile
qr_socioeconomic_coefs <- coef(qr_socioeconomic, type = "coef")

# Include interaction term between socioeconomic status and gender
model_interaction <- svyglm(overallmims ~ INDHHIN2 * RIAGENDR, design = svydesign, data = data)

#TRYING MIXED EFFECT MODEL 
install.packages("lme4")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

library(lme4)

# Fit a two-level MEM with random intercepts for households
lme_model <- lmer(overallmims ~ INDHHIN2 + RIAGENDR + (1 | SDMVPSU), data = data)


# Filter individuals with complete MIMS values
data_filtered <- data[!is.na(data$overallmims),]

# Create a scatter plot of MIMS by age
ggplot(data_filtered, aes(x = RIDAGEYR, y = overallmims)) + 
 geom_point() + 
 geom_smooth(method = "lm", se = FALSE) + 
 labs(title = "MIMS by Age", x = "Age Group", y = "Overall MIMS")

## `geom_smooth()` using formula = 'y ~ x'

# getting an error on knitting -- need to work on resolving this issue 
#hist(data$overallmims, main = "Distribution of Overall MIMS", xlab = "Overall MIMS", ylab = "Frequency")
# Create a histogram of the overall MIMS variable by age group

# Impute missing values in overallmims using mean imputation
mean_imputation <- mean(data$overallmims, na.rm = TRUE)
data$overallmims[is.na(data$overallmims)] <- mean_imputation

# Create a histogram of the overall MIMS variable by age group
# Impute missing values in overallmims using mean imputation
mean_imputation <- mean(data$overallmims, na.rm = TRUE)
data$overallmims[is.na(data$overallmims)] <- mean_imputation

# Create a histogram of the overall MIMS variable by age group
#hist(data$overallmims ~ data$RIDAGEYR, main = "Distribution of Overall MIMS by Age", xlab = "Overall MIMS", ylab = "Frequency")

#BPXSY1. -- now moving into analyzing from aspect of Hypertension
summary(data$BPXSY1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    66.0   110.0   120.0   123.5   134.0   228.0    4472

hist(data$BPXSY1,xlab = "Systolic Blood Presure")

boxplot(data$BPXSY1)

data$Hypertension <- ifelse(data$BPXSY1 > 140, 1, 0)
table(data$Hypertension )

## 
##     0     1 
## 13095  2627

prop.table( table(data$Hypertension ) )

## 
##         0         1 
## 0.8329093 0.1670907

summary(data$Hypertension)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.167   0.000   1.000    4472

#The correlation coefficient between MIMS and hypertension

data_filtered <- data[!is.na(data$overallmims) & !is.na(data$Hypertension),]
correlation_coefficient <- cor(data_filtered$overallmims, data_filtered$Hypertension)
print(correlation_coefficient)

## [1] -0.158024

#The correlation coefficient between MIMS and hypertension is -0.1592478. This indicates a weak negative correlation, meaning that as MIMS increases, hypertension tends to decrease slightly. However, the correlation coefficient is relatively small, suggesting that the relationship between MIMS and hypertension is not very strong.


HypertensionCorelation <- glm(data$Hypertension~ 1 + overallmims, family = binomial, data = data , na.action=na.omit)
summary(HypertensionCorelation)

## 
## Call:
## glm(formula = data$Hypertension ~ 1 + overallmims, family = binomial, 
##     data = data, na.action = na.omit)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.912e-02  7.868e-02  -0.751    0.452    
## overallmims -1.271e-04  6.480e-06 -19.610   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14189  on 15721  degrees of freedom
## Residual deviance: 13772  on 15720  degrees of freedom
##   (4472 observations deleted due to missingness)
## AIC: 13776
## 
## Number of Fisher Scoring iterations: 4

# Overall, the output suggests that there is a statistically significant negative correlation between overall MIMS and hypertension. This means that as overall MIMS increases, the estimated probability of hypertension decreases. 

# Fit the mixed effects model
overallmims_standardized <- (data$overallmims - mean(data$overallmims)) / sd(data$overallmims)


lme_model <- lmer(Hypertension ~ overallmims + (1 | SDMVPSU), data = data)

## Warning: Some predictor variables are on very different scales: consider
## rescaling

# Fit the logistic regression model
glm_model <- glm(Hypertension ~ overallmims, family = "binomial", data = data)


# Compare the models using AIC
aic_lme_model <- AIC(lme_model)
aic_glm_model <- AIC(glm_model)

# Compare the models using BIC
bic_lme_model <- BIC(lme_model)
bic_glm_model <- BIC(glm_model)

#CDFQ questionnaire 

# Descriptive statistics for physical activity (overallmims)
summary(data$overallmims)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3079   10614   13036   13036   14704   32929

# Descriptive statistics for CFQ_H variables
sapply(data[, c( "CFDCIR", "CFDCSR",  "CFDAST", "CFDDS")], summary)

##               CFDCIR      CFDCSR      CFDAST       CFDDS
## Min.    0.000000e+00     0.00000     3.00000     0.00000
## 1st Qu. 0.000000e+00     4.00000    12.00000    32.00000
## Median  0.000000e+00     6.00000    16.00000    43.00000
## Mean    3.412542e-01     5.81946    15.93932    43.48909
## 3rd Qu. 0.000000e+00     8.00000    19.00000    55.00000
## Max.    8.000000e+00    10.00000    39.00000   105.00000
## NA's    1.308200e+04 13082.00000 13141.00000 13547.00000

gender_groups <- split(data, data$RIAGENDR)

# Calculate summary statistics for each group



# Calculate the correlation coefficient between overallmims and each CFQ_H variable
cor(data$overallmims, data[, c("CFDCIT1", "CFDCIT2", "CFDCIT3", "CFDCIR", "CFDAST", "CFDDS")])

##      CFDCIT1 CFDCIT2 CFDCIT3 CFDCIR CFDAST CFDDS
## [1,]      NA      NA      NA     NA     NA    NA

# Histograms of overallmims and CFQ_H variables
hist(data$overallmims)

hist(data$CFDCIT1)

hist(data$CFDCIT2)

hist(data$CFDCIT3)

hist(data$CFDCIR)

hist(data$CFDAST)

hist(data$CFDDS)

# Boxplots of overallmims and CFQ_H variables by gender
boxplot(data$overallmims ~ data$RIAGENDR)

boxplot(data$CFDCIT1 ~ data$RIAGENDR)

boxplot(data$CFDCIT2 ~ data$RIAGENDR)

boxplot(data$CFDCIT3 ~ data$RIAGENDR)

boxplot(data$CFDCIR ~ data$RIAGENDR)

boxplot(data$CFDAST ~ data$RIAGENDR)

boxplot(data$CFDDS ~ data$RIAGENDR)

  install.packages("Hmisc")

## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)

library(Hmisc)

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:quantreg':
## 
##     latex

## The following object is masked from 'package:survey':
## 
##     deff

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

# Define Angina grades

data_filtered <- data[!is.na(data$overallmims) & !is.na(data$CDQHFADL),]
data_filtered$AnginaGrade1 <- with(data_filtered, {
 (CDQ001 == 1 & CDQ002 == 1 & CDQ003 != 1 & CDQ004 == 1 & CDQ005 == 1 & CDQ006 == 1) &
 ((CDQ009D == 4 | CDQ009E == 5) | (CDQ009F == 6 & CDQ009G == 7))
})

data_filtered$AnginaGrade2 <- with(data_filtered, {
 (CDQ001 == 1 & CDQ002 == 1 & CDQ003 == 1 & CDQ004 == 1 & CDQ005 == 1 & CDQ006 == 1) &
 ((CDQ009D == 4 | CDQ009E == 5) | (CDQ009F == 6 & CDQ009G == 7))
})
# zero observations, therefore wont utilize the dataset on my capstone.

# Filter data
filtered_data <- data %>%
 filter(SMQ040 %in% 1:2 & !is.na(SMQ040) & !is.na(SMD650) & !is.na(overallmims))

# Create survey design object
dsn <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = filtered_data, nest = TRUE)

# Analyze the association with overall physical activity
# 1. Do you now smoke cigarettes (SMQ040)

# Model 1: SMQ040 only
model1 <- svyglm(overallmims ~ SMQ040, design = dsn)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(glm.object): observations with zero weight not used for
## calculating dispersion

summary(model1)

## 
## Call:
## svyglm(formula = overallmims ~ SMQ040, design = dsn)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = filtered_data, nest = TRUE)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11970.4      881.9  13.573  1.9e-09 ***
## SMQ040         233.3      692.8   0.337    0.741    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 11593038)
## 
## Number of Fisher Scoring iterations: 2

#This suggests that there is insufficient evidence to conclude that smoking status has a direct impact on overall physical activity levels in this population.

# Model 2: SMQ040 + age (RIDAGEYR)
model2 <- svyglm(overallmims ~ SMQ040 + RIDAGEYR, design = dsn)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

summary(model2)

## 
## Call:
## svyglm(formula = overallmims ~ SMQ040 + RIDAGEYR, design = dsn)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = filtered_data, nest = TRUE)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16731.32    1281.13  13.060 7.52e-09 ***
## SMQ040        -85.79     571.16  -0.150 0.882905    
## RIDAGEYR      -89.40      16.60  -5.386 0.000124 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9884932)
## 
## Number of Fisher Scoring iterations: 2

#Adding age as a covariate to the model confirms that the association between smoking status and overall physical activity remains non-significant.


# 2. Avg # cigarettes/day during past 30 days (SMD650)

# Model 3: SMD650 only
model3 <- svyglm(overallmims ~ SMD650, design = dsn)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

summary(model3)

## 
## Call:
## svyglm(formula = overallmims ~ SMD650, design = dsn)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = filtered_data, nest = TRUE)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12354.367    191.791  64.416   <2e-16 ***
## SMD650         -8.983      6.497  -1.382    0.188    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 11566857)
## 
## Number of Fisher Scoring iterations: 2

# This demonstrates a statistically significant negative association between average cigarettes smoked per day and overall physical activity
# Model 4: SMD650 + age (RIDAGEYR)
model4 <- svyglm(overallmims ~ SMD650 + RIDAGEYR, design = dsn)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

summary(model4)

## 
## Call:
## svyglm(formula = overallmims ~ SMD650 + RIDAGEYR, design = dsn)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = filtered_data, nest = TRUE)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16714.510    785.319  21.284 1.73e-11 ***
## SMD650         -7.920      5.169  -1.532    0.149    
## RIDAGEYR      -89.033     16.135  -5.518 9.91e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9859960)
## 
## Number of Fisher Scoring iterations: 2

#this demostrates that adding age as a covariate to the model it strengthens the evidence for a negative association between average cigarettes per day and overall physical activity; this suggests that the effect becomes more pronounced with increasing age.



#deviding into two grops (less < 25 cigaretes per day & > 25 cigaretes per day )

filtered_data$group <- ifelse(filtered_data$SMD650 < 25, "<25 cigarettes", ">=25 cigarettes")

# Descriptive statistics by group
summary(overallmims ~ group, data = filtered_data)

## overallmims      N= 3107  
## 
## +-------+---------------+----+-----------+
## |       |               |   N|overallmims|
## +-------+---------------+----+-----------+
## |  group| <25 cigarettes|2908|   12370.43|
## |       |>=25 cigarettes| 199|   11761.05|
## +-------+---------------+----+-----------+
## |Overall|               |3107|   12331.40|
## +-------+---------------+----+-----------+

# T-test for mean difference in physical activity
t.test(overallmims ~ group, data = filtered_data)

## 
##     Welch Two Sample t-test
## 
## data:  overallmims by group
## t = 3.6392, df = 267.5, p-value = 0.0003282
## alternative hypothesis: true difference in means between group <25 cigarettes and group >=25 cigarettes is not equal to 0
## 95 percent confidence interval:
##  279.6901 939.0680
## sample estimates:
##  mean in group <25 cigarettes mean in group >=25 cigarettes 
##                      12370.43                      11761.05

# Correlation between average cigarettes and physical activity
cor(filtered_data$SMD650, filtered_data$overallmims)

## [1] -0.02101563

#The negative correlation coefficient provides some evidence that smoking more cigarettes is associated with lower physical activity levels, however this is a weak magnitude of the correlation

# Separate models for each group
model1 <- svyglm(overallmims ~ 1, design = dsn, data = filtered_data_group1)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

model2 <- svyglm(overallmims ~ 1, design = dsn, data = filtered_data_group2)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

# Compare model coefficients
summary(model1)$coef

##             Estimate Std. Error  t value    Pr(>|t|)
## (Intercept) 12240.32   207.2623 59.05714 3.50835e-19

summary(model2)$coef

##             Estimate Std. Error  t value    Pr(>|t|)
## (Intercept) 12240.32   207.2623 59.05714 3.50835e-19

# Filter data
# Identify relevant ICD-10 codes for cardiovascular diseases
# Define cardio_icd_codes range

data$ICDYes <- ifelse(grepl("^I10", data$RXDRSC1), TRUE, FALSE)

# Define the range of ICD codes
icd_range <- 10:99

# Create a new variable called 'ICDYes'
data$ICDYes2 <- sapply(data$RXDRSC1, function(x) {
  # Check if the code starts with "I" and falls within the range
  grepl("^I", x) && as.integer(substr(x, 2, 3)) %in% icd_range
})



# Analyze the association
# Descriptive statistics
summary(overallmims ~ ICDYes2, data = data)

## overallmims      N= 20194  
## 
## +-------+---+-----+-----------+
## |       |   |    N|overallmims|
## +-------+---+-----+-----------+
## |ICDYes2| No|17151|   13298.17|
## |       |Yes| 3043|   11559.06|
## +-------+---+-----+-----------+
## |Overall|   |20194|   13036.11|
## +-------+---+-----+-----------+

#T-test for mean difference in physical activity
t.test(overallmims ~ ICDYes2, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  overallmims by ICDYes2
## t = 26.564, df = 4708.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  1610.766 1867.461
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            13298.17            11559.06

#Based on the provided output, we can conclude that there is a statistically significant difference in physical activity levels between the two groups defined by the variable ICDYes2. The group FALSE has a significantly higher average overallmims compared to the group TRUE

dsn <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE)

# Model 1: Overall MIMS and CVD
model1 <- svyglm(overallmims ~ ICDYes2, design = dsn)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(glm.object): observations with zero weight not used for
## calculating dispersion

summary(model1)

## 
## Call:
## svyglm(formula = overallmims ~ ICDYes2, design = dsn)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12894.23      77.24  166.94  < 2e-16 ***
## ICDYes2TRUE -1311.71      90.39  -14.51 7.88e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 13011347)
## 
## Number of Fisher Scoring iterations: 2

#the analysis reveals a significant negative association between the presence of CVD-related ICD-10 codes and overall physical activity levels in the NHANES population. This suggests that individuals with CVD may be less physically active, highlighting the importance of promoting physical activity as a potential intervention for managing cardiovascular health


# Model 2: Adjust for covariates
model2 <- svyglm(overallmims ~ ICDYes2 + RIDAGEYR + RIAGENDR, design = dsn)

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion

summary(model2)

## 
## Call:
## svyglm(formula = overallmims ~ ICDYes2 + RIDAGEYR + RIAGENDR, 
##     design = dsn)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15596.7521   160.4352  97.215  < 2e-16 ***
## ICDYes2TRUE     0.4203    98.5026   0.004 0.996666    
## RIDAGEYR      -77.7184     3.0665 -25.345 8.65e-12 ***
## RIAGENDR      592.3396   116.9706   5.064 0.000278 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 10270940)
## 
## Number of Fisher Scoring iterations: 2

#the adjusted model provides a more nuanced understanding of the relationship between physical activity and CVD. While the direct association between CVD and physical activity is no longer statistically significant after accounting for age and gender, these factors independently influence overall physical activity levels.
# Boxplot for visualizing the distribution

#Predict the presence of CVD based on MIMS
model3 <- svyglm(ICDYes2 ~ overallmims, family = binomial, design = dsn)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(model3)

## 
## Call:
## svyglm(formula = ICDYes2 ~ overallmims, design = dsn, family = binomial)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.839e-01  1.139e-01  -3.371  0.00457 ** 
## overallmims -1.082e-04  9.107e-06 -11.882 1.06e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9960803)
## 
## Number of Fisher Scoring iterations: 4

#the logistic regression model suggests that higher levels of overall physical activity are associated with a slightly lower risk of having CVD-related ICD-10 codes. However, further investigations are needed to fully understand the complex relationship between physical activity and cardiovascular health.

#  Account for household-level variation
model4 <- lmer(overallmims ~ ICDYes2 + (1 | SDMVPSU), data = data)
summary(model4)

## Linear mixed model fit by REML ['lmerMod']
## Formula: overallmims ~ ICDYes2 + (1 | SDMVPSU)
##    Data: data
## 
## REML criterion at convergence: 389699.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6919 -0.6131 -0.0716  0.4536  5.2609 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  SDMVPSU  (Intercept)    25929  161    
##  Residual             14087610 3753    
## Number of obs: 20194, groups:  SDMVPSU, 2
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 13293.81     117.42  113.22
## ICDYes2TRUE -1736.90      73.83  -23.52
## 
## Correlation of Fixed Effects:
##             (Intr)
## ICDYes2TRUE -0.095

# the mixed effects model provides valuable insights into the association between physical activity and CVD, taking into account the nested structure of the NHANES data. The analysis confirms a significant negative association between CVD and physical activity, suggesting that promoting physical activity may play a role in improving cardiovascular health.