R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

install.packages("dplyr")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
install.packages("haven")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
install.packages("survey")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
install.packages("quantreg")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
install.packages("lme4")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
install.packages("Matrix")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
# Load packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(quantreg)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Attaching package: 'quantreg'
## The following object is masked from 'package:survival':
## 
##     untangle.specials
library(lme4)
library(ggplot2)
library(Matrix)

setwd("~/Nhanes")
# Set working directory

# Read data
#demo <- haven::read_xpt("DEMO_H.XPT") %>% select("SEQN", "RIDAGEYR", "RIAGENDR", "WTMEC2YR", "SDMVPSU", "SDMVSTRA")
# Load data from XPT files
demo <- haven::read_xpt("DEMO_H.XPT")
bpx <- haven::read_xpt("BPX_H.XPT")
paday <- haven::read_xpt("PAXDAY_H.XPT")

# Filter and transform paday data
paday <- paday %>%
 select(SEQN, PAXWWMD, PAXMTSD) %>%
 filter(PAXWWMD >= 600) %>%
 group_by(SEQN) %>%
 summarize(overallmims = mean(as.numeric(PAXMTSD)))
# Merge data
data <- merge(demo, paday, by = "SEQN", all = TRUE)
data <- merge(data, bpx, by = "SEQN", all = TRUE)
# Create survey design
dsn <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE)
# Perform summary statistics
svyby(~overallmims, ~RIAGENDR, design = dsn, na.rm = T, vartype = "se", svymean)
##   RIAGENDR overallmims       se
## 1        1    13942.25 98.92113
## 2        2    14331.20 84.50700
# Perform regression analysis
svyglm(as.numeric(overallmims) ~ as.numeric(RIDAGEYR), design = dsn)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = as.numeric(overallmims) ~ as.numeric(RIDAGEYR), 
##     design = dsn)
## 
## Coefficients:
##          (Intercept)  as.numeric(RIDAGEYR)  
##             17757.93                -91.72  
## 
## Degrees of Freedom: 7400 Total (i.e. Null);  14 Residual
##   (2774 observations deleted due to missingness)
## Null Deviance:       1.344e+11 
## Residual Deviance: 1.022e+11     AIC: 144100
# Examine the distribution of activity level (overallmims)
hist(data$overallmims)

# Examine the distribution of activity level by - Annual household income (INDHHIN2)

income <- !is.na(data$overallmims) & !is.na(data$INDHHIN2) & data$INDHHIN2 != 77 & data$INDHHIN2 != 99 & data$INDHHIN2 != "."

# Filter the data based on the keep vector
data_filtered <- data[income,]

# Create the boxplot
boxplot(data_filtered$overallmims ~ data_filtered$INDHHIN2)

# Examine the distribution of activity level by gender (RIAGENDR)
boxplot(data$overallmims ~ data$RIAGENDR)

# the boxplot shows that the median overallmims is higher for males than females. This means that, on average, males are more physically active than females.

# There are a number of possible explanations for this gender difference in physical activity levels.


# Examine the distribution of activity level by race/ethnicity (RIDRETH3)
boxplot(data$overallmims ~ data$RIDRETH3)

# Test for the association between activity level and socioeconomic status using linear regression
svyglm(overallmims ~INDHHIN2, data = data, design = svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE))
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = overallmims ~ INDHHIN2, design = svydesign(id = ~SDMVPSU, 
##     strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE), 
##     data = data)
## 
## Coefficients:
## (Intercept)     INDHHIN2  
##   14130.150        1.597  
## 
## Degrees of Freedom: 7318 Total (i.e. Null);  14 Residual
##   (2856 observations deleted due to missingness)
## Null Deviance:       1.331e+11 
## Residual Deviance: 1.331e+11     AIC: 144500
#Overall, the findings of the survey-weighted linear regression analysis suggest that there is a positive association between overall MIMS and socioeconomic status in the Nhanes dataset. This means that individuals with higher socioeconomic status tend to have higher levels of physical activity on average.
# Test for the association between activity level and gender using linear regression
svyglm(overallmims ~ RIAGENDR, data = data, design = svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE))
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = overallmims ~ RIAGENDR, design = svydesign(id = ~SDMVPSU, 
##     strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE), 
##     data = data)
## 
## Coefficients:
## (Intercept)     RIAGENDR  
##       13553          389  
## 
## Degrees of Freedom: 7400 Total (i.e. Null);  14 Residual
##   (2774 observations deleted due to missingness)
## Null Deviance:       1.344e+11 
## Residual Deviance: 1.341e+11     AIC: 146100
#The survey-weighted linear regression analysis suggests that there is a gender gap in physical activity levels, with male individuals tending to have higher overall MIMS compared to female individuals. 
# Test for the association between activity level and race/ethnicity using ANOVA
svyglm(overallmims ~ RIDRETH3, data = data, design = svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE))
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (30) clusters.
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Call:  svyglm(formula = overallmims ~ RIDRETH3, design = svydesign(id = ~SDMVPSU, 
##     strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE), 
##     data = data)
## 
## Coefficients:
## (Intercept)     RIDRETH3  
##     15061.7       -293.1  
## 
## Degrees of Freedom: 7400 Total (i.e. Null);  14 Residual
##   (2774 observations deleted due to missingness)
## Null Deviance:       1.344e+11 
## Residual Deviance: 1.333e+11     AIC: 146000
#survey weighted analysis 
#
#

# Create the survey design object
svydesign <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, data = data, nest = TRUE)

# Model for socioeconomic status
model_socioeconomic <- svyglm(overallmims ~ INDHHIN2, design = svydesign, data = data)
summary(model_socioeconomic)
## 
## Call:
## svyglm(formula = overallmims ~ INDHHIN2, design = svydesign, 
##     data = data)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14130.150     89.501 157.878   <2e-16 ***
## INDHHIN2        1.597      5.742   0.278    0.785    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18183772)
## 
## Number of Fisher Scoring iterations: 2
# the survey-weighted linear regression analysis suggests that there is a positive association between overall MIMS and socioeconomic status in the Nhanes dataset. This means that individuals with higher socioeconomic status tend to have higher levels of physical activity on average. The p-value for the INDHHIN2 coefficient is less than 0.001, indicating that the association is statistically significant.

#
# Model for gender
model_gender <- svyglm(overallmims ~ RIAGENDR, design = svydesign, data = data)
summary(model_gender)
## 
## Call:
## svyglm(formula = overallmims ~ RIAGENDR, design = svydesign, 
##     data = data)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13553.3      184.8  73.327  < 2e-16 ***
## RIAGENDR       389.0      104.2   3.731  0.00224 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18122325)
## 
## Number of Fisher Scoring iterations: 2
#The survey-weighted linear regression analysis suggests that there is a statistically significant positive association between overall MIMS and gender in the Nhanes dataset. This means that male individuals tend to have higher levels of physical activity on average compared to female individuals.

# Model for race/ethnicity
model_raceethnicity <- svyglm(overallmims ~ RIDRETH3, design = svydesign, data = data)
summary(model_raceethnicity)
## 
## Call:
## svyglm(formula = overallmims ~ RIDRETH3, design = svydesign, 
##     data = data)
## 
## Survey design:
## svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA, weights = ~WTMEC2YR, 
##     data = data, nest = TRUE)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15061.74     154.01  97.797  < 2e-16 ***
## RIDRETH3     -293.12      41.91  -6.994 6.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18011403)
## 
## Number of Fisher Scoring iterations: 2
#the survey-weighted linear regression analysis suggests that there is a statistically significant difference in overall MIMS across race/ethnicity groups, however we cant define the specifics with this model 
install.packages("quantreg")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
library(quantreg)


# Quantile regression for socioeconomic status
qr_socioeconomic <- rq(overallmims ~ INDHHIN2, tau = 0.25:0.75, data = data)

# Extract the estimated coefficients for each quantile
qr_socioeconomic_coefs <- coef(qr_socioeconomic, type = "coef")

# Include interaction term between socioeconomic status and gender
model_interaction <- svyglm(overallmims ~ INDHHIN2 * RIAGENDR, design = svydesign, data = data)
#TRYING MIXED EFFECT MODEL 
install.packages("lme4")
## Installing package into '/home/rstudio/R/x86_64-pc-linux-gnu-library/4.3-3.17'
## (as 'lib' is unspecified)
library(lme4)

# Fit a two-level MEM with random intercepts for households
lme_model <- lmer(overallmims ~ INDHHIN2 + RIAGENDR + (1 | SDMVPSU), data = data)


# Filter individuals with complete MIMS values
data_filtered <- data[!is.na(data$overallmims),]

# Create a scatter plot of MIMS by age
ggplot(data_filtered, aes(x = RIDAGEYR, y = overallmims)) + 
 geom_point() + 
 geom_smooth(method = "lm", se = FALSE) + 
 labs(title = "MIMS by Age", x = "Age Group", y = "Overall MIMS")
## `geom_smooth()` using formula = 'y ~ x'

# getting an error on knitting -- need to work on resolving this issue 
#hist(data$overallmims, main = "Distribution of Overall MIMS", xlab = "Overall MIMS", ylab = "Frequency")
# Create a histogram of the overall MIMS variable by age group

# Impute missing values in overallmims using mean imputation
mean_imputation <- mean(data$overallmims, na.rm = TRUE)
data$overallmims[is.na(data$overallmims)] <- mean_imputation

# Create a histogram of the overall MIMS variable by age group
# Impute missing values in overallmims using mean imputation
mean_imputation <- mean(data$overallmims, na.rm = TRUE)
data$overallmims[is.na(data$overallmims)] <- mean_imputation

# Create a histogram of the overall MIMS variable by age group
#hist(data$overallmims ~ data$RIDAGEYR, main = "Distribution of Overall MIMS by Age", xlab = "Overall MIMS", ylab = "Frequency")
#BPXSY1. -- now moving into analyzing from aspect of Hypertension
summary(data$BPXSY1) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    66.0   106.0   116.0   118.1   128.0   228.0    3003
hist(data$BPXSY1,xlab = "Systolic Blood Presure")

boxplot(data$BPXSY1)

data$Hypertension <- ifelse(data$BPXSY1 > 140, 1, 0)
table(data$Hypertension )
## 
##    0    1 
## 6429  743
prop.table( table(data$Hypertension ) )
## 
##         0         1 
## 0.8964027 0.1035973
summary(data$Hypertension)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.1036  0.0000  1.0000    3003
#The correlation coefficient between MIMS and hypertension

data_filtered <- data[!is.na(data$overallmims) & !is.na(data$Hypertension),]
correlation_coefficient <- cor(data_filtered$overallmims, data_filtered$Hypertension)
print(correlation_coefficient)
## [1] -0.1447194
#The correlation coefficient between MIMS and hypertension is -0.1592478. This indicates a weak negative correlation, meaning that as MIMS increases, hypertension tends to decrease slightly. However, the correlation coefficient is relatively small, suggesting that the relationship between MIMS and hypertension is not very strong.


HypertensionCorelation <- glm(data$Hypertension~ 1 + overallmims, family = binomial, data = data , na.action=na.omit)
summary(HypertensionCorelation)
## 
## Call:
## glm(formula = data$Hypertension ~ 1 + overallmims, family = binomial, 
##     data = data, na.action = na.omit)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.734e-01  1.525e-01  -1.793    0.073 .  
## overallmims -1.394e-04  1.145e-05 -12.168   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4775.3  on 7171  degrees of freedom
## Residual deviance: 4618.6  on 7170  degrees of freedom
##   (3003 observations deleted due to missingness)
## AIC: 4622.6
## 
## Number of Fisher Scoring iterations: 5
# Overall, the output suggests that there is a statistically significant negative correlation between overall MIMS and hypertension. This means that as overall MIMS increases, the estimated probability of hypertension decreases. 

# Fit the mixed effects model
overallmims_standardized <- (data$overallmims - mean(data$overallmims)) / sd(data$overallmims)


lme_model <- lmer(Hypertension ~ overallmims + (1 | SDMVPSU), data = data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Fit the logistic regression model
glm_model <- glm(Hypertension ~ overallmims, family = "binomial", data = data)


# Compare the models using AIC
aic_lme_model <- AIC(lme_model)
aic_glm_model <- AIC(glm_model)

# Compare the models using BIC
bic_lme_model <- BIC(lme_model)
bic_glm_model <- BIC(glm_model)