nhanes_data = read.csv("nhanesi_class_dataset.csv", header=TRUE)
library(DescTools)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
library(OSTools)
library(ggplot2)
nhanes_data = read.csv("nhanesi_class_dataset.csv", header=TRUE)
datatemp = nhanes_data
datatemp$treatment = as.numeric(datatemp$physically.inactive)
datatemp$outcome = datatemp$years.lived.since.1971.up.to.1992
datatemp$working.during.last.three.months <- as.numeric(datatemp$working.last.three.months)
datatemp$marital.status <- as.numeric(datatemp$married)
ipw.est <- function(data, propscore.formula){
psmodel <- glm(propscore.formula, family=binomial, data=data)
propscore <- predict(psmodel, type="response")
treatment <- data$treatment
outcome <- data$outcome
weight <- (1/propscore)*treatment + (1/(1-propscore))*(1-treatment)
weight.norm <- weight
weight.norm[treatment==1] <- weight[treatment==1] / sum(weight[treatment==1])
weight.norm[treatment==0] <- weight[treatment==0] / sum(weight[treatment==0]*(1-treatment))
effect.norm <- sum(weight.norm*outcome*treatment) -
sum(weight.norm*outcome*(1-treatment))
return(effect.norm)
}
datatemp_eda <- datatemp
datatemp_eda$Activity_Status <- factor(datatemp_eda$treatment,
levels = c(0, 1),
labels = c("Active (Control)", "Inactive (Treated)"))
datatemp_eda$Smoking <- datatemp_eda$smoking.status
plot_a <- ggplot(datatemp_eda, aes(x = Activity_Status, fill = Smoking)) +
geom_bar(position = "fill") +
labs(title = "Smoking Status Distribution by Physical Activity Level",
x = "Physical Activity Status",
y = "Proportion",
fill = "Smoking Status") +
scale_y_continuous(labels = scales::percent) +
theme_minimal()
print(plot_a)
This chart shows that while the proportion of individuals who have never
smoked is nearly identical between the two groups, physically inactive
individuals have a slightly higher proportion of current smokers and a
lower proportion of former smokers compared to those who are physically
active.
datatemp$income.poverty.ratio.mis <- is.na(datatemp$income.poverty.ratio)
datatemp$dietary.adequacy.mis <- is.na(datatemp$dietary.adequacy)
mean_income <- mean(datatemp$income.poverty.ratio, na.rm=TRUE)
mean_dietary <- mean(datatemp$dietary.adequacy, na.rm=TRUE)
datatemp$income.poverty.ratio[is.na(datatemp$income.poverty.ratio)] <- mean_income
datatemp$dietary.adequacy[is.na(datatemp$dietary.adequacy)] <- mean_dietary
categorical_vars <- c("sex", "smoking.status", "race", "education", "alcohol.consumption")
for (v in categorical_vars) {
datatemp[[v]] <- factor(datatemp[[v]])
}
datatemp$working.during.last.three.months <- factor(datatemp$working.during.last.three.months)
datatemp$marital.status <- factor(datatemp$marital.status)
datatemp$income.poverty.ratio.mis <- factor(datatemp$income.poverty.ratio.mis)
datatemp$dietary.adequacy.mis <- factor(datatemp$dietary.adequacy.mis)
covariates <- formula(treatment ~ sex + smoking.status + income.poverty.ratio + age.at.interview +
race + education + working.during.last.three.months + marital.status +
alcohol.consumption + dietary.adequacy +
income.poverty.ratio.mis + dietary.adequacy.mis)
psmodel <- glm(covariates, family=binomial, data=datatemp, x=TRUE, y=TRUE)
datatemp$logit.ps <- predict(psmodel)
res <- overlap(datatemp$logit.ps, datatemp$treatment)
which.remove <- res$index.lack.overlap
no_excluded <- res$no.treated.lack.overlap + res$no.control.lack.overlap
print(paste("Number of treated units lacking overlap:", res$no.treated.lack.overlap))
## [1] "Number of treated units lacking overlap: 1"
print(paste("Number of control units lacking overlap:", res$no.control.lack.overlap))
## [1] "Number of control units lacking overlap: 0"
print(paste("Total units excluded:", no_excluded))
## [1] "Total units excluded: 1"
dmy.original <- dummyVars(covariates, data=datatemp)
Xmat.original <- data.frame(predict(dmy.original, newdata=datatemp))
if (no_excluded > 0) {
datatemp_overlap <- datatemp[-which.remove,]
Xmat <- Xmat.original[-which.remove,]
datatemp.full <- rbind(datatemp_overlap, datatemp[which.remove,])
Xmat.full <- rbind(Xmat, Xmat.original[which.remove,])
} else {
datatemp_overlap <- datatemp
Xmat <- Xmat.original
datatemp.full <- datatemp
Xmat.full <- Xmat.original
}
rownames(datatemp_overlap) <- 1:nrow(datatemp_overlap)
rownames(Xmat) <- 1:nrow(Xmat)
btb_smd_before <- balance.check(Xmat.full, Xmat.full,
datatemp.full$treatment, datatemp.full$treatment)
smd_before <- abs(btb_smd_before[, "Before SMD"])
imbalanced_vars <- names(smd_before[smd_before > 0.1])
print("Confounders with Absolute Standardized Differences > 0.1 (Before Adjustment):")
## [1] "Confounders with Absolute Standardized Differences > 0.1 (Before Adjustment):"
print(imbalanced_vars)
## [1] "sex.Female"
## [2] "sex.Male"
## [3] "smoking.status.Current"
## [4] "smoking.status.Former"
## [5] "income.poverty.ratio"
## [6] "age.at.interview"
## [7] "race.Nonwhite"
## [8] "race.White"
## [9] "education.0.8"
## [10] "education.12"
## [11] "education.Some.College"
## [12] "working.during.last.three.months.0"
## [13] "working.during.last.three.months.1"
## [14] "marital.status.0"
## [15] "marital.status.1"
## [16] "alcohol.consumption.1.4.times.per.month"
## [17] "alcohol.consumption.Never"
## [18] "dietary.adequacy"
propscore_overlap <- predict(psmodel, newdata = datatemp_overlap, type = "response")
weight <- ifelse(datatemp_overlap$treatment == 1,
1 / propscore_overlap,
1 / (1 - propscore_overlap))
lm_ipw <- lm(outcome ~ treatment, data = datatemp_overlap, weights = weight)
summary(lm_ipw)
##
## Call:
## lm(formula = outcome ~ treatment, data = datatemp_overlap, weights = weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -39.881 -4.181 3.237 3.549 21.339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.9036 0.1757 101.927 < 2e-16 ***
## treatment -1.6147 0.2465 -6.551 7.31e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.748 on 1948 degrees of freedom
## Multiple R-squared: 0.02155, Adjusted R-squared: 0.02105
## F-statistic: 42.91 on 1 and 1948 DF, p-value: 7.31e-11
ate_estimate_ipw <- coef(lm_ipw)["treatment"]
conf_interval_ipw <- confint(lm_ipw)["treatment", ]
print(paste("IPW Point Estimate (from WLS):", round(ate_estimate_ipw, 4)))
## [1] "IPW Point Estimate (from WLS): -1.6147"
print("95% Confidence Interval for IPW Estimate:")
## [1] "95% Confidence Interval for IPW Estimate:"
print(conf_interval_ipw)
## 2.5 % 97.5 %
## -2.098185 -1.131313
propscore <- predict(psmodel, type="response", newdata=datatemp_overlap)
datatemp_overlap$propscore <- propscore
no.subclasses <- 5
cutoffs <- quantile(propscore, seq(1/no.subclasses, (no.subclasses-1)/no.subclasses, 1/no.subclasses))
subclass <- rep(1, length(propscore))
for (i in 1:(no.subclasses-1)){
subclass <- subclass + 1*(propscore >= cutoffs[i])
}
datatemp_overlap$subclass <- subclass
psmodel.overlap <- glm(covariates, family=binomial, data=datatemp_overlap, x=TRUE)
nvar <- ncol(psmodel.overlap$x) - 1
smd.before.subclass <- numeric(nvar)
smd.after.subclass <- numeric(nvar)
for (i in 2:(nvar+1)){
xi <- psmodel.overlap$x[, i]
poolsd <- sqrt((var(xi[datatemp_overlap$treatment==1]) + var(xi[datatemp_overlap$treatment==0])) / 2)
smd.before.subclass[i-1] <- (mean(xi[datatemp_overlap$treatment==1]) - mean(xi[datatemp_overlap$treatment==0])) / poolsd
tempdf <- matrix(nrow=no.subclasses, ncol=2)
for (s in 1:(no.subclasses)){
tempdf[s, 1] <- mean(xi[datatemp_overlap$treatment==1 & datatemp_overlap$subclass==s], na.rm=TRUE)
tempdf[s, 2] <- mean(xi[datatemp_overlap$treatment==0 & datatemp_overlap$subclass==s], na.rm=TRUE)
}
smd.after.subclass[i-1] <- sum((tempdf[, 1] - tempdf[, 2]) * table(subclass) / nrow(datatemp_overlap), na.rm=TRUE) / poolsd
}
love.plot(abs(smd.before.subclass), abs(smd.after.subclass), colnames(psmodel.overlap$x)[2:(nvar+1)])
Based on the Love plot, the adjustment is acceptable.
max_smd_after <- max(abs(smd.after.subclass), na.rm=TRUE)
print(paste("Maximum Adjusted Absolute SMD:", round(max_smd_after, 4)))
## [1] "Maximum Adjusted Absolute SMD: 0.0665"
if (max_smd_after < 0.2) {
print("Balance after subclassification is acceptable.")
} else {
print("Balance after subclassification is NOT acceptable.")
}
## [1] "Balance after subclassification is acceptable."
subclass.treatment.effect <- rep(0, no.subclasses)
subclass.treatment.effect.se <- rep(0, no.subclasses)
subclass.no.subjects <- rep(0, no.subclasses)
for (i in 1:no.subclasses){
tempmodel <- lm(outcome ~ treatment, subset=(subclass==i), data=datatemp_overlap)
subclass.treatment.effect[i] <- coef(tempmodel)[2]
subclass.treatment.effect.se[i] <- sqrt(vcov(tempmodel)[2, 2])
subclass.no.subjects[i] <- sum(datatemp_overlap$subclass == i, na.rm=TRUE)
}
subclass.weight <- subclass.no.subjects / sum(subclass.no.subjects)
avg.treat.effect.est <- sum(subclass.treatment.effect * subclass.weight, na.rm=TRUE)
avg.treat.effect.se <- sqrt(sum(subclass.weight^2 * subclass.treatment.effect.se^2, na.rm=TRUE))
avg.treat.effect.lci <- avg.treat.effect.est - 1.96 * avg.treat.effect.se
avg.treat.effect.uci <- avg.treat.effect.est + 1.96 * avg.treat.effect.se
print(paste("Subclassification ATE Estimate:", round(avg.treat.effect.est, 4)))
## [1] "Subclassification ATE Estimate: -1.7295"
print(paste0("95% CI: (", round(avg.treat.effect.lci, 4), ", ", round(avg.treat.effect.uci, 4), ")"))
## [1] "95% CI: (-2.3118, -1.1472)"
age_var <- "age.at.interview"
sex_vars <- grep("^sex", colnames(Xmat), value = TRUE)
smoking_vars <- grep("^smoking.status", colnames(Xmat), value = TRUE)
Xmatmahal_vars <- c(age_var, sex_vars, smoking_vars)
Xmatmahal <- Xmat[, Xmatmahal_vars]
distmat <- maha.dense(datatemp_overlap$treatment, Xmatmahal, matrix=TRUE)
datatemp_overlap$logit.ps <- predict(psmodel, newdata=datatemp_overlap)
distmat <- add.caliper(distmat, datatemp_overlap$treatment, datatemp_overlap$logit.ps,
rg=.5, stdev=TRUE, penalty=1000, constant=FALSE)
rownames(distmat) <- rownames(datatemp_overlap)[as.numeric(rownames(distmat))]
colnames(distmat) <- rownames(datatemp_overlap)[as.numeric(colnames(distmat))]
result1 <- match.os(datatemp_overlap$treatment, distmat, datatemp_overlap, ncontrol=1)
matcheddata1 <- result1$data
Xmat.matched1 <- Xmat[as.numeric(rownames(matcheddata1)), ]
btb_smd1 <- balance.check(Xmat.full, Xmat.matched1,
datatemp.full$treatment, matcheddata1$treatment)
print("Standardized Differences After Pair Matching (n=1):")
## [1] "Standardized Differences After Pair Matching (n=1):"
print(round(btb_smd1[, c("After SMD", "Before SMD")], 3))
## After SMD Before SMD
## sex.Female -0.004 0.244
## sex.Male 0.004 -0.244
## smoking.status.Current 0.009 0.141
## smoking.status.Former -0.005 -0.142
## smoking.status.Never -0.004 -0.018
## income.poverty.ratio -0.122 -0.420
## age.at.interview -0.002 0.282
## race.Nonwhite 0.093 0.251
## race.White -0.093 -0.251
## education.0.8 0.127 0.310
## education.12 -0.069 -0.194
## education.College.Grad -0.055 0.037
## education.Missing -0.054 0.004
## education.Ninth.11th.grade 0.021 -0.098
## education.Some.College -0.083 -0.158
## working.during.last.three.months.0 0.046 0.588
## working.during.last.three.months.1 -0.046 -0.588
## marital.status.0 0.071 0.350
## marital.status.1 -0.071 -0.350
## alcohol.consumption..1.time.per.month -0.054 0.015
## alcohol.consumption.1.4.times.per.month 0.000 -0.125
## alcohol.consumption.2..times.per.week 0.036 -0.069
## alcohol.consumption.Just.about.everyday.everyday -0.054 -0.073
## alcohol.consumption.Never 0.062 0.189
## dietary.adequacy -0.100 -0.305
## income.poverty.ratio.mis.FALSE 0.205 0.013
## income.poverty.ratio.mis.TRUE -0.205 -0.013
## dietary.adequacy.mis.FALSE 0.245 0.068
## dietary.adequacy.mis.TRUE -0.245 -0.068
# Love Plot [cite: 275]
love.plot(abs(btb_smd1[, "Before SMD"]), abs(btb_smd1[, "After SMD"]), rownames(btb_smd1))
## (h) Matching with Multiple Controls
# n control=2
result2 <- match.os(datatemp_overlap$treatment, distmat, datatemp_overlap, ncontrol=2)
matcheddata2 <- result2$data
Xmat.matched2 <- Xmat[as.numeric(rownames(matcheddata2)), ]
btb_smd2 <- balance.check(Xmat.full, Xmat.matched2,
datatemp.full$treatment, matcheddata2$treatment)
smd_after2 <- abs(btb_smd2[, "After SMD"])
print("Maximum Absolute SMD for n=2 controls:")
## [1] "Maximum Absolute SMD for n=2 controls:"
print(round(max(smd_after2), 4))
## [1] 0.2124
Looks like n=2 is not the most balanced option with a lot of SMD > 0.2.
# n control=3
result3 <- match.os(datatemp_overlap$treatment, distmat, datatemp_overlap, ncontrol=3)
matcheddata3 <- result3$data
Xmat.matched3 <- Xmat[as.numeric(rownames(matcheddata3)), ]
btb_smd3 <- balance.check(Xmat.full, Xmat.matched3,
datatemp.full$treatment, matcheddata3$treatment)
smd_after3 <- abs(btb_smd3[, "After SMD"])
print("Maximum Absolute SMD for n=3 controls:")
## [1] "Maximum Absolute SMD for n=3 controls:"
print(round(max(smd_after3), 4))
## [1] 0.5343
n=3 still does not provide a good balance with many SMD > 0.2. ## (i) Full Matching
result_full <- match.os.full(datatemp_overlap$treatment, distmat, datatemp_overlap)
matcheddata_full <- result_full$data
print(paste("Full Match Effective Sample Size:", round(result_full$effsize, 2)))
## [1] "Full Match Effective Sample Size: 572.28"
missing.mat <- matrix(0, nrow=nrow(Xmat.full), ncol=ncol(Xmat.full))
colnames(missing.mat) <- colnames(Xmat.full)
missing.mat[, "income.poverty.ratio"] <- as.numeric(datatemp.full$income.poverty.ratio.mis == "TRUE")
missing.mat[, "dietary.adequacy"] <- as.numeric(datatemp.full$dietary.adequacy.mis == "TRUE")
Xmat.without.missing.full <- Xmat.full
Xmat.without.missing.full[missing.mat[, "income.poverty.ratio"] == 1, "income.poverty.ratio"] <- NA
Xmat.without.missing.full[missing.mat[, "dietary.adequacy"] == 1, "dietary.adequacy"] <- NA
Xmat.matched_full <- Xmat.full[as.numeric(rownames(matcheddata_full)), ]
missing.matched <- missing.mat[as.numeric(rownames(matcheddata_full)), ]
Xmat.without.missing.matched <- Xmat.matched_full
Xmat.without.missing.matched[missing.matched[, "income.poverty.ratio"] == 1, "income.poverty.ratio"] <- NA
Xmat.without.missing.matched[missing.matched[, "dietary.adequacy"] == 1, "dietary.adequacy"] <- NA
smd.full <- balance.check.full(
Xmat.without.missing.full,
Xmat.without.missing.matched,
datatemp.full$treatment,
matcheddata_full$treatment,
matcheddata_full$mset,
missing.mat,
missing.matched,
'ate'
)
print("Standardized Differences After Full Matching:")
## [1] "Standardized Differences After Full Matching:"
print(round(smd.full, 3))
## smd.before smd.after
## sex.Female 0.244 0.004
## sex.Male -0.244 -0.004
## smoking.status.Current 0.141 -0.001
## smoking.status.Former -0.142 -0.002
## smoking.status.Never -0.018 0.002
## income.poverty.ratio -0.499 -0.049
## age.at.interview 0.282 -0.004
## race.Nonwhite 0.251 0.018
## race.White -0.251 -0.018
## education.0.8 0.310 0.036
## education.12 -0.194 -0.019
## education.College.Grad 0.037 0.011
## education.Missing 0.004 -0.012
## education.Ninth.11th.grade -0.098 -0.008
## education.Some.College -0.158 -0.030
## working.during.last.three.months.0 0.588 0.070
## working.during.last.three.months.1 -0.588 -0.070
## marital.status.0 0.350 0.020
## marital.status.1 -0.350 -0.020
## alcohol.consumption..1.time.per.month 0.015 -0.012
## alcohol.consumption.1.4.times.per.month -0.125 -0.047
## alcohol.consumption.2..times.per.week -0.069 0.002
## alcohol.consumption.Just.about.everyday.everyday -0.073 -0.019
## alcohol.consumption.Never 0.189 0.062
## dietary.adequacy -0.348 -0.041
## income.poverty.ratio.mis.FALSE 0.013 0.032
## income.poverty.ratio.mis.TRUE -0.013 -0.032
## dietary.adequacy.mis.FALSE 0.068 0.032
## dietary.adequacy.mis.TRUE -0.068 -0.032
max_smd_full <- max(abs(smd.full[, "smd.after"]))
print(paste("Maximum Absolute SMD after full matching:", round(max_smd_full, 4)))
## [1] "Maximum Absolute SMD after full matching: 0.0701"
if (max_smd_full < 0.2) {
print("Balance for full matching is ACCEPTABLE.")
} else {
print("Balance for full matching is NOT acceptable.")
}
## [1] "Balance for full matching is ACCEPTABLE."
Much more balanced with SMD < 0.2 for all covariates. ## (j) Best Matching Justification Based on the results of questions (g), (h), and (i), the SMDs from full matching are all ≤ 0.2, and it also makes use of as many data points as possible. Therefore, it is the best choice, as it minimizes bias caused by observable confounding to the greatest extent.
treated.y1 <- matcheddata1$outcome[matcheddata1$treatment==1]
control.y1 <- matcheddata1$outcome[matcheddata1$treatment==0]
y1 <- cbind(treated.y1, control.y1)
ci1 <- sens.ci(y1, alternative='two-sided')
print("Pair Matching (n=1) Estimate (ATT):")
## [1] "Pair Matching (n=1) Estimate (ATT):"
print(paste0("Point Estimate: ", round(ci1$PointEstimate[1], 4)))
## [1] "Point Estimate: -2.4615"
print(paste0("95% CI: (", round(ci1$Confidence.Interval[1], 4), ", ", round(ci1$Confidence.Interval[2], 4), ")"))
## [1] "95% CI: (-3.1544, -1.8105)"
ci_full <- sens.ci.full(matcheddata_full$outcome, matcheddata_full$treatment, matcheddata_full$mset)
print("Chosen Match (Full Matching) Estimate (ATE):")
## [1] "Chosen Match (Full Matching) Estimate (ATE):"
print(paste0("Point Estimate: ", round(ci_full$PointEstimates[1], 4)))
## [1] "Point Estimate: -1.6456"
print(paste0("95% CI: (", round(ci_full$ConfidenceInterval[1], 4), ", ", round(ci_full$ConfidenceInterval[2], 4), ")"))
## [1] "95% CI: (-2.4584, -1.2105)"
Variables that influence both physical inactivity and longevity but were not measured, such as genetic factors and unmeasured aspects of diet quality.
datatemp_m <- datatemp.full
correct_current_smoker_label <- "Current"
datatemp_m$treatment_smoking <- as.numeric(datatemp_m$smoking.status == correct_current_smoker_label)
smoking_covariates <- formula(treatment_smoking ~ sex + income.poverty.ratio + age.at.interview +
race + education + working.during.last.three.months + marital.status +
alcohol.consumption + dietary.adequacy + physically.inactive +
income.poverty.ratio.mis + dietary.adequacy.mis)
psmodel_smoking <- glm(smoking_covariates, family=binomial, data=datatemp_m, x=TRUE, y=TRUE)
datatemp_m$logit.ps_smoking <- predict(psmodel_smoking)
res_smoking <- overlap(datatemp_m$logit.ps_smoking, datatemp_m$treatment_smoking)
which.remove_smoking <- res_smoking$index.lack.overlap
datatemp_overlap_smoking <- datatemp_m[-which.remove_smoking, ]
dmy_smoking <- dummyVars(smoking_covariates, data=datatemp_overlap_smoking)
Xmat_smoking <- data.frame(predict(dmy_smoking, newdata=datatemp_overlap_smoking))
datatemp_overlap_smoking$treatment <- datatemp_overlap_smoking$treatment_smoking
age_var_m <- "age.at.interview"
sex_vars_m <- grep("^sex", colnames(Xmat_smoking), value = TRUE)
Xmatmahal_smoking <- Xmat_smoking[, c(age_var_m, sex_vars_m)]
distmat_smoking <- maha.dense(datatemp_overlap_smoking$treatment, Xmatmahal_smoking, matrix=TRUE)
distmat_smoking <- add.caliper(distmat_smoking, datatemp_overlap_smoking$treatment,
datatemp_overlap_smoking$logit.ps_smoking, rg=.5,
stdev=TRUE, penalty=1000, constant=FALSE)
result_smoking <- match.os(datatemp_overlap_smoking$treatment, distmat_smoking,
datatemp_overlap_smoking, ncontrol=1)
matcheddata_smoking <- result_smoking$data
treated.y_smoking <- matcheddata_smoking$outcome[matcheddata_smoking$treatment==1]
control.y_smoking <- matcheddata_smoking$outcome[matcheddata_smoking$treatment==0]
y_smoking <- cbind(treated.y_smoking, control.y_smoking)
ci_smoking <- sens.ci(y_smoking, alternative='two-sided')
print("Smoking Treatment Estimate (ATT):")
## [1] "Smoking Treatment Estimate (ATT):"
print(paste0("Point Estimate: ", round(ci_smoking$PointEstimate[1], 4)))
## [1] "Point Estimate: -1.7886"
print(paste0("95% CI: (", round(ci_smoking$Confidence.Interval[1], 4), ", ", round(ci_smoking$Confidence.Interval[2], 4), ")"))
## [1] "95% CI: (-2.3117, -1.3474)"