file <-"assignment3.sas7bdat"
data <- read_sas(file)
summary(data)
## IMALE IEDUC PAQAGE IMARRIED
## Min. :0.0000 Min. : 1.0 Min. :18.0 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:12.0 1st Qu.:42.0 1st Qu.:0.000
## Median :0.0000 Median :13.0 Median :57.0 Median :1.000
## Mean :0.3827 Mean :13.3 Mean :54.8 Mean :0.592
## 3rd Qu.:1.0000 3rd Qu.:16.0 3rd Qu.:67.0 3rd Qu.:1.000
## Max. :1.0000 Max. :18.0 Max. :98.0 Max. :1.000
## NA's :78 NA's :126 NA's :51 NA's :71
## IADJINCM NCLINDEP CHF MI
## Min. : 625 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.: 12374 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median : 17678 Median :0.0000 Median :0.00000 Median :0.00000
## Mean : 22922 Mean :0.1401 Mean :0.06577 Mean :0.03258
## 3rd Qu.: 31820 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :100000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## NA's :405 NA's :37
## IWHITE sevhyp PSQG PSQT
## Min. :0.0000 Min. :0.00000 Min. : 0.00 Min. : 0.00
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.: 50.00 1st Qu.: 57.50
## Median :1.0000 Median :0.00000 Median : 66.67 Median : 67.50
## Mean :0.7916 Mean :0.03349 Mean : 63.92 Mean : 67.06
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.: 79.17 3rd Qu.: 77.50
## Max. :1.0000 Max. :1.00000 Max. :100.00 Max. :100.00
## NA's :117 NA's :59 NA's :41 NA's :42
## PSQI PSQF PSQTI PSQA
## Min. : 3.571 Min. : 0.00 Min. : 0.00 Min. : 6.25
## 1st Qu.: 64.286 1st Qu.: 53.12 1st Qu.: 50.00 1st Qu.: 62.50
## Median : 75.000 Median : 68.75 Median : 75.00 Median : 72.92
## Mean : 72.006 Mean : 65.39 Mean : 65.02 Mean : 71.08
## 3rd Qu.: 82.143 3rd Qu.: 81.25 3rd Qu.: 75.00 3rd Qu.: 81.25
## Max. :100.000 Max. :100.00 Max. :100.00 Max. :100.00
## NA's :100 NA's :96 NA's :71 NA's :55
## severity
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.839
## 3rd Qu.:3.000
## Max. :4.000
## NA's :142
data$IMALE <- as.factor(data$IMALE)
data$IMARRIED <- as.factor(data$IMARRIED)
data$NCLINDEP <- as.factor(data$NCLINDEP)
data$CHF <- as.factor(data$CHF)
data$MI <- as.factor(data$MI)
data$IWHITE <- as.factor(data$IWHITE)
data$sevhyp <- as.factor(data$sevhyp)
Two plots are generated to understand the distribution of missing values, including their combination. The first plot is scaled so that cell height is proportional to N, while the second is not. Combined, the plots illustrate that most cases are complete. Of cases with missing values, 73% are missing income alone. Very few cases have combinations of missing values.
Summary tables from the previous section indicate a low incidence of missing values amongst variables of interest:
41 cases are missing the response variable (PSQG) (1%)
142 are missing pain severity (4%)
37 are missing clinical depression (1%)
Overall, 72.99% of cases are complete.
aggr(data, prop = T, col=("blue"),numbers = T,bars=TRUE,only.miss=TRUE,combined=TRUE,sortCombs=TRUE,varheight=TRUE)
aggr(data, prop = T, col=("blue"),numbers = T,bars=TRUE,only.miss=TRUE,combined=TRUE,sortCombs=TRUE)
Three plots are generated to look for signs that missingness is conditional on the value of key variables. Quality scores appear independent of the missingness of income, pain severity, and depression, and vice-versa.
Given that income has the highest incidence of missing data, I think it is important to assess for non-random missingness (even if will only be included as potential confound). Marginplot for income by education conditoned on missing value status suggest a non-random association, specifically non-response bias for individiuals of low socioeconomic status.
# Key response and predictor variables
marginplot(data[c(11, 6)], col = c("lightblue", "red", "orange"))
marginplot(data[c(6, 17)], col = c("lightblue", "red", "orange"))
marginplot(data[c(11, 17)], col = c("lightblue", "red", "orange"))
t.test(severity~is.na(PSQG),data=data)
##
## Welch Two Sample t-test
##
## data: severity by is.na(PSQG)
## t = -0.10144, df = 27.39, p-value = 0.9199
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3960767 0.3587338
## sample estimates:
## mean in group FALSE mean in group TRUE
## 2.838471 2.857143
# Income (highest incidence of missing data)
marginplot(data[c(11, 5)], col = c("lightblue", "red", "orange"))
prop.table(table(data$NCLINDEP,is.na(data$severity)),2)
##
## FALSE TRUE
## 0 0.8593499 0.8714286
## 1 0.1406501 0.1285714
marginplot(data[c(2, 5)], col = c("lightblue", "red", "orange"),numbers=T)
t.test(IEDUC~is.na(IADJINCM),data=data)
##
## Welch Two Sample t-test
##
## data: IEDUC by is.na(IADJINCM)
## t = 7.7473, df = 385.5, p-value = 8.393e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.9696183 1.6291405
## sample estimates:
## mean in group FALSE mean in group TRUE
## 13.42356 12.12418
One largely discredited technqiue for managing missing data is to impute a measure of central tendency (e.g. median, or mode for categorical data). A second approach, complete case analysis, will also be used later.
# Mode function from https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Impute function, I created
impute.mean <- function(x) {
if (class(x) == "numeric") {
x[is.na(x)] <- median(x,na.rm=TRUE) # Use median if numeric
return(as.numeric(x))
}
else if (class(x) == "factor") {
x[is.na(x)] <- getmode(x) # Otherwise use mode
return(as.factor(x))
}
}
# I thought I'd be able to use apply but it coereces all data to the same data type, not sure if there is an equivalent function that doesn't do that
data.mean <- data
data.mean$IMALE <- impute.mean(data$IMALE)
data.mean$IEDUC <- impute.mean(data$IEDUC)
data.mean$PAQAGE <- impute.mean(data$PAQAGE)
data.mean$IMARRIED <- impute.mean(data$IMARRIED)
data.mean$IADJINCM <- impute.mean(data$IADJINCM)
data.mean$NCLINDEP <- impute.mean(data$NCLINDEP)
data.mean$CHF <- impute.mean(data$CHF)
data.mean$MI <- impute.mean(data$MI)
data.mean$IWHITE <- impute.mean(data$IWHITE)
data.mean$sevhyp <- impute.mean(data$sevhyp)
data.mean$PSQG <- impute.mean(data$PSQG)
data.mean$PSQT <- impute.mean(data$PSQT)
data.mean$PSQI <- impute.mean(data$PSQI)
data.mean$PSQF <- impute.mean(data$PSQF)
data.mean$PSQTI <- impute.mean(data$PSQTI)
data.mean$PSQA <- impute.mean(data$PSQA)
data.mean$severity <- impute.mean(data$severity)
# Sanity check
summary(data)
## IMALE IEDUC PAQAGE IMARRIED IADJINCM
## 0 :1979 Min. : 1.0 Min. :18.0 0 :1311 Min. : 625
## 1 :1227 1st Qu.:12.0 1st Qu.:42.0 1 :1902 1st Qu.: 12374
## NA's: 78 Median :13.0 Median :57.0 NA's: 71 Median : 17678
## Mean :13.3 Mean :54.8 Mean : 22922
## 3rd Qu.:16.0 3rd Qu.:67.0 3rd Qu.: 31820
## Max. :18.0 Max. :98.0 Max. :100000
## NA's :126 NA's :51 NA's :405
## NCLINDEP CHF MI IWHITE sevhyp PSQG
## 0 :2792 0:3068 0:3177 0 : 660 0 :3117 Min. : 0.00
## 1 : 455 1: 216 1: 107 1 :2507 1 : 108 1st Qu.: 50.00
## NA's: 37 NA's: 117 NA's: 59 Median : 66.67
## Mean : 63.92
## 3rd Qu.: 79.17
## Max. :100.00
## NA's :41
## PSQT PSQI PSQF PSQTI
## Min. : 0.00 Min. : 3.571 Min. : 0.00 Min. : 0.00
## 1st Qu.: 57.50 1st Qu.: 64.286 1st Qu.: 53.12 1st Qu.: 50.00
## Median : 67.50 Median : 75.000 Median : 68.75 Median : 75.00
## Mean : 67.06 Mean : 72.006 Mean : 65.39 Mean : 65.02
## 3rd Qu.: 77.50 3rd Qu.: 82.143 3rd Qu.: 81.25 3rd Qu.: 75.00
## Max. :100.00 Max. :100.000 Max. :100.00 Max. :100.00
## NA's :42 NA's :100 NA's :96 NA's :71
## PSQA severity
## Min. : 6.25 Min. :1.000
## 1st Qu.: 62.50 1st Qu.:2.000
## Median : 72.92 Median :3.000
## Mean : 71.08 Mean :2.839
## 3rd Qu.: 81.25 3rd Qu.:3.000
## Max. :100.00 Max. :4.000
## NA's :55 NA's :142
summary(data.mean)
## IMALE IEDUC PAQAGE IMARRIED IADJINCM
## 0:2057 Min. : 1.00 Min. :18.00 0:1311 Min. : 625
## 1:1227 1st Qu.:12.00 1st Qu.:43.00 1:1973 1st Qu.: 12500
## Median :13.00 Median :57.00 Median : 17678
## Mean :13.29 Mean :54.84 Mean : 22275
## 3rd Qu.:16.00 3rd Qu.:67.00 3rd Qu.: 25981
## Max. :18.00 Max. :98.00 Max. :100000
## NCLINDEP CHF MI IWHITE sevhyp PSQG
## 0:2829 0:3068 0:3177 0: 660 0:3176 Min. : 0.00
## 1: 455 1: 216 1: 107 1:2624 1: 108 1st Qu.: 50.00
## Median : 66.67
## Mean : 63.96
## 3rd Qu.: 79.17
## Max. :100.00
## PSQT PSQI PSQF PSQTI
## Min. : 0.00 Min. : 3.571 Min. : 0.00 Min. : 0.00
## 1st Qu.: 57.50 1st Qu.: 64.286 1st Qu.: 53.12 1st Qu.: 50.00
## Median : 67.50 Median : 75.000 Median : 68.75 Median : 75.00
## Mean : 67.06 Mean : 72.097 Mean : 65.49 Mean : 65.23
## 3rd Qu.: 77.50 3rd Qu.: 82.143 3rd Qu.: 79.17 3rd Qu.: 75.00
## Max. :100.00 Max. :100.000 Max. :100.00 Max. :100.00
## PSQA severity
## Min. : 6.25 Min. :1.000
## 1st Qu.: 62.50 1st Qu.:2.000
## Median : 72.92 Median :3.000
## Mean : 71.11 Mean :2.846
## 3rd Qu.: 79.55 3rd Qu.:3.000
## Max. :100.00 Max. :4.000
table(complete.cases(data.mean))
##
## TRUE
## 3284
From Bair et al., the goal of the study is to determine the effect of depression and pain severity on patient satisfaction. I will model effects on general satisfaction (PSQG). The dependent variables include depression and pain, with other variables included as potential confounds.
As one approach to variable selection I tried using the regsubsets function of the leaps package. This function calculates model BIC for all combinations of included variables, displayed as a heatmap. With this approach, we can see that optimal model BIC is achieved with education, age, depression and severity.
This procedure could be repeated for each of the datasets (complete, imputed mean, and each frame of multiple imputation). However, given the low incidence of missing data in this dataset, I do not think this is worthwhile.
data.mean <- data.mean[-c(12:16)]
data <- data[-c(12:16)]
complete.leaps <- regsubsets(PSQG~.,data=data,nbest=10)
plot(complete.leaps, scale="bic")
mean.m1 <- lm(PSQG~severity,data=data.mean)
mean.m2 <- lm(PSQG~severity+NCLINDEP,data=data.mean)
mean.m3 <- lm(PSQG~severity+NCLINDEP+PAQAGE,data=data.mean)
mean.m4 <- lm(PSQG~severity+NCLINDEP+PAQAGE+IEDUC,data=data.mean)
multiplot(mean.m1,mean.m2,mean.m3,mean.m4,intercept=FALSE)
summary(mean.m4)
##
## Call:
## lm(formula = PSQG ~ severity + NCLINDEP + PAQAGE + IEDUC, data = data.mean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.504 -11.851 1.123 12.981 50.783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.40266 2.52435 21.551 < 2e-16 ***
## severity 3.22648 0.39083 8.255 < 2e-16 ***
## NCLINDEP1 -6.74893 0.99972 -6.751 1.73e-11 ***
## PAQAGE 0.18355 0.02301 7.978 2.04e-15 ***
## IEDUC -0.65912 0.11564 -5.700 1.30e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.75 on 3279 degrees of freedom
## Multiple R-squared: 0.08199, Adjusted R-squared: 0.08087
## F-statistic: 73.22 on 4 and 3279 DF, p-value: < 2.2e-16
plot(mean.m4)
complete.m4 <- lm(PSQG~severity+NCLINDEP+PAQAGE+IEDUC,data=data)
summary(complete.m4)
##
## Call:
## lm(formula = PSQG ~ severity + NCLINDEP + PAQAGE + IEDUC, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.067 -11.932 1.222 13.116 50.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.41222 2.66735 20.399 < 2e-16 ***
## severity 3.25821 0.41142 7.919 3.36e-15 ***
## NCLINDEP1 -6.28411 1.05109 -5.979 2.52e-09 ***
## PAQAGE 0.18954 0.02444 7.755 1.22e-14 ***
## IEDUC -0.70073 0.12258 -5.717 1.20e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.83 on 2914 degrees of freedom
## (365 observations deleted due to missingness)
## Multiple R-squared: 0.08357, Adjusted R-squared: 0.08231
## F-statistic: 66.43 on 4 and 2914 DF, p-value: < 2.2e-16
multiplot(complete.m4,mean.m4,intercept=FALSE)
We have been asked to use multiple imputation as an alternative to complete case analysis and other simpler techniques. For the imputation, I will use the mice package. I have chosen predictive mean matching ‘pmm’ as the imputation method. Alternatives based on linear regression are probably not appropriate, as many of the included variables for modeling are discrete variables.
As a rough check of the imputed values, I have plotted the case-wise standard deviations for each continuous variable (i.e. distribution of calculated SD for each case across the 10 imputations). Standard deviations are large, suggesting that imputations are imprecise (which I think is expected).
data.mi <- mice(data,m=10,method="pmm",seed=1,print=FALSE)
hist(apply(data.mi$imp[[2]],2,sd)) # Histogram of case-wise SDs for imputed education
hist(apply(data.mi$imp[[3]],2,sd)) # Histogram of case-wise SDs for imputed age
hist(apply(data.mi$imp[[5]],2,sd)) # Histogram of case-wise SDs for imputed income
hist(apply(data.mi$imp[[11]],2,sd)) # Histogram of case-wise SDs for imputed quality score
mi.m4 <- with(data.mi,lm(PSQG~severity+NCLINDEP+PAQAGE+IEDUC))
pool.m4 <- pool(mi.m4)
summary(pool.m4)
## estimate std.error statistic df p.value
## (Intercept) 53.7635923 2.58585680 20.791404 1328.2053 0.000000e+00
## severity 3.2772647 0.39604773 8.274924 1614.8537 2.220446e-16
## NCLINDEP1 -6.6218868 1.02803187 -6.441324 1646.1429 1.531997e-10
## PAQAGE 0.1893172 0.02361668 8.016248 1724.6321 1.998401e-15
## IEDUC -0.6446773 0.11865339 -5.433282 943.7374 6.321459e-08
summary(mean.m4)
##
## Call:
## lm(formula = PSQG ~ severity + NCLINDEP + PAQAGE + IEDUC, data = data.mean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.504 -11.851 1.123 12.981 50.783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.40266 2.52435 21.551 < 2e-16 ***
## severity 3.22648 0.39083 8.255 < 2e-16 ***
## NCLINDEP1 -6.74893 0.99972 -6.751 1.73e-11 ***
## PAQAGE 0.18355 0.02301 7.978 2.04e-15 ***
## IEDUC -0.65912 0.11564 -5.700 1.30e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.75 on 3279 degrees of freedom
## Multiple R-squared: 0.08199, Adjusted R-squared: 0.08087
## F-statistic: 73.22 on 4 and 3279 DF, p-value: < 2.2e-16
summary(complete.m4)
##
## Call:
## lm(formula = PSQG ~ severity + NCLINDEP + PAQAGE + IEDUC, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.067 -11.932 1.222 13.116 50.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.41222 2.66735 20.399 < 2e-16 ***
## severity 3.25821 0.41142 7.919 3.36e-15 ***
## NCLINDEP1 -6.28411 1.05109 -5.979 2.52e-09 ***
## PAQAGE 0.18954 0.02444 7.755 1.22e-14 ***
## IEDUC -0.70073 0.12258 -5.717 1.20e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.83 on 2914 degrees of freedom
## (365 observations deleted due to missingness)
## Multiple R-squared: 0.08357, Adjusted R-squared: 0.08231
## F-statistic: 66.43 on 4 and 2914 DF, p-value: < 2.2e-16
multiplot(mi.m4$analyses,intercept = FALSE)
multiplot(complete.m4,mean.m4,intercept = FALSE)