Library

library(openxlsx)
library(sjPlot)
library(boot)

 nhefs<-read.csv(file='C:/Users/antho/Documents/Keck-graduate-school/PM617/lectures class/week2/lab/NHEFS.csv')
 nhefs_code<-read.xlsx('C:/Users/antho/Documents/Keck-graduate-school/PM617/lectures class/week2/lab/NHEFS_Codebook.xlsx')

Cholesterol association with smoking (years)

the expected cholesterol in someone to have never smoked a full year, or non-smokers, is 200.92 (196-206) mmHg (p\(<0.001)\). the change in cholesterol for each year of additional smoking is 0.77 (0.59,0.94; \(p<0.001)\).

  fit<-lm(cholesterol~ smokeyrs , nhefs)
  tab_model(fit,pred.labels = c('intercept','Smoke years'))

	cholesterol
Predictors	Estimates	CI	p
intercept	200.92	195.99 – 205.86	<0.001
Smoke years	0.77	0.59 – 0.94	<0.001
Observations	1613
R² / R² adjusted	0.042 / 0.042

Cholesterol association with smoking (years) adjusted by sex

  fit<-lm(cholesterol~ smokeyrs+sex+race+income, nhefs)
  tab_model(fit,pred.labels = c('intercept','Smoke years','Sex','Race',"Income"))

	cholesterol
Predictors	Estimates	CI	p
intercept	172.50	154.25 – 190.76	<0.001
Smoke years	0.85	0.66 – 1.03	<0.001
Sex	8.08	3.56 – 12.60	<0.001
Race	-6.47	-13.23 – 0.29	0.061
Income	1.27	0.39 – 2.15	0.005
Observations	1552
R² / R² adjusted	0.054 / 0.052

Bootstrapping

We estimate the confidence intervals, which can be determined by \(\pm 2*s.e.\)

func = function(nhefs,idx){
coef(lm(cholesterol~ smokeyrs+sex+race+income, nhefs[idx,]))
}
B = boot(nhefs,func,R=500)
 print(B)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = nhefs, statistic = func, R = 500)
## 
## 
## Bootstrap Statistics :
##        original       bias    std. error
## t1* 172.5020797  0.021722573  9.16040506
## t2*   0.8451926  0.001623109  0.09664551
## t3*   8.0809569 -0.043159562  2.29453156
## t4*  -6.4679362  0.235714265  3.65264729
## t5*   1.2699687  0.000806083  0.44337031

Sex modification

there is effect modification by sex (\(p<0.001\)) from multiplicativity

  fit<-lm(cholesterol~ smokeyrs*sex+race+income, nhefs)
  tab_model(fit,pred.labels = c('intercept','Smoke years','Sex','Race',"Income", 'Smoke years : Sex'))

	cholesterol
Predictors	Estimates	CI	p
intercept	186.91	168.21 – 205.61	<0.001
Smoke years	0.37	0.13 – 0.62	0.003
Sex	-18.80	-28.89 – -8.72	<0.001
Race	-6.45	-13.14 – 0.24	0.059
Income	1.19	0.32 – 2.06	0.007
Smoke years : Sex	1.09	0.73 – 1.46	<0.001
Observations	1552
R² / R² adjusted	0.074 / 0.071

Linearity

Using 2 methods we detected non-linearity in the smoking years term

  fit<-lm(cholesterol~ smokeyrs+sex+race+income+I(smokeyrs^2), nhefs)
  tab_model(fit,pred.labels = c('intercept','Smoke years','Sex','Race',"Income", 'Smoke years^2'))

	cholesterol
Predictors	Estimates	CI	p
intercept	163.91	145.12 – 182.69	<0.001
Smoke years	2.10	1.39 – 2.82	<0.001
Sex	7.38	2.85 – 11.90	0.001
Race	-6.76	-13.50 – -0.02	0.049
Income	1.04	0.15 – 1.92	0.022
Smoke years^2	-0.02	-0.04 – -0.01	<0.001
Observations	1552
R² / R² adjusted	0.062 / 0.059

  fit2<-lm(cholesterol~ poly(smokeyrs,2,raw=TRUE)+sex+race+income, nhefs)
  tab_model(fit2)

	cholesterol
Predictors	Estimates	CI	p
(Intercept)	163.91	145.12 – 182.69	<0.001
smokeyrs [1st degree]	2.10	1.39 – 2.82	<0.001
smokeyrs [2nd degree]	-0.02	-0.04 – -0.01	<0.001
sex	7.38	2.85 – 11.90	0.001
race	-6.76	-13.50 – -0.02	0.049
income	1.04	0.15 – 1.92	0.022
Observations	1552
R² / R² adjusted	0.062 / 0.059

logistic regression

The log-odds of chronic cough, holding all other terms constant, is significantly positively associated with smoking (years); for 1 year increasing of smoking the log-odds increased 1.04 (1.02,1.06, \(p<0.001)\)

 fit<-glm(chroniccough~ smokeyrs+sex+race+income, family='binomial',nhefs)
   tab_model(fit)

	chroniccough
Predictors	Odds Ratios	CI	p
(Intercept)	0.05	0.01 – 0.27	0.001
smokeyrs	1.04	1.02 – 1.06	<0.001
sex	1.26	0.79 – 2.00	0.330
race	0.61	0.26 – 1.24	0.204
income	0.94	0.87 – 1.02	0.146
Observations	1567
R² Tjur	0.019

Binary exerise

Comparing the log-odds of little to no exercise to much exericse/moderate exercise, there was positively association with smoke years (1.02, 1.01-1.03 ; \(p<0.001\))

 nhefs$bin_exercise<-nhefs$exercise
 nhefs$bin_exercise[nhefs$bin_exercise==2]<-NA
 
 fit<-glm(bin_exercise~ smokeyrs+sex+race+income,family='binomial',nhefs)
    tab_model(fit)

	bin_exercise
Predictors	Odds Ratios	CI	p
(Intercept)	0.65	0.20 – 2.19	0.490
smokeyrs	1.02	1.01 – 1.03	<0.001
sex	2.09	1.57 – 2.79	<0.001
race	1.54	0.93 – 2.64	0.100
income	1.02	0.96 – 1.08	0.572
Observations	965
R² Tjur	0.035

Odds ratio

the odds ratio is problematic because we filtered out levels of binary exercise, a multinomial regression would have been more appropriate.

Predictive models

a causal model is measures or tests the association that \(P(Y^{a=1}=1) \neq P(Y^{a=0}=1)\) which uses the counterfactual terms to identify and test contrasts between outcomes comparing all individuals treated and all individuals that would not have been treated. A predictive model is the best model that has the highest explained variance between the association between outcome and independent predictors.

Predictive model

using a training split, and considering only 1st order terms only, interactions, and higher order terms could be explored. 8 features were identified qsmk, modth, dbp, age, school, wt82_71, smokeintensity and birthcontrol were predictive features using a linear model for smoke years. note that smokeyears is a count variable so negative binomial regression is ideal

the training R2 is 0.7864, and the validation R2 was 0.79 using a linear model.

hist(nhefs$smokeyrs)

 mean(nhefs$smokeyrs) # 24.8710

## [1] 24.87109

 var(nhefs$smokeyrs) ## 148

## [1] 148.793

 nhefs$smokeyrs<-asinh(nhefs$smokeyrs)
 library(shinythemes)
library(modelgrid)
library(magrittr)
library(caret)
library(foreach)
library(recipes)
library(plyr)
library(purrr)
library(dplyr)
library(pROC)
library(BradleyTerry2)
library(corrplot)
library(DataExplorer)
library(randomForest)
library(glmnet)
library(xgboost)
library(e1071)
 # load data
set.seed(9650)
data_set <- nhefs
data_set <- data_set%>%dplyr::select(-"seqn") 
data_set$smokeyrs <- as.factor(data_set$smokeyrs)
######################################
# Pre-Processing the data Telco_Customer_Churn  using recipe
set.seed(9650)
Attri_rec_2 <- 
    recipe(smokeyrs ~ ., data = data_set) %>%        # Formula.
    #step_dummy(all_predictors())%>%               # convert nominal data into one or more numeric.
    step_impute_knn(all_predictors())%>%           # impute missing data using nearest neighbors.
    step_zv(all_predictors()) %>%                 # remove variables that are highly sparse and unbalanced.
    step_corr(all_predictors()) %>%               # remove variables that have large absolute correlations with 
                                                  # other variables.
    step_center(all_predictors()) %>%             # normalize numeric data to have a mean of zero.
    step_scale(all_predictors()) %>%              # normalize numeric data to have a standard deviation of one.
    prep(training = data_set, retain = TRUE)   

data_rec_2 <- as.data.frame(juice(Attri_rec_2))
# add the response variable
data_rec_2$smokeyrs <- data_set$smokeyrs
# str(data_rec_2)
# Create a Validation Dataset (training data 70% and validation data 30% of the original data)
set.seed(9650)
validation_index <- createDataPartition(data_rec_2$smokeyrs,times= 1,  p= 0.70, list=FALSE)

## Warning in createDataPartition(data_rec_2$smokeyrs, times = 1, p = 0.7, : Some
## classes have a single record ( 4.78756117999381, 4.85209129348869 ) and these
## will be selected for the sample

# select 30% of the data for validation
data_validation <- data_rec_2[-validation_index,]
# use the remaining 70% of data to train the models
data_train <- data_rec_2[validation_index, ]
# For traing the models  
x_train <- data_train%>%select(-smokeyrs) # Predictors
y_train <- data_train[["smokeyrs"]] # Response
# for validation/test
x_validation <- data_validation%>%select(-smokeyrs)
y_validation <- data_validation[["smokeyrs"]]
 
 trainD<-data.frame(x_train,smokeyrs=as.numeric(y_train))
  testD<-data.frame(x_validation,smokeyrs=as.numeric(y_validation))

 ## neg. bin reg
  library(ISLR);library(magrittr)
 library(leaps)
regfit_full = regsubsets(smokeyrs~., data = trainD,really.big=TRUE,nvmax=8)

 reg_summary = summary(regfit_full)

 summary(regfit_full)

## Subset selection object
## Call: regsubsets.formula(smokeyrs ~ ., data = trainD, really.big = TRUE, 
##     nvmax = 8)
## 55 Variables  (and intercept)
##                   Forced in Forced out
## qsmk                  FALSE      FALSE
## death                 FALSE      FALSE
## yrdth                 FALSE      FALSE
## modth                 FALSE      FALSE
## dadth                 FALSE      FALSE
## sbp                   FALSE      FALSE
## dbp                   FALSE      FALSE
## age                   FALSE      FALSE
## race                  FALSE      FALSE
## income                FALSE      FALSE
## marital               FALSE      FALSE
## school                FALSE      FALSE
## ht                    FALSE      FALSE
## wt71                  FALSE      FALSE
## wt82                  FALSE      FALSE
## wt82_71               FALSE      FALSE
## birthplace            FALSE      FALSE
## smokeintensity        FALSE      FALSE
## smkintensity82_71     FALSE      FALSE
## asthma                FALSE      FALSE
## bronch                FALSE      FALSE
## tb                    FALSE      FALSE
## hf                    FALSE      FALSE
## hbp                   FALSE      FALSE
## pepticulcer           FALSE      FALSE
## colitis               FALSE      FALSE
## hepatitis             FALSE      FALSE
## chroniccough          FALSE      FALSE
## hayfever              FALSE      FALSE
## polio                 FALSE      FALSE
## tumor                 FALSE      FALSE
## nervousbreak          FALSE      FALSE
## alcoholpy             FALSE      FALSE
## alcoholfreq           FALSE      FALSE
## alcoholtype           FALSE      FALSE
## alcoholhowmuch        FALSE      FALSE
## headache              FALSE      FALSE
## otherpain             FALSE      FALSE
## weakheart             FALSE      FALSE
## allergies             FALSE      FALSE
## nerves                FALSE      FALSE
## lackpep               FALSE      FALSE
## wtloss                FALSE      FALSE
## infection             FALSE      FALSE
## active                FALSE      FALSE
## exercise              FALSE      FALSE
## birthcontrol          FALSE      FALSE
## pregnancies           FALSE      FALSE
## cholesterol           FALSE      FALSE
## hightax82             FALSE      FALSE
## price71               FALSE      FALSE
## price82               FALSE      FALSE
## tax82                 FALSE      FALSE
## price71_82            FALSE      FALSE
## bin_exercise          FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          qsmk death yrdth modth dadth sbp dbp age race income marital school
## 1  ( 1 ) " "  " "   " "   " "   " "   " " " " "*" " "  " "    " "     " "   
## 2  ( 1 ) " "  " "   " "   " "   " "   " " " " "*" " "  " "    " "     " "   
## 3  ( 1 ) " "  " "   " "   " "   " "   " " " " "*" " "  " "    " "     " "   
## 4  ( 1 ) " "  " "   " "   " "   " "   " " " " "*" " "  " "    " "     "*"   
## 5  ( 1 ) "*"  " "   " "   " "   " "   " " " " "*" " "  " "    " "     "*"   
## 6  ( 1 ) "*"  " "   " "   " "   " "   " " "*" "*" " "  " "    " "     "*"   
## 7  ( 1 ) "*"  " "   " "   "*"   " "   " " "*" "*" " "  " "    " "     "*"   
## 8  ( 1 ) "*"  " "   " "   "*"   " "   " " "*" "*" " "  " "    " "     "*"   
##          ht  wt71 wt82 wt82_71 birthplace smokeintensity smkintensity82_71
## 1  ( 1 ) " " " "  " "  " "     " "        " "            " "              
## 2  ( 1 ) " " " "  " "  " "     " "        " "            " "              
## 3  ( 1 ) " " " "  " "  " "     " "        "*"            " "              
## 4  ( 1 ) " " " "  " "  " "     " "        "*"            " "              
## 5  ( 1 ) " " " "  " "  " "     " "        "*"            " "              
## 6  ( 1 ) " " " "  " "  " "     " "        "*"            " "              
## 7  ( 1 ) " " " "  " "  " "     " "        "*"            " "              
## 8  ( 1 ) " " " "  " "  "*"     " "        "*"            " "              
##          asthma bronch tb  hf  hbp pepticulcer colitis hepatitis chroniccough
## 1  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
## 2  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
## 3  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
## 4  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
## 5  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
## 6  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
## 7  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
## 8  ( 1 ) " "    " "    " " " " " " " "         " "     " "       " "         
##          hayfever polio tumor nervousbreak alcoholpy alcoholfreq alcoholtype
## 1  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
## 2  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
## 3  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
## 4  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
## 5  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
## 6  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
## 7  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
## 8  ( 1 ) " "      " "   " "   " "          " "       " "         " "        
##          alcoholhowmuch headache otherpain weakheart allergies nerves lackpep
## 1  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
## 2  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
## 3  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
## 4  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
## 5  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
## 6  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
## 7  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
## 8  ( 1 ) " "            " "      " "       " "       " "       " "    " "    
##          wtloss infection active exercise birthcontrol pregnancies cholesterol
## 1  ( 1 ) " "    " "       " "    " "      " "          " "         " "        
## 2  ( 1 ) " "    " "       " "    " "      "*"          " "         " "        
## 3  ( 1 ) " "    " "       " "    " "      "*"          " "         " "        
## 4  ( 1 ) " "    " "       " "    " "      "*"          " "         " "        
## 5  ( 1 ) " "    " "       " "    " "      "*"          " "         " "        
## 6  ( 1 ) " "    " "       " "    " "      "*"          " "         " "        
## 7  ( 1 ) " "    " "       " "    " "      "*"          " "         " "        
## 8  ( 1 ) " "    " "       " "    " "      "*"          " "         " "        
##          hightax82 price71 price82 tax82 price71_82 bin_exercise
## 1  ( 1 ) " "       " "     " "     " "   " "        " "         
## 2  ( 1 ) " "       " "     " "     " "   " "        " "         
## 3  ( 1 ) " "       " "     " "     " "   " "        " "         
## 4  ( 1 ) " "       " "     " "     " "   " "        " "         
## 5  ( 1 ) " "       " "     " "     " "   " "        " "         
## 6  ( 1 ) " "       " "     " "     " "   " "        " "         
## 7  ( 1 ) " "       " "     " "     " "   " "        " "         
## 8  ( 1 ) " "       " "     " "     " "   " "        " "

 reg_summary$rsq

## [1] 0.7570685 0.7740525 0.7782912 0.7818122 0.7832056 0.7844366 0.7855678
## [8] 0.7865087

 par(mfrow = c(2,2))
plot(reg_summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg_summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")

# We will now plot a red dot to indicate the model with the largest adjusted R^2 statistic.
# The which.max() function can be used to identify the location of the maximum point of a vector
adj_r2_max = which.max(reg_summary$adjr2) # 11

# The points() command works like the plot() command, except that it puts points 
# on a plot that has already been created instead of creating a new plot
points(adj_r2_max, reg_summary$adjr2[adj_r2_max], col ="red", cex = 2, pch = 20)

# We'll do the same for C_p and BIC, this time looking for the models with the SMALLEST statistic
plot(reg_summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
cp_min = which.min(reg_summary$cp) # 10
points(cp_min, reg_summary$cp[cp_min], col = "red", cex = 2, pch = 20)

plot(reg_summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
bic_min = which.min(reg_summary$bic) # 6
points(bic_min, reg_summary$bic[bic_min], col = "red", cex = 2, pch = 20)

plot(regfit_full, scale = "Cp")

coef(regfit_full, 8)

##    (Intercept)           qsmk          modth            dbp            age 
##     24.7974352     -0.5008218     -0.4221154     -0.5052096     10.5723136 
##         school        wt82_71 smokeintensity   birthcontrol 
##     -0.7492200      0.3927721      0.7609951      1.6070478

Training model performance under a linear model

Although we are using a linear model, the residuals are quite normal, and we do have some heteroskedasticity suggesting that the model can be improved by either log-XR of the response variable or a count model.

 regfit_tr = lm(smokeyrs~qsmk+modth+dbp+age+school+wt82_71+smokeintensity+birthcontrol, data = trainD)
 residuals(regfit_tr)%>%hist(main='residuals')

 plot(trainD$smokeyrs,residuals(regfit_tr))

summary(regfit_tr) ## R2 0.7864

## 
## Call:
## lm(formula = smokeyrs ~ qsmk + modth + dbp + age + school + wt82_71 + 
##     smokeintensity + birthcontrol, data = trainD)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.094  -1.984   0.536   3.009  18.348 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     24.7974     0.1680 147.622  < 2e-16 ***
## qsmk            -0.5008     0.1722  -2.909  0.00370 ** 
## modth           -0.4221     0.1695  -2.490  0.01293 *  
## dbp             -0.5052     0.1720  -2.936  0.00339 ** 
## age             10.5723     0.1798  58.791  < 2e-16 ***
## school          -0.7492     0.1736  -4.317 1.72e-05 ***
## wt82_71          0.3928     0.1738   2.260  0.02400 *  
## smokeintensity   0.7610     0.1704   4.465 8.77e-06 ***
## birthcontrol     1.6070     0.1778   9.036  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.737 on 1159 degrees of freedom
## Multiple R-squared:  0.7865, Adjusted R-squared:  0.785 
## F-statistic: 533.7 on 8 and 1159 DF,  p-value: < 2.2e-16

Validation performance under linear model

##  fit in test/validation set

regfit_val = lm(smokeyrs~qsmk+modth+dbp+age+school+wt82_71+smokeintensity+birthcontrol, data = testD)

regfit_val%>%summary

## 
## Call:
## lm(formula = smokeyrs ~ qsmk + modth + dbp + age + school + wt82_71 + 
##     smokeintensity + birthcontrol, data = testD)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.7328  -1.9629   0.6996   3.1256  13.2469 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     24.9926     0.2496 100.135  < 2e-16 ***
## qsmk            -0.3537     0.2665  -1.327 0.185156    
## modth            0.3876     0.2720   1.425 0.154842    
## dbp              0.2198     0.2515   0.874 0.382776    
## age             10.3054     0.2866  35.962  < 2e-16 ***
## school          -1.0804     0.2767  -3.905 0.000109 ***
## wt82_71          0.1711     0.2733   0.626 0.531591    
## smokeintensity   0.5961     0.2763   2.158 0.031489 *  
## birthcontrol     1.7408     0.2723   6.393 4.06e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.329 on 452 degrees of freedom
## Multiple R-squared:  0.7964, Adjusted R-squared:  0.7928 
## F-statistic:   221 on 8 and 452 DF,  p-value: < 2.2e-16

Optimal performance

I split the data into training and validation and used a best subsets model that choose the best features using smallest Mallow’s Cp, and optimized R2 value. I used a linear model for a count outcome which is not entirely correct, but the residuals of the linear fit were arguably sufficiently satisfied.

lab 1

ac

2022-08-30