# ~ CRP 245 Part 2 Day 2 ~
# Framingham Data
# Dataset: framBMI
# The Framingham Study is a longitudinal investigation of
# constitutional and environmental factors influencing the
# development of CVD in men and women. Examination of participants
# has taken place every two years and the cohort has been followed
# for morbidity and mortality over that time period.
# Data Dictionary:
# SYSBP systolic blood pressure (mean of last
# two of three measurements)(mmHg)
# BMI Body Mass Index
# AGE Age at exam in years
# SEX; Sex at Birth; 1 = Male; 2 = Female
# BPMEDS Antihypertensive medication use at exam
# (0=not currently using; 1=current use)
# DIABP diastolic blood pressure (mean of last
# two of three measurements)(mmHg)
# DIABETES
# CURSMOKE Current cigarette smoking at exam
# (0=not current smoker; 1=current smoker)
# CIGPDAY 0=not current smoker; 1-99 cig per day
# TOTCHOL Serum Total Cholesterol (mg/dL)
# Download and load the FramBMI dataset:
load(url("http://www.duke.edu/~sgrambow/crp241data/FramBMI.RData"))
# examine data structure
str(FramBMI)
## 'data.frame': 4434 obs. of 10 variables:
## $ SYSBP : num 106 121 128 150 130 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ AGE : int 39 46 48 61 46 43 63 45 52 43 ...
## $ SEX : int 1 2 1 2 2 2 2 2 1 1 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DIABP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ DIABETES: int 0 0 0 0 0 0 0 0 0 0 ...
## $ CURSMOKE: int 0 0 1 1 1 0 0 1 0 1 ...
## $ CIGPDAY : int 0 0 20 30 23 0 0 20 0 30 ...
## $ TOTCHOL : int 195 250 245 225 285 228 205 313 260 225 ...
# summary statistics
summary(FramBMI)
## SYSBP BMI AGE SEX
## Min. : 83.5 Min. :15.54 Min. :32.00 Min. :1.000
## 1st Qu.:117.5 1st Qu.:23.09 1st Qu.:42.00 1st Qu.:1.000
## Median :129.0 Median :25.45 Median :49.00 Median :2.000
## Mean :132.9 Mean :25.85 Mean :49.93 Mean :1.562
## 3rd Qu.:144.0 3rd Qu.:28.09 3rd Qu.:57.00 3rd Qu.:2.000
## Max. :295.0 Max. :56.80 Max. :70.00 Max. :2.000
## NA's :19
## BPMEDS DIABP DIABETES CURSMOKE
## Min. :0.00000 Min. : 48.00 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 75.00 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median : 82.00 Median :0.00000 Median :0.0000
## Mean :0.03293 Mean : 83.08 Mean :0.02729 Mean :0.4919
## 3rd Qu.:0.00000 3rd Qu.: 90.00 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :142.50 Max. :1.00000 Max. :1.0000
## NA's :61
## CIGPDAY TOTCHOL
## Min. : 0.000 Min. :107
## 1st Qu.: 0.000 1st Qu.:206
## Median : 0.000 Median :234
## Mean : 8.966 Mean :237
## 3rd Qu.:20.000 3rd Qu.:264
## Max. :70.000 Max. :696
## NA's :32 NA's :52
# distribution of SYYBP
# visualize Y with a nice looking
# boxplot that adds some extra labeling
boxplot(FramBMI$SYSBP,
main="Boxplot of SYSBP showing variability in Outcome",
ylab = "Outcome (SYSBP)")
# add mean to boxplot with points function
points(mean(FramBMI$SYSBP),cex=2,pch=16,col="red")
# add individual points to boxplot to show spread
# using stripchart function
# add a legend
legend("topright", c("Sample Mean","Sample Median"),
cex=0.8,lty=c(1,1),lwd=c(3,3),
col = c("red","black",bty ="n"))
# fit a multiple linear regression model with
# Y = systolic blood pressure (SYSBP)
# X1 = BMI (BMI)
# X2 = age (AGE)
# X3 = sex (SEX); 1 = Male; 2 = Female
# X4 = antihypertensive medication use (BPMEDS)
# ANSWER:
fit.mlr1 <- lm(SYSBP ~ BMI + AGE + SEX + BPMEDS, data = FramBMI)
summary(fit.mlr1)
##
## Call:
## lm(formula = SYSBP ~ BMI + AGE + SEX + BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.867 -12.556 -2.297 10.121 130.723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.55750 2.56194 18.563 < 2e-16 ***
## BMI 1.47058 0.07135 20.612 < 2e-16 ***
## AGE 0.86062 0.03386 25.420 < 2e-16 ***
## SEX 2.27589 0.58281 3.905 9.56e-05 ***
## BPMEDS 24.32760 1.63934 14.840 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.01 on 4349 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.276, Adjusted R-squared: 0.2754
## F-statistic: 414.6 on 4 and 4349 DF, p-value: < 2.2e-16
# Create a new variable for SEX which is an indicator
# for male sex
# tutorial
# http://sphweb.bumc.bu.edu/otlt/MPH-Modules/PH717-QuantCore/PH717_MultipleVariableRegression/PH717_MultipleVariableRegression4.html
# we will use the ifelse function -- format is ifelse(a,b,c)
# a is the condition of interest (here male which is 1)
# b is the value you want to assign to those who have the
# condition of interest (we want to replace 1 with a 1).
# c is the value for those who do not have the condition;
# here that is 2=females and we map them to 0.
# NOTE: We want to be careful about missing values so they
# are not incorrectly coded to 0 along with females.
# The first ifelse below codes males from 1 to 1 (keeping them as 1's)
# The second ifelse codes females from 2 to 0
# The third ifelse codes everything else to missing.
# We add this as a security measure to make sure we dont
# accidentally code other values or missing values to 0
# (counting them as females)
# any missing values?
summary(FramBMI$SEX)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.562 2.000 2.000
# NO
FramBMI$MALE <- ifelse(FramBMI$SEX==1,1,
ifelse(FramBMI$SEX==2,0,
NA))
# Check recoding:
head(FramBMI[c("SEX","MALE")],5)
## SEX MALE
## 1 1 1
## 2 2 0
## 3 1 1
## 4 2 0
## 5 2 0
tail(FramBMI[c("SEX","MALE")],5)
## SEX MALE
## 4430 2 0
## 4431 2 0
## 4432 2 0
## 4433 1 1
## 4434 2 0
table(FramBMI$SEX,FramBMI$MALE,useNA = "ifany")
##
## 0 1
## 1 0 1944
## 2 2490 0
fit.mlr2 <- lm(SYSBP ~ BMI + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr2)
##
## Call:
## lm(formula = SYSBP ~ BMI + AGE + MALE + BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.867 -12.556 -2.297 10.121 130.723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.10927 2.35264 22.149 < 2e-16 ***
## BMI 1.47058 0.07135 20.612 < 2e-16 ***
## AGE 0.86062 0.03386 25.420 < 2e-16 ***
## MALE -2.27589 0.58281 -3.905 9.56e-05 ***
## BPMEDS 24.32760 1.63934 14.840 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.01 on 4349 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.276, Adjusted R-squared: 0.2754
## F-statistic: 414.6 on 4 and 4349 DF, p-value: < 2.2e-16
# Compare the beta coefficient in fit.mlr for SEX
# with the beta coefficient in fit.mlr2 for MALE
# fit.mlr -- SEX is 2.27589
# fit.mlr2 -- MALE is -2.27589
# they are exactly opposite and have the same p-value
# fit.mlr is modeling women and fit.mlr2 is modeling men
# which results in a sign flip.
# RECODING BMI AS CATEGORICAL (OPTION 1 -- INDICATOR VARIABLES)
# Lets recode BMI as a categorical variable with categories
# Underweight
# Normal
# Overweight
# Obese
# Lets make normal weight the reference category
# We need 3 indicator variables for a 4-level variable
# BMI Cate Underweight Normal Overweight Obese
# u.wgt 1 0 0 0
# o.wgt 0 0 1 0
# obese 0 0 0 1
#
# NOTE: n.wgt does not meet any conditions specified above,
# so will have values of 0 for u.wgt, o.wgt, and obese;
# this feature specifies n.wgt as reference group
# which is the absence of all three indicators!
# NOTE: The variable we are recoding is continuous and contains
# missing values. As long as those values are correctly
# reflected in the original variable, the coding scheme
# below will correctly handle NAs so that a missing BMI
# Will be coded as follows:
# Obs u.wgt o.wgt obese BMI
# 1 0 1 0 26.97
# 2 NA NA NA NA
# Create the indicator variables using the same ifelse approach as above
FramBMI$u.wgt<-ifelse(FramBMI$BMI>0 & FramBMI$BMI<18.5, 1, 0)
FramBMI$o.wgt<-ifelse(FramBMI$BMI>=25 & FramBMI$BMI<30, 1, 0)
FramBMI$obese<-ifelse(FramBMI$BMI>=30, 1, 0)
# check your recoding visually
head(FramBMI[c("u.wgt","o.wgt","obese","BMI")],5)
## u.wgt o.wgt obese BMI
## 1 0 1 0 26.97
## 2 0 1 0 28.73
## 3 0 1 0 25.34
## 4 0 1 0 28.58
## 5 0 0 0 23.10
tail(FramBMI[c("u.wgt","o.wgt","obese","BMI")],5)
## u.wgt o.wgt obese BMI
## 4430 0 0 0 22.00
## 4431 0 0 0 19.16
## 4432 0 0 0 21.47
## 4433 0 1 0 25.60
## 4434 0 0 0 20.91
# table() can be used to verify that missings have been
# correctly recoded
table(FramBMI$u.wgt,useNA = "ifany")
##
## 0 1 <NA>
## 4358 57 19
table(FramBMI$o.wgt,useNA = "ifany")
##
## 0 1 <NA>
## 2570 1845 19
table(FramBMI$obese,useNA = "ifany")
##
## 0 1 <NA>
## 3838 577 19
fit.mlr3 <- lm(SYSBP ~ u.wgt + o.wgt + obese + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr3)
##
## Call:
## lm(formula = SYSBP ~ u.wgt + o.wgt + obese + AGE + MALE + BPMEDS,
## data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.67507 1.73970 48.672 < 2e-16 ***
## u.wgt -6.46970 2.58809 -2.500 0.0125 *
## o.wgt 7.20117 0.64426 11.177 < 2e-16 ***
## obese 14.85372 0.92743 16.016 < 2e-16 ***
## AGE 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# RECODING BMI AS CATEGORICAL (OPTION 2 -- CHARACTER VARIABLE)
# Now lets use a different approach to code categorical
# recoding tutorial
# https://www.theanalysisfactor.com/r-tutorial-recoding-values/
# Categorical BMI making it 4 levels like this.
# We use the numerical prefixes to control which category is
# used as the reference by R -- noting that it selects the lowest
# alphanumeric. The nested ifelse approach shown allows us to create
# a single character variable and the last NA codes all other values
# to missing as we did above for the MALE variable
# 0 = 0-normal
# 1 = 1-Under
# 2 = 2-Over
# 3 = 3-Obese
FramBMI$cBMI <- ifelse(FramBMI$BMI>0 & FramBMI$BMI<18.5, "1-Under",
ifelse(FramBMI$BMI>=18.5 & FramBMI$BMI<25, "0-Norm",
ifelse(FramBMI$BMI>=25 & FramBMI$BMI<30, "2-Over",
ifelse(FramBMI$BMI>=30, "3-Obese",NA))))
# check your recoding visually
head(FramBMI[c("cBMI","BMI")],5)
## cBMI BMI
## 1 2-Over 26.97
## 2 2-Over 28.73
## 3 2-Over 25.34
## 4 2-Over 28.58
## 5 0-Norm 23.10
tail(FramBMI[c("cBMI","BMI")],5)
## cBMI BMI
## 4430 0-Norm 22.00
## 4431 0-Norm 19.16
## 4432 0-Norm 21.47
## 4433 2-Over 25.60
## 4434 0-Norm 20.91
# table can be used to verify that missings have been
# correctly recoded
summary(FramBMI$BMI)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.54 23.09 25.45 25.85 28.09 56.80 19
table(FramBMI$cBMI,useNA = "ifany")
##
## 0-Norm 1-Under 2-Over 3-Obese <NA>
## 1936 57 1845 577 19
# Now fit this model -- fit.mlr4
fit.mlr4 <- lm(SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr4)
##
## Call:
## lm(formula = SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.67507 1.73970 48.672 < 2e-16 ***
## cBMI1-Under -6.46970 2.58809 -2.500 0.0125 *
## cBMI2-Over 7.20117 0.64426 11.177 < 2e-16 ***
## cBMI3-Obese 14.85372 0.92743 16.016 < 2e-16 ***
## AGE 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# Creating categorical BMI using FACTOR variable
# The power of factor variables in R
# Take the original numeric BMI
# and we will change it directly to a factor
# use cut() function to divide numeric ranges into categories
# use include.lowest and right to ensure proper membership in
# each interval.
# include.lowest=TRUE -- intervals are closed on the left
# right = TRUE -- intervals are closed on the right
FramBMI$f.BMI <- cut(FramBMI$BMI,breaks = c(0,18.49,25,29.99,90),
include.lowest=TRUE,right=TRUE,
labels = c("Under","Normal","Over","Obese"))
summary(FramBMI$f.BMI)
## Under Normal Over Obese NA's
## 57 1936 1845 577 19
levels(FramBMI$f.BMI)
## [1] "Under" "Normal" "Over" "Obese"
# we note that the reference level is under and not normal
# We can change the reference level of f.BMI to be normal
# using the relevel function
# syntax: new.variable <- relevel(old.variable, ref="c")
# Note that the relevel function works with
# factor variables
# use relevel to change reference to normal
f.BMI.test <-relevel(FramBMI$f.BMI,ref="Normal")
# check that things changed as expected
levels(f.BMI.test)
## [1] "Normal" "Under" "Over" "Obese"
levels(FramBMI$f.BMI)
## [1] "Under" "Normal" "Over" "Obese"
# yep -- now resave test as f.BMI
FramBMI$f.BMI <- f.BMI.test
# check your recoding visually
head(FramBMI[c("f.BMI","BMI")],5)
## f.BMI BMI
## 1 Over 26.97
## 2 Over 28.73
## 3 Over 25.34
## 4 Over 28.58
## 5 Normal 23.10
tail(FramBMI[c("f.BMI","BMI")],5)
## f.BMI BMI
## 4430 Normal 22.00
## 4431 Normal 19.16
## 4432 Normal 21.47
## 4433 Over 25.60
## 4434 Normal 20.91
# table can be used to verify that missings have been
# correctly recoded
table(FramBMI$f.BMI,useNA = "ifany")
##
## Normal Under Over Obese <NA>
## 1936 57 1845 577 19
# Now fit this model -- fit.mlr5
fit.mlr5 <- lm(SYSBP ~ f.BMI + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr5)
##
## Call:
## lm(formula = SYSBP ~ f.BMI + AGE + MALE + BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.67507 1.73970 48.672 < 2e-16 ***
## f.BMIUnder -6.46970 2.58809 -2.500 0.0125 *
## f.BMIOver 7.20117 0.64426 11.177 < 2e-16 ***
## f.BMIObese 14.85372 0.92743 16.016 < 2e-16 ***
## AGE 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# In order to get the overall p-value for the effect of the
# categorical BMI for any of the codings we have used,
# we can use the likelihood ratio test
# First time you use lmtest, you must
# install the package!
# install.packages("lmtest")
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric

# with this test, you are comparing two nested
# models
# FULL model includes the categorical BMI
# REDUCED model leaves it out but is otherwise the same
# lets use the factor version we just created
# we have to use the subset function here because
# there is missing data in the f.BMI variable
# For LRT test, both models need to use the same observations
FULL.model <- fit.mlr5 <- lm(SYSBP ~ f.BMI + AGE + MALE + BPMEDS,
data = FramBMI)
REDUCED.model <- lm(SYSBP ~ AGE + MALE + BPMEDS,
data = FramBMI,subset=!is.na(f.BMI))
# now compare models
lrtest(FULL.model,REDUCED.model)
## Likelihood ratio test
##
## Model 1: SYSBP ~ f.BMI + AGE + MALE + BPMEDS
## Model 2: SYSBP ~ AGE + MALE + BPMEDS
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 -19049
## 2 5 -19200 -3 301.25 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the resulting p-value is very small and significant.
# Use car package Anova function to determine overall
# significance of f.BMI
# For linear regression the lrtest and Anova will provide
# equivalent tests for the overall effect of BMI
# install.packages("car")
# recal the fit.mlr5 is as follows:
fit.mlr5 <- lm(SYSBP ~ f.BMI + AGE + MALE + BPMEDS,data = FramBMI)
# lets reorder terms to make a point about sequential sums of squares
fit.mlr5R <- lm(SYSBP ~ AGE + MALE + BPMEDS+ f.BMI,data = FramBMI)
# The Anova function in the car package defaults to
# adjusted sums of squares which is what we want
# thus fit.mlr5 & fit,ml5R yield the same answer
# same sums of squares
library(car)
## Loading required package: carData
Anova(fit.mlr5)
## Anova Table (Type II tests)
##
## Response: SYSBP
## Sum Sq Df F value Pr(>F)
## f.BMI 115279 3 103.81 < 2.2e-16 ***
## AGE 239647 1 647.39 < 2.2e-16 ***
## MALE 5986 1 16.17 5.889e-05 ***
## BPMEDS 84268 1 227.64 < 2.2e-16 ***
## Residuals 1609157 4347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(fit.mlr5R)
## Anova Table (Type II tests)
##
## Response: SYSBP
## Sum Sq Df F value Pr(>F)
## AGE 239647 1 647.39 < 2.2e-16 ***
## MALE 5986 1 16.17 5.889e-05 ***
## BPMEDS 84268 1 227.64 < 2.2e-16 ***
## f.BMI 115279 3 103.81 < 2.2e-16 ***
## Residuals 1609157 4347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# we have used the anova (small a) before
# but this provides sequential sums of squares
# compare fit.mlr5 & fit.mlr5R below
anova(fit.mlr5)
## Analysis of Variance Table
##
## Response: SYSBP
## Df Sum Sq Mean Sq F value Pr(>F)
## f.BMI 3 183891 61297 165.589 < 2.2e-16 ***
## AGE 1 283812 283812 766.693 < 2.2e-16 ***
## MALE 1 8868 8868 23.957 1.021e-06 ***
## BPMEDS 1 84268 84268 227.643 < 2.2e-16 ***
## Residuals 4347 1609157 370
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit.mlr5R)
## Analysis of Variance Table
##
## Response: SYSBP
## Df Sum Sq Mean Sq F value Pr(>F)
## AGE 1 341363 341363 922.164 < 2.2e-16 ***
## MALE 1 3758 3758 10.151 0.001452 **
## BPMEDS 1 100439 100439 271.328 < 2.2e-16 ***
## f.BMI 3 115279 38426 103.805 < 2.2e-16 ***
## Residuals 4347 1609157 370
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the sums of squares are different because anova
# provides sequential Sums of squares
# CENTERING INTERCEPT IN SYSBP MODEL
# use scale() function to center continuous variables
# recall fit.mlr5 coefficients
summary(fit.mlr5)
##
## Call:
## lm(formula = SYSBP ~ f.BMI + AGE + MALE + BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.67507 1.73970 48.672 < 2e-16 ***
## f.BMIUnder -6.46970 2.58809 -2.500 0.0125 *
## f.BMIOver 7.20117 0.64426 11.177 < 2e-16 ***
## f.BMIObese 14.85372 0.92743 16.016 < 2e-16 ***
## AGE 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# Now fit this model with centered age -- fit.mlr6
fit.mlr6 <- lm(SYSBP ~ f.BMI + scale(AGE,scale=FALSE) + MALE +
BPMEDS, data = FramBMI)
summary(fit.mlr6)
##
## Call:
## lm(formula = SYSBP ~ f.BMI + scale(AGE, scale = FALSE) + MALE +
## BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 128.25471 0.49548 258.850 < 2e-16 ***
## f.BMIUnder -6.46970 2.58809 -2.500 0.0125 *
## f.BMIOver 7.20117 0.64426 11.177 < 2e-16 ***
## f.BMIObese 14.85372 0.92743 16.016 < 2e-16 ***
## scale(AGE, scale = FALSE) 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# WHAT IF WE DECIDE TO CHANGE REFERENCE LEVEL
# FROM NORMAL TO OBESE -- WHAT HAPPENS TO MODEL
# COEFFICIENTS?
# use relevel to change reference to OBESE
FramBMI$obese.f.BMI <-relevel(FramBMI$f.BMI,ref="Obese")
# check that things changed as expected
levels(FramBMI$obese.f.BMI)
## [1] "Obese" "Normal" "Under" "Over"
levels(FramBMI$f.BMI)
## [1] "Normal" "Under" "Over" "Obese"
# Here is the original factor version of BMI with
# reference as "Normal" and centered intercept
fit.mlr6 <- lm(SYSBP ~ f.BMI + scale(AGE,scale=FALSE) + MALE +
BPMEDS, data = FramBMI)
summary(fit.mlr6)
##
## Call:
## lm(formula = SYSBP ~ f.BMI + scale(AGE, scale = FALSE) + MALE +
## BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 128.25471 0.49548 258.850 < 2e-16 ***
## f.BMIUnder -6.46970 2.58809 -2.500 0.0125 *
## f.BMIOver 7.20117 0.64426 11.177 < 2e-16 ***
## f.BMIObese 14.85372 0.92743 16.016 < 2e-16 ***
## scale(AGE, scale = FALSE) 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# now use new BMI variable with new reference
fit.mlr7 <- lm(SYSBP ~ obese.f.BMI + scale(AGE,scale=FALSE) + MALE +
BPMEDS, data = FramBMI)
summary(fit.mlr7)
##
## Call:
## lm(formula = SYSBP ~ obese.f.BMI + scale(AGE, scale = FALSE) +
## MALE + BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143.10843 0.85198 167.971 < 2e-16 ***
## obese.f.BMINormal -14.85372 0.92743 -16.016 < 2e-16 ***
## obese.f.BMIUnder -21.32341 2.67961 -7.958 2.22e-15 ***
## obese.f.BMIOver -7.65254 0.92892 -8.238 2.29e-16 ***
## scale(AGE, scale = FALSE) 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# Here is the releveled factor version with
# reference as "3-Obese"
obese.f.BMI.fit <- lm(SYSBP ~ obese.f.BMI + scale(AGE,scale=FALSE)
+ MALE + BPMEDS, data = FramBMI)
summary(obese.f.BMI.fit)
##
## Call:
## lm(formula = SYSBP ~ obese.f.BMI + scale(AGE, scale = FALSE) +
## MALE + BPMEDS, data = FramBMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.521 -12.845 -2.209 9.979 139.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143.10843 0.85198 167.971 < 2e-16 ***
## obese.f.BMINormal -14.85372 0.92743 -16.016 < 2e-16 ***
## obese.f.BMIUnder -21.32341 2.67961 -7.958 2.22e-15 ***
## obese.f.BMIOver -7.65254 0.92892 -8.238 2.29e-16 ***
## scale(AGE, scale = FALSE) 0.87289 0.03431 25.444 < 2e-16 ***
## MALE -2.40472 0.59802 -4.021 5.89e-05 ***
## BPMEDS 25.02151 1.65839 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.24 on 4347 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.2585, Adjusted R-squared: 0.2574
## F-statistic: 252.5 on 6 and 4347 DF, p-value: < 2.2e-16
# Nice UCLA tutorial on R contrast coding options
# https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
# Note you can easily convert a character variable
# to a factor variable with the as.factor function
# We can create a new factor version of BMI in
# the FramBMI dataframe
FramBMI$fcBMI <- as.factor(FramBMI$cBMI)
# lets make sure that worked
str(FramBMI$cBMI)
## chr [1:4434] "2-Over" "2-Over" "2-Over" "2-Over" "0-Norm" "3-Obese" ...
str(FramBMI$fcBMI)
## Factor w/ 4 levels "0-Norm","1-Under",..: 3 3 3 3 1 4 4 1 3 1 ...
# we can check the reference level of a factor variable
# with the levels function -- and see that because
# of our numeric coding of cBMI "0-Norm" remains the
# reference category
levels(FramBMI$fcBMI)
## [1] "0-Norm" "1-Under" "2-Over" "3-Obese"
##### END OF PROGRAM