# ~ CRP 245 Part 2 Day 2 ~
# Framingham Data
# Dataset: framBMI

# The Framingham Study is a longitudinal investigation of 
# constitutional and environmental factors influencing the 
# development of CVD in men and women. Examination of participants 
# has taken place every two years and the cohort has been followed 
# for morbidity and mortality over that time period.

# Data Dictionary: 
# SYSBP        systolic blood pressure (mean of last 
#              two of three measurements)(mmHg)
# BMI          Body Mass Index
# AGE          Age at exam in years
# SEX;         Sex at Birth; 1 = Male; 2 = Female
# BPMEDS       Antihypertensive medication use at exam
#              (0=not currently using; 1=current use)
# DIABP        diastolic blood pressure (mean of last 
#              two of three measurements)(mmHg)
# DIABETES     
# CURSMOKE     Current cigarette smoking at exam 
#              (0=not current smoker; 1=current smoker)
# CIGPDAY      0=not current smoker; 1-99 cig per day
# TOTCHOL      Serum Total Cholesterol (mg/dL)

# Download and load the FramBMI dataset: 
load(url("http://www.duke.edu/~sgrambow/crp241data/FramBMI.RData"))

# examine data structure
str(FramBMI)
## 'data.frame':    4434 obs. of  10 variables:
##  $ SYSBP   : num  106 121 128 150 130 ...
##  $ BMI     : num  27 28.7 25.3 28.6 23.1 ...
##  $ AGE     : int  39 46 48 61 46 43 63 45 52 43 ...
##  $ SEX     : int  1 2 1 2 2 2 2 2 1 1 ...
##  $ BPMEDS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DIABP   : num  70 81 80 95 84 110 71 71 89 107 ...
##  $ DIABETES: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CURSMOKE: int  0 0 1 1 1 0 0 1 0 1 ...
##  $ CIGPDAY : int  0 0 20 30 23 0 0 20 0 30 ...
##  $ TOTCHOL : int  195 250 245 225 285 228 205 313 260 225 ...
# summary statistics
summary(FramBMI)
##      SYSBP            BMI             AGE             SEX       
##  Min.   : 83.5   Min.   :15.54   Min.   :32.00   Min.   :1.000  
##  1st Qu.:117.5   1st Qu.:23.09   1st Qu.:42.00   1st Qu.:1.000  
##  Median :129.0   Median :25.45   Median :49.00   Median :2.000  
##  Mean   :132.9   Mean   :25.85   Mean   :49.93   Mean   :1.562  
##  3rd Qu.:144.0   3rd Qu.:28.09   3rd Qu.:57.00   3rd Qu.:2.000  
##  Max.   :295.0   Max.   :56.80   Max.   :70.00   Max.   :2.000  
##                  NA's   :19                                     
##      BPMEDS            DIABP           DIABETES          CURSMOKE     
##  Min.   :0.00000   Min.   : 48.00   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.: 75.00   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median : 82.00   Median :0.00000   Median :0.0000  
##  Mean   :0.03293   Mean   : 83.08   Mean   :0.02729   Mean   :0.4919  
##  3rd Qu.:0.00000   3rd Qu.: 90.00   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :142.50   Max.   :1.00000   Max.   :1.0000  
##  NA's   :61                                                           
##     CIGPDAY          TOTCHOL   
##  Min.   : 0.000   Min.   :107  
##  1st Qu.: 0.000   1st Qu.:206  
##  Median : 0.000   Median :234  
##  Mean   : 8.966   Mean   :237  
##  3rd Qu.:20.000   3rd Qu.:264  
##  Max.   :70.000   Max.   :696  
##  NA's   :32       NA's   :52
# distribution of SYYBP
# visualize Y with a nice looking
# boxplot that adds some extra labeling
boxplot(FramBMI$SYSBP,
        main="Boxplot of SYSBP showing variability in Outcome",
        ylab = "Outcome (SYSBP)")
# add mean to boxplot with points function
points(mean(FramBMI$SYSBP),cex=2,pch=16,col="red")
# add individual points to boxplot to show spread
# using stripchart function

# add a legend
legend("topright", c("Sample Mean","Sample Median"),
       cex=0.8,lty=c(1,1),lwd=c(3,3),
       col = c("red","black",bty ="n"))

# fit a multiple linear regression model with 
# Y = systolic blood pressure (SYSBP)
# X1 = BMI (BMI)
# X2 = age (AGE)
# X3 = sex (SEX); 1 = Male; 2 = Female
# X4 = antihypertensive medication use (BPMEDS)

# ANSWER:
fit.mlr1 <- lm(SYSBP ~ BMI + AGE + SEX + BPMEDS, data = FramBMI)
summary(fit.mlr1)
## 
## Call:
## lm(formula = SYSBP ~ BMI + AGE + SEX + BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.867 -12.556  -2.297  10.121 130.723 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.55750    2.56194  18.563  < 2e-16 ***
## BMI          1.47058    0.07135  20.612  < 2e-16 ***
## AGE          0.86062    0.03386  25.420  < 2e-16 ***
## SEX          2.27589    0.58281   3.905 9.56e-05 ***
## BPMEDS      24.32760    1.63934  14.840  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.01 on 4349 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.276,  Adjusted R-squared:  0.2754 
## F-statistic: 414.6 on 4 and 4349 DF,  p-value: < 2.2e-16
# Create a new variable for SEX which is an indicator
# for male sex
# tutorial
# http://sphweb.bumc.bu.edu/otlt/MPH-Modules/PH717-QuantCore/PH717_MultipleVariableRegression/PH717_MultipleVariableRegression4.html

# we will use the ifelse function -- format is ifelse(a,b,c)
# a is the condition of interest (here male which is 1)
# b is the value you want to assign to those who have the 
# condition of interest (we want to replace 1 with a 1).
# c is the value for those who do not have the condition;
# here that is 2=females and we map them to 0. 
# NOTE: We want to be careful about missing values so they 
# are not incorrectly coded to 0 along with females.
# The first ifelse below codes males from 1 to 1 (keeping them as 1's)
# The second ifelse codes females from 2 to 0
# The third ifelse codes everything else to missing.
# We add this as a security measure to make sure we dont
# accidentally code other values or missing values to 0
# (counting them as females)
# any missing values?
summary(FramBMI$SEX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.562   2.000   2.000
# NO
FramBMI$MALE <- ifelse(FramBMI$SEX==1,1,
                     ifelse(FramBMI$SEX==2,0,
                            NA))
# Check recoding:
head(FramBMI[c("SEX","MALE")],5)
##   SEX MALE
## 1   1    1
## 2   2    0
## 3   1    1
## 4   2    0
## 5   2    0
tail(FramBMI[c("SEX","MALE")],5)
##      SEX MALE
## 4430   2    0
## 4431   2    0
## 4432   2    0
## 4433   1    1
## 4434   2    0
table(FramBMI$SEX,FramBMI$MALE,useNA = "ifany")
##    
##        0    1
##   1    0 1944
##   2 2490    0
fit.mlr2 <- lm(SYSBP ~ BMI + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr2)
## 
## Call:
## lm(formula = SYSBP ~ BMI + AGE + MALE + BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.867 -12.556  -2.297  10.121 130.723 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.10927    2.35264  22.149  < 2e-16 ***
## BMI          1.47058    0.07135  20.612  < 2e-16 ***
## AGE          0.86062    0.03386  25.420  < 2e-16 ***
## MALE        -2.27589    0.58281  -3.905 9.56e-05 ***
## BPMEDS      24.32760    1.63934  14.840  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.01 on 4349 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.276,  Adjusted R-squared:  0.2754 
## F-statistic: 414.6 on 4 and 4349 DF,  p-value: < 2.2e-16
# Compare the beta coefficient in fit.mlr for SEX
# with the beta coefficient in fit.mlr2 for MALE
# fit.mlr -- SEX is 2.27589
# fit.mlr2 -- MALE is -2.27589
# they are exactly opposite and have the same p-value
# fit.mlr is modeling women and fit.mlr2 is modeling men
# which results in a sign flip.

# RECODING BMI AS CATEGORICAL (OPTION 1 -- INDICATOR VARIABLES)
# Lets recode BMI as a categorical variable with categories
# Underweight
# Normal
# Overweight
# Obese
# Lets make normal weight the reference category
# We need 3 indicator variables for a 4-level variable
# BMI Cate  Underweight  Normal  Overweight  Obese
#  u.wgt         1         0         0         0
#  o.wgt         0         0         1         0
#  obese         0         0         0         1
#
# NOTE: n.wgt does not meet any conditions specified above, 
#       so will have values of 0 for u.wgt, o.wgt, and obese;
#       this feature specifies n.wgt as reference group
#       which is the absence of all three indicators!
# NOTE: The variable we are recoding is continuous and contains
#       missing values.  As long as those values are correctly
#       reflected in the original variable, the coding scheme
#       below will correctly handle NAs so that a missing BMI
#       Will be coded as follows:
#  Obs      u.wgt o.wgt obese   BMI
#   1       0     1     0       26.97
#   2       NA    NA     NA     NA
# Create the indicator variables using the same ifelse approach as above
FramBMI$u.wgt<-ifelse(FramBMI$BMI>0 & FramBMI$BMI<18.5, 1, 0)
FramBMI$o.wgt<-ifelse(FramBMI$BMI>=25 & FramBMI$BMI<30, 1, 0)
FramBMI$obese<-ifelse(FramBMI$BMI>=30, 1, 0)

# check your recoding visually
head(FramBMI[c("u.wgt","o.wgt","obese","BMI")],5)
##   u.wgt o.wgt obese   BMI
## 1     0     1     0 26.97
## 2     0     1     0 28.73
## 3     0     1     0 25.34
## 4     0     1     0 28.58
## 5     0     0     0 23.10
tail(FramBMI[c("u.wgt","o.wgt","obese","BMI")],5)
##      u.wgt o.wgt obese   BMI
## 4430     0     0     0 22.00
## 4431     0     0     0 19.16
## 4432     0     0     0 21.47
## 4433     0     1     0 25.60
## 4434     0     0     0 20.91
# table() can be used to verify that missings have been 
# correctly recoded
table(FramBMI$u.wgt,useNA = "ifany")
## 
##    0    1 <NA> 
## 4358   57   19
table(FramBMI$o.wgt,useNA = "ifany")
## 
##    0    1 <NA> 
## 2570 1845   19
table(FramBMI$obese,useNA = "ifany")
## 
##    0    1 <NA> 
## 3838  577   19
fit.mlr3 <- lm(SYSBP ~ u.wgt + o.wgt + obese + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr3)
## 
## Call:
## lm(formula = SYSBP ~ u.wgt + o.wgt + obese + AGE + MALE + BPMEDS, 
##     data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 84.67507    1.73970  48.672  < 2e-16 ***
## u.wgt       -6.46970    2.58809  -2.500   0.0125 *  
## o.wgt        7.20117    0.64426  11.177  < 2e-16 ***
## obese       14.85372    0.92743  16.016  < 2e-16 ***
## AGE          0.87289    0.03431  25.444  < 2e-16 ***
## MALE        -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS      25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# RECODING BMI AS CATEGORICAL (OPTION 2 -- CHARACTER VARIABLE)
# Now lets use a different approach to code categorical
# recoding tutorial
# https://www.theanalysisfactor.com/r-tutorial-recoding-values/
# Categorical BMI making it 4 levels like this.
# We use the numerical prefixes to control which category is 
# used as the reference by R -- noting that it selects the lowest
# alphanumeric. The nested ifelse approach shown allows us to create 
# a single character variable and the last NA codes all other values
# to missing as we did above for the MALE variable
# 0 = 0-normal
# 1 = 1-Under
# 2 = 2-Over
# 3 = 3-Obese
FramBMI$cBMI <- ifelse(FramBMI$BMI>0 & FramBMI$BMI<18.5, "1-Under", 
                         ifelse(FramBMI$BMI>=18.5 & FramBMI$BMI<25, "0-Norm",
                                ifelse(FramBMI$BMI>=25 & FramBMI$BMI<30, "2-Over",
                                       ifelse(FramBMI$BMI>=30, "3-Obese",NA))))

# check your recoding visually
head(FramBMI[c("cBMI","BMI")],5)
##     cBMI   BMI
## 1 2-Over 26.97
## 2 2-Over 28.73
## 3 2-Over 25.34
## 4 2-Over 28.58
## 5 0-Norm 23.10
tail(FramBMI[c("cBMI","BMI")],5)
##        cBMI   BMI
## 4430 0-Norm 22.00
## 4431 0-Norm 19.16
## 4432 0-Norm 21.47
## 4433 2-Over 25.60
## 4434 0-Norm 20.91
# table can be used to verify that missings have been 
# correctly recoded
summary(FramBMI$BMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   15.54   23.09   25.45   25.85   28.09   56.80      19
table(FramBMI$cBMI,useNA = "ifany")
## 
##  0-Norm 1-Under  2-Over 3-Obese    <NA> 
##    1936      57    1845     577      19
# Now fit this model -- fit.mlr4
fit.mlr4 <- lm(SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr4)   
## 
## Call:
## lm(formula = SYSBP ~ cBMI + AGE + MALE + BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 84.67507    1.73970  48.672  < 2e-16 ***
## cBMI1-Under -6.46970    2.58809  -2.500   0.0125 *  
## cBMI2-Over   7.20117    0.64426  11.177  < 2e-16 ***
## cBMI3-Obese 14.85372    0.92743  16.016  < 2e-16 ***
## AGE          0.87289    0.03431  25.444  < 2e-16 ***
## MALE        -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS      25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# Creating categorical BMI using FACTOR variable
# The power of factor variables in R
# Take the original numeric BMI
# and we will change it directly to a factor
# use cut() function to divide numeric ranges into categories
# use include.lowest and right to ensure proper membership in 
# each interval.
# include.lowest=TRUE -- intervals are closed on the left
# right = TRUE -- intervals are closed on the right 
FramBMI$f.BMI <- cut(FramBMI$BMI,breaks = c(0,18.49,25,29.99,90),
                     include.lowest=TRUE,right=TRUE,
                     labels = c("Under","Normal","Over","Obese"))
summary(FramBMI$f.BMI)
##  Under Normal   Over  Obese   NA's 
##     57   1936   1845    577     19
levels(FramBMI$f.BMI)
## [1] "Under"  "Normal" "Over"   "Obese"
# we note that the reference level is under and not normal
# We can change the reference level of f.BMI to be normal
# using the relevel function
# syntax: new.variable <- relevel(old.variable, ref="c")
# Note that the relevel function works with 
# factor variables 

# use relevel to change reference to normal
f.BMI.test <-relevel(FramBMI$f.BMI,ref="Normal")

# check that things changed as expected
levels(f.BMI.test)
## [1] "Normal" "Under"  "Over"   "Obese"
levels(FramBMI$f.BMI)
## [1] "Under"  "Normal" "Over"   "Obese"
# yep -- now resave test as f.BMI
FramBMI$f.BMI <- f.BMI.test

# check your recoding visually
head(FramBMI[c("f.BMI","BMI")],5)
##    f.BMI   BMI
## 1   Over 26.97
## 2   Over 28.73
## 3   Over 25.34
## 4   Over 28.58
## 5 Normal 23.10
tail(FramBMI[c("f.BMI","BMI")],5)
##       f.BMI   BMI
## 4430 Normal 22.00
## 4431 Normal 19.16
## 4432 Normal 21.47
## 4433   Over 25.60
## 4434 Normal 20.91
# table can be used to verify that missings have been 
# correctly recoded
table(FramBMI$f.BMI,useNA = "ifany")
## 
## Normal  Under   Over  Obese   <NA> 
##   1936     57   1845    577     19
# Now fit this model -- fit.mlr5
fit.mlr5 <- lm(SYSBP ~ f.BMI + AGE + MALE + BPMEDS, data = FramBMI)
summary(fit.mlr5)  
## 
## Call:
## lm(formula = SYSBP ~ f.BMI + AGE + MALE + BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 84.67507    1.73970  48.672  < 2e-16 ***
## f.BMIUnder  -6.46970    2.58809  -2.500   0.0125 *  
## f.BMIOver    7.20117    0.64426  11.177  < 2e-16 ***
## f.BMIObese  14.85372    0.92743  16.016  < 2e-16 ***
## AGE          0.87289    0.03431  25.444  < 2e-16 ***
## MALE        -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS      25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# In order to get the overall p-value for the effect of the 
# categorical BMI for any of the codings we have used, 
# we can use the likelihood ratio test
# First time you use lmtest, you must
# install the package!
# install.packages("lmtest")
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# with this test, you are comparing two nested 
# models 
# FULL model includes the categorical BMI
# REDUCED model leaves it out but is otherwise the same
# lets use the factor version we just created
# we have to use the subset function here because
# there is missing data in the  f.BMI variable
# For LRT test, both models need to use the same observations
FULL.model <- fit.mlr5 <- lm(SYSBP ~ f.BMI + AGE + MALE + BPMEDS, 
                             data = FramBMI)
REDUCED.model <- lm(SYSBP ~ AGE + MALE + BPMEDS, 
                    data = FramBMI,subset=!is.na(f.BMI))
# now compare models
lrtest(FULL.model,REDUCED.model)
## Likelihood ratio test
## 
## Model 1: SYSBP ~ f.BMI + AGE + MALE + BPMEDS
## Model 2: SYSBP ~ AGE + MALE + BPMEDS
##   #Df LogLik Df  Chisq Pr(>Chisq)    
## 1   8 -19049                         
## 2   5 -19200 -3 301.25  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the resulting p-value is very small and significant.

# Use car package Anova function to determine overall
# significance of f.BMI
# For linear regression the lrtest and Anova will provide
# equivalent tests for the overall effect of BMI

# install.packages("car")
# recal the fit.mlr5 is as follows:
fit.mlr5 <- lm(SYSBP ~ f.BMI + AGE + MALE + BPMEDS,data = FramBMI)
# lets reorder terms to make a point about sequential sums of squares
fit.mlr5R <- lm(SYSBP ~ AGE + MALE + BPMEDS+ f.BMI,data = FramBMI)
# The Anova function in the car package defaults to 
# adjusted sums of squares which is what we want
# thus fit.mlr5 & fit,ml5R yield the same answer
# same sums of squares
library(car)
## Loading required package: carData
Anova(fit.mlr5)
## Anova Table (Type II tests)
## 
## Response: SYSBP
##            Sum Sq   Df F value    Pr(>F)    
## f.BMI      115279    3  103.81 < 2.2e-16 ***
## AGE        239647    1  647.39 < 2.2e-16 ***
## MALE         5986    1   16.17 5.889e-05 ***
## BPMEDS      84268    1  227.64 < 2.2e-16 ***
## Residuals 1609157 4347                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(fit.mlr5R)
## Anova Table (Type II tests)
## 
## Response: SYSBP
##            Sum Sq   Df F value    Pr(>F)    
## AGE        239647    1  647.39 < 2.2e-16 ***
## MALE         5986    1   16.17 5.889e-05 ***
## BPMEDS      84268    1  227.64 < 2.2e-16 ***
## f.BMI      115279    3  103.81 < 2.2e-16 ***
## Residuals 1609157 4347                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# we have used the anova (small a) before
# but this provides sequential sums of squares
# compare fit.mlr5 & fit.mlr5R below
anova(fit.mlr5)
## Analysis of Variance Table
## 
## Response: SYSBP
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## f.BMI        3  183891   61297 165.589 < 2.2e-16 ***
## AGE          1  283812  283812 766.693 < 2.2e-16 ***
## MALE         1    8868    8868  23.957 1.021e-06 ***
## BPMEDS       1   84268   84268 227.643 < 2.2e-16 ***
## Residuals 4347 1609157     370                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit.mlr5R)
## Analysis of Variance Table
## 
## Response: SYSBP
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## AGE          1  341363  341363 922.164 < 2.2e-16 ***
## MALE         1    3758    3758  10.151  0.001452 ** 
## BPMEDS       1  100439  100439 271.328 < 2.2e-16 ***
## f.BMI        3  115279   38426 103.805 < 2.2e-16 ***
## Residuals 4347 1609157     370                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the sums of squares are different because anova
# provides sequential Sums of squares

# CENTERING INTERCEPT IN SYSBP MODEL
# use scale() function to center continuous variables
# recall fit.mlr5 coefficients
summary(fit.mlr5)
## 
## Call:
## lm(formula = SYSBP ~ f.BMI + AGE + MALE + BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 84.67507    1.73970  48.672  < 2e-16 ***
## f.BMIUnder  -6.46970    2.58809  -2.500   0.0125 *  
## f.BMIOver    7.20117    0.64426  11.177  < 2e-16 ***
## f.BMIObese  14.85372    0.92743  16.016  < 2e-16 ***
## AGE          0.87289    0.03431  25.444  < 2e-16 ***
## MALE        -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS      25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# Now fit this model with centered age -- fit.mlr6
fit.mlr6 <- lm(SYSBP ~ f.BMI + scale(AGE,scale=FALSE) + MALE +
                 BPMEDS, data = FramBMI)
summary(fit.mlr6) 
## 
## Call:
## lm(formula = SYSBP ~ f.BMI + scale(AGE, scale = FALSE) + MALE + 
##     BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               128.25471    0.49548 258.850  < 2e-16 ***
## f.BMIUnder                 -6.46970    2.58809  -2.500   0.0125 *  
## f.BMIOver                   7.20117    0.64426  11.177  < 2e-16 ***
## f.BMIObese                 14.85372    0.92743  16.016  < 2e-16 ***
## scale(AGE, scale = FALSE)   0.87289    0.03431  25.444  < 2e-16 ***
## MALE                       -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS                     25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# WHAT IF WE DECIDE TO CHANGE REFERENCE LEVEL
# FROM NORMAL TO OBESE -- WHAT HAPPENS TO MODEL
# COEFFICIENTS?

# use relevel to change reference to OBESE
FramBMI$obese.f.BMI <-relevel(FramBMI$f.BMI,ref="Obese")

# check that things changed as expected
levels(FramBMI$obese.f.BMI)
## [1] "Obese"  "Normal" "Under"  "Over"
levels(FramBMI$f.BMI)
## [1] "Normal" "Under"  "Over"   "Obese"
# Here is the original factor version of BMI with
# reference as "Normal" and centered intercept
fit.mlr6 <- lm(SYSBP ~ f.BMI + scale(AGE,scale=FALSE) + MALE +
                 BPMEDS, data = FramBMI)
summary(fit.mlr6) 
## 
## Call:
## lm(formula = SYSBP ~ f.BMI + scale(AGE, scale = FALSE) + MALE + 
##     BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               128.25471    0.49548 258.850  < 2e-16 ***
## f.BMIUnder                 -6.46970    2.58809  -2.500   0.0125 *  
## f.BMIOver                   7.20117    0.64426  11.177  < 2e-16 ***
## f.BMIObese                 14.85372    0.92743  16.016  < 2e-16 ***
## scale(AGE, scale = FALSE)   0.87289    0.03431  25.444  < 2e-16 ***
## MALE                       -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS                     25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# now use new BMI variable with new reference
fit.mlr7 <- lm(SYSBP ~ obese.f.BMI + scale(AGE,scale=FALSE) + MALE +
                 BPMEDS, data = FramBMI)
summary(fit.mlr7)
## 
## Call:
## lm(formula = SYSBP ~ obese.f.BMI + scale(AGE, scale = FALSE) + 
##     MALE + BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               143.10843    0.85198 167.971  < 2e-16 ***
## obese.f.BMINormal         -14.85372    0.92743 -16.016  < 2e-16 ***
## obese.f.BMIUnder          -21.32341    2.67961  -7.958 2.22e-15 ***
## obese.f.BMIOver            -7.65254    0.92892  -8.238 2.29e-16 ***
## scale(AGE, scale = FALSE)   0.87289    0.03431  25.444  < 2e-16 ***
## MALE                       -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS                     25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# Here is the releveled factor version with 
# reference as "3-Obese"
obese.f.BMI.fit <- lm(SYSBP ~ obese.f.BMI + scale(AGE,scale=FALSE)
                      + MALE + BPMEDS, data = FramBMI)
summary(obese.f.BMI.fit)
## 
## Call:
## lm(formula = SYSBP ~ obese.f.BMI + scale(AGE, scale = FALSE) + 
##     MALE + BPMEDS, data = FramBMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.521 -12.845  -2.209   9.979 139.606 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               143.10843    0.85198 167.971  < 2e-16 ***
## obese.f.BMINormal         -14.85372    0.92743 -16.016  < 2e-16 ***
## obese.f.BMIUnder          -21.32341    2.67961  -7.958 2.22e-15 ***
## obese.f.BMIOver            -7.65254    0.92892  -8.238 2.29e-16 ***
## scale(AGE, scale = FALSE)   0.87289    0.03431  25.444  < 2e-16 ***
## MALE                       -2.40472    0.59802  -4.021 5.89e-05 ***
## BPMEDS                     25.02151    1.65839  15.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.24 on 4347 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2574 
## F-statistic: 252.5 on 6 and 4347 DF,  p-value: < 2.2e-16
# Nice UCLA tutorial on R contrast coding options
# https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/

# Note you can easily convert a character variable
# to a factor variable with the as.factor function
# We can create a new factor version of BMI in 
# the FramBMI dataframe
FramBMI$fcBMI <- as.factor(FramBMI$cBMI)

# lets make sure that worked
str(FramBMI$cBMI)
##  chr [1:4434] "2-Over" "2-Over" "2-Over" "2-Over" "0-Norm" "3-Obese" ...
str(FramBMI$fcBMI)
##  Factor w/ 4 levels "0-Norm","1-Under",..: 3 3 3 3 1 4 4 1 3 1 ...
# we can check the reference level of a factor variable
# with the levels function -- and see that because
# of our numeric coding of cBMI "0-Norm" remains the 
# reference category
levels(FramBMI$fcBMI)
## [1] "0-Norm"  "1-Under" "2-Over"  "3-Obese"
##### END OF PROGRAM