#EXAMPLE 1 #install MASS packages

library(MASS)
pairs(Rubber)

#run MLR with loss as response and tensile strength/hardness as regressors

mlrfit <- lm(loss ~ hard + tens, data=Rubber)
summary(mlrfit)
## 
## Call:
## lm(formula = loss ~ hard + tens, data = Rubber)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -79.385 -14.608   3.816  19.755  65.981 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 885.1611    61.7516  14.334 3.84e-14 ***
## hard         -6.5708     0.5832 -11.267 1.03e-11 ***
## tens         -1.3743     0.1943  -7.073 1.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.49 on 27 degrees of freedom
## Multiple R-squared:  0.8402, Adjusted R-squared:  0.8284 
## F-statistic:    71 on 2 and 27 DF,  p-value: 1.767e-11

EXAMPLE 2

loading data in

data<-read.csv("heart failure.csv", header = TRUE) # import the data
# head(data, 10)

#Run the multiple regression model

modhrt <- lm(length_of_stay_days ~ age + sex + bmi + heart_rate_admit +
diastolic_bp_admit + systolic_bp_admit, data = data)
summary(modhrt)
## 
## Call:
## lm(formula = length_of_stay_days ~ age + sex + bmi + heart_rate_admit + 
##     diastolic_bp_admit + systolic_bp_admit, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1543 -1.2494 -0.3386  0.8314 16.6321 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.4986679  0.1549770  22.575   <2e-16 ***
## age                 0.0240388  0.0010229  23.501   <2e-16 ***
## sexMale             0.0116105  0.0215740   0.538    0.590    
## bmi                 0.0028588  0.0017939   1.594    0.111    
## heart_rate_admit    0.0186253  0.0005998  31.054   <2e-16 ***
## diastolic_bp_admit -0.0181577  0.0009381 -19.356   <2e-16 ***
## systolic_bp_admit  -0.0110428  0.0005737 -19.248   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.866 on 29993 degrees of freedom
## Multiple R-squared:  0.09997,    Adjusted R-squared:  0.09979 
## F-statistic: 555.2 on 6 and 29993 DF,  p-value: < 2.2e-16

Overall ANOVA (F-test for full model)

anova(modhrt)
## Analysis of Variance Table
## 
## Response: length_of_stay_days
##                       Df Sum Sq Mean Sq   F value Pr(>F)    
## age                    1   4180  4179.6 1200.2364 <2e-16 ***
## sex                    1      1     1.1    0.3056 0.5804    
## bmi                    1      9     8.9    2.5448 0.1107    
## heart_rate_admit       1   4559  4559.4 1309.3065 <2e-16 ***
## diastolic_bp_admit     1   1562  1561.7  448.4583 <2e-16 ***
## systolic_bp_admit      1   1290  1290.1  370.4842 <2e-16 ***
## Residuals          29993 104446     3.5                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#ANOVA for a subset of coefficients

# Full model
modhrt <- lm(length_of_stay_days ~ age + sex + bmi + heart_rate_admit +
diastolic_bp_admit + systolic_bp_admit, data = data)
# Reduced model (removing sex and bmi)
reduced_model<- lm(length_of_stay_days ~ age + heart_rate_admit +
diastolic_bp_admit + systolic_bp_admit, data = data)
# Compare models (partial F-test)
anova(reduced_model, modhrt)
## Analysis of Variance Table
## 
## Model 1: length_of_stay_days ~ age + heart_rate_admit + diastolic_bp_admit + 
##     systolic_bp_admit
## Model 2: length_of_stay_days ~ age + sex + bmi + heart_rate_admit + diastolic_bp_admit + 
##     systolic_bp_admit
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1  29995 104455                           
## 2  29993 104446  2    9.6815 1.3901 0.2491

#Using pf() for F distribution probabilites

#right-tail probability P(F > 3.5)
pf(3.5, df1 = 5, df2 = 10, lower.tail = FALSE)
## [1] 0.04348504
# Right-tail probability P(F > 1.3901)
pf(1.3901, df1 = 2, df2 = 5, lower.tail = FALSE)
## [1] 0.3310916
# Right-tail probability P(F > 8.2)
pf(8.2, df1 = 5, df2 = 10, lower.tail = FALSE)
## [1] 0.002602667
# Right-tail probability P(F > 16.4)
pf(16.4, df1 = 2, df2 = 5, lower.tail = FALSE)
## [1] 0.006363492

Assignment/Activities to complete

1. Repeat the steps in Example 1 by trying an appropriate transformation of the variables ‘loss’, ‘hard’ or ‘tens’.

#R^2 must be >0.8284

a. Display the transformed data using a scatterplot array, and specify a multiple linear regression model for predicting (transformed) loss from the predictors ‘hard’ and ‘tens’.

library(MASS)

mlrfitnew <- lm(loss ~ hard + log(tens), data=Rubber)
summary(mlrfitnew)
## 
## Call:
## lm(formula = loss ~ hard + log(tens), data = Rubber)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.746 -14.567   2.037  18.552  66.132 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1919.9258   196.7864   9.756 2.40e-10 ***
## hard          -6.7153     0.5878 -11.424 7.58e-12 ***
## log(tens)   -245.9099    34.6141  -7.104 1.22e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.38 on 27 degrees of freedom
## Multiple R-squared:  0.8412, Adjusted R-squared:  0.8294 
## F-statistic: 71.49 on 2 and 27 DF,  p-value: 1.634e-11
pairs(data.frame(
  loss = Rubber$loss, hard = Rubber$hard, logtens = log(Rubber$tens)))

b. Run the regression, and report the Summary of the model.

mlrfitnew <- lm(loss ~ hard + log(tens), data=Rubber)
summary(mlrfitnew)
## 
## Call:
## lm(formula = loss ~ hard + log(tens), data = Rubber)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.746 -14.567   2.037  18.552  66.132 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1919.9258   196.7864   9.756 2.40e-10 ***
## hard          -6.7153     0.5878 -11.424 7.58e-12 ***
## log(tens)   -245.9099    34.6141  -7.104 1.22e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.38 on 27 degrees of freedom
## Multiple R-squared:  0.8412, Adjusted R-squared:  0.8294 
## F-statistic: 71.49 on 2 and 27 DF,  p-value: 1.634e-11

c. Report the “overall significance” of the model

Overall the model is found to be very significant since the p-value = 1.63e-11 << 0.05

d. Write down the fitted model.

\[ loss = 1919.9258 - 6.7153 * hard - 245.9099 * log(tens) \]

e. Report the 90% confidence intervals for regression coefficients.

confint(mlrfitnew, level = 0.90)
##                     5 %        95 %
## (Intercept) 1584.741700 2255.109828
## hard          -7.716515   -5.714063
## log(tens)   -304.867664 -186.952187

2. Repeat the steps in Example 2 by trying an appropriate transformation of the dependent variable. Now use any five (5) variables from: “age”, “diabetes”, “hypertension”, “chronic_kidney_disease”, “creatinine_mg_dl”, “sodium_mmol_l”, “lvef_percent” “prior_inpatient_admits_12m”, “icu_stay” as predictors.

a. Run the multiple regression model as your “full model”.

modhrtnew <- lm(log(length_of_stay_days) ~ age + diabetes + hypertension + icu_stay +
prior_inpatient_admits_12m, data = data)

summary(modhrtnew)
## 
## Call:
## lm(formula = log(length_of_stay_days) ~ age + diabetes + hypertension + 
##     icu_stay + prior_inpatient_admits_12m, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.46827 -0.21642 -0.01106  0.24712  1.34312 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.7861271  0.0145008  54.213   <2e-16 ***
## age                        0.0052185  0.0002044  25.528   <2e-16 ***
## diabetes                   0.0106082  0.0046248   2.294   0.0218 *  
## hypertension               0.0084551  0.0054639   1.547   0.1218    
## icu_stay                   0.5097645  0.0057715  88.324   <2e-16 ***
## prior_inpatient_admits_12m 0.0958397  0.0022566  42.471   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3707 on 29994 degrees of freedom
## Multiple R-squared:  0.2761, Adjusted R-squared:  0.276 
## F-statistic:  2288 on 5 and 29994 DF,  p-value: < 2.2e-16

b. Remove any insignificant variable(s) and rerun the model as “reduced model”. Which Sig level are we at? 95 or 90?

modhrt_newreduced <- lm(log(length_of_stay_days) ~ age + diabetes + icu_stay + prior_inpatient_admits_12m, data = data)

summary(modhrt_newreduced)
## 
## Call:
## lm(formula = log(length_of_stay_days) ~ age + diabetes + icu_stay + 
##     prior_inpatient_admits_12m, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.46724 -0.21758 -0.01173  0.24472  1.34430 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.7885308  0.0144177  54.692   <2e-16 ***
## age                        0.0052756  0.0002011  26.238   <2e-16 ***
## diabetes                   0.0106748  0.0046247   2.308    0.021 *  
## icu_stay                   0.5097844  0.0057717  88.325   <2e-16 ***
## prior_inpatient_admits_12m 0.0961035  0.0022502  42.709   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3707 on 29995 degrees of freedom
## Multiple R-squared:  0.276,  Adjusted R-squared:  0.2759 
## F-statistic:  2859 on 4 and 29995 DF,  p-value: < 2.2e-16

c. If all the variables in the “full model” are significant, then add three (3) more variables from the list above into the model, now this new model become your “full model”, while the initial model becomes your “reduced model”.

d. Use “anova” to test the importance of the removed/added variables.

anova(modhrt_newreduced,modhrtnew)
## Analysis of Variance Table
## 
## Model 1: log(length_of_stay_days) ~ age + diabetes + icu_stay + prior_inpatient_admits_12m
## Model 2: log(length_of_stay_days) ~ age + diabetes + hypertension + icu_stay + 
##     prior_inpatient_admits_12m
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1  29995 4122.3                           
## 2  29994 4121.9  1   0.32907 2.3946 0.1218

e. Report your findings

The full multiple regression model used age, diabetes, hypertension, icu stay, and prior patient admissions in the past 12 months. Based on the initial MLR model with these predictors, hypertension was not found to be significant predictors of the transformed response in log(length of stay). This variables were then removed and the anova model found that icu stay, prior patient admissions, and age were all significant predictors of the response of log(length of stay). For the anova model, since p = 0.1218 > 0.05 we find that removing the variable hypertension did not create a significant difference between the models and can be removed to effectively simplify the model.