#EXAMPLE 1 #install MASS packages
library(MASS)
pairs(Rubber)
#run MLR with loss as response and tensile strength/hardness as
regressors
mlrfit <- lm(loss ~ hard + tens, data=Rubber)
summary(mlrfit)
##
## Call:
## lm(formula = loss ~ hard + tens, data = Rubber)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79.385 -14.608 3.816 19.755 65.981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 885.1611 61.7516 14.334 3.84e-14 ***
## hard -6.5708 0.5832 -11.267 1.03e-11 ***
## tens -1.3743 0.1943 -7.073 1.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.49 on 27 degrees of freedom
## Multiple R-squared: 0.8402, Adjusted R-squared: 0.8284
## F-statistic: 71 on 2 and 27 DF, p-value: 1.767e-11
data<-read.csv("heart failure.csv", header = TRUE) # import the data
# head(data, 10)
#Run the multiple regression model
modhrt <- lm(length_of_stay_days ~ age + sex + bmi + heart_rate_admit +
diastolic_bp_admit + systolic_bp_admit, data = data)
summary(modhrt)
##
## Call:
## lm(formula = length_of_stay_days ~ age + sex + bmi + heart_rate_admit +
## diastolic_bp_admit + systolic_bp_admit, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1543 -1.2494 -0.3386 0.8314 16.6321
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.4986679 0.1549770 22.575 <2e-16 ***
## age 0.0240388 0.0010229 23.501 <2e-16 ***
## sexMale 0.0116105 0.0215740 0.538 0.590
## bmi 0.0028588 0.0017939 1.594 0.111
## heart_rate_admit 0.0186253 0.0005998 31.054 <2e-16 ***
## diastolic_bp_admit -0.0181577 0.0009381 -19.356 <2e-16 ***
## systolic_bp_admit -0.0110428 0.0005737 -19.248 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.866 on 29993 degrees of freedom
## Multiple R-squared: 0.09997, Adjusted R-squared: 0.09979
## F-statistic: 555.2 on 6 and 29993 DF, p-value: < 2.2e-16
anova(modhrt)
## Analysis of Variance Table
##
## Response: length_of_stay_days
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 4180 4179.6 1200.2364 <2e-16 ***
## sex 1 1 1.1 0.3056 0.5804
## bmi 1 9 8.9 2.5448 0.1107
## heart_rate_admit 1 4559 4559.4 1309.3065 <2e-16 ***
## diastolic_bp_admit 1 1562 1561.7 448.4583 <2e-16 ***
## systolic_bp_admit 1 1290 1290.1 370.4842 <2e-16 ***
## Residuals 29993 104446 3.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA for a subset of coefficients
# Full model
modhrt <- lm(length_of_stay_days ~ age + sex + bmi + heart_rate_admit +
diastolic_bp_admit + systolic_bp_admit, data = data)
# Reduced model (removing sex and bmi)
reduced_model<- lm(length_of_stay_days ~ age + heart_rate_admit +
diastolic_bp_admit + systolic_bp_admit, data = data)
# Compare models (partial F-test)
anova(reduced_model, modhrt)
## Analysis of Variance Table
##
## Model 1: length_of_stay_days ~ age + heart_rate_admit + diastolic_bp_admit +
## systolic_bp_admit
## Model 2: length_of_stay_days ~ age + sex + bmi + heart_rate_admit + diastolic_bp_admit +
## systolic_bp_admit
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 29995 104455
## 2 29993 104446 2 9.6815 1.3901 0.2491
#Using pf() for F distribution probabilites
#right-tail probability P(F > 3.5)
pf(3.5, df1 = 5, df2 = 10, lower.tail = FALSE)
## [1] 0.04348504
# Right-tail probability P(F > 1.3901)
pf(1.3901, df1 = 2, df2 = 5, lower.tail = FALSE)
## [1] 0.3310916
# Right-tail probability P(F > 8.2)
pf(8.2, df1 = 5, df2 = 10, lower.tail = FALSE)
## [1] 0.002602667
# Right-tail probability P(F > 16.4)
pf(16.4, df1 = 2, df2 = 5, lower.tail = FALSE)
## [1] 0.006363492
#R^2 must be >0.8284
library(MASS)
mlrfitnew <- lm(loss ~ hard + log(tens), data=Rubber)
summary(mlrfitnew)
##
## Call:
## lm(formula = loss ~ hard + log(tens), data = Rubber)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.746 -14.567 2.037 18.552 66.132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1919.9258 196.7864 9.756 2.40e-10 ***
## hard -6.7153 0.5878 -11.424 7.58e-12 ***
## log(tens) -245.9099 34.6141 -7.104 1.22e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.38 on 27 degrees of freedom
## Multiple R-squared: 0.8412, Adjusted R-squared: 0.8294
## F-statistic: 71.49 on 2 and 27 DF, p-value: 1.634e-11
pairs(data.frame(
loss = Rubber$loss, hard = Rubber$hard, logtens = log(Rubber$tens)))
mlrfitnew <- lm(loss ~ hard + log(tens), data=Rubber)
summary(mlrfitnew)
##
## Call:
## lm(formula = loss ~ hard + log(tens), data = Rubber)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.746 -14.567 2.037 18.552 66.132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1919.9258 196.7864 9.756 2.40e-10 ***
## hard -6.7153 0.5878 -11.424 7.58e-12 ***
## log(tens) -245.9099 34.6141 -7.104 1.22e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.38 on 27 degrees of freedom
## Multiple R-squared: 0.8412, Adjusted R-squared: 0.8294
## F-statistic: 71.49 on 2 and 27 DF, p-value: 1.634e-11
Overall the model is found to be very significant since the p-value = 1.63e-11 << 0.05
\[ loss = 1919.9258 - 6.7153 * hard - 245.9099 * log(tens) \]
confint(mlrfitnew, level = 0.90)
## 5 % 95 %
## (Intercept) 1584.741700 2255.109828
## hard -7.716515 -5.714063
## log(tens) -304.867664 -186.952187
modhrtnew <- lm(log(length_of_stay_days) ~ age + diabetes + hypertension + icu_stay +
prior_inpatient_admits_12m, data = data)
summary(modhrtnew)
##
## Call:
## lm(formula = log(length_of_stay_days) ~ age + diabetes + hypertension +
## icu_stay + prior_inpatient_admits_12m, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.46827 -0.21642 -0.01106 0.24712 1.34312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7861271 0.0145008 54.213 <2e-16 ***
## age 0.0052185 0.0002044 25.528 <2e-16 ***
## diabetes 0.0106082 0.0046248 2.294 0.0218 *
## hypertension 0.0084551 0.0054639 1.547 0.1218
## icu_stay 0.5097645 0.0057715 88.324 <2e-16 ***
## prior_inpatient_admits_12m 0.0958397 0.0022566 42.471 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3707 on 29994 degrees of freedom
## Multiple R-squared: 0.2761, Adjusted R-squared: 0.276
## F-statistic: 2288 on 5 and 29994 DF, p-value: < 2.2e-16
modhrt_newreduced <- lm(log(length_of_stay_days) ~ age + diabetes + icu_stay + prior_inpatient_admits_12m, data = data)
summary(modhrt_newreduced)
##
## Call:
## lm(formula = log(length_of_stay_days) ~ age + diabetes + icu_stay +
## prior_inpatient_admits_12m, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.46724 -0.21758 -0.01173 0.24472 1.34430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7885308 0.0144177 54.692 <2e-16 ***
## age 0.0052756 0.0002011 26.238 <2e-16 ***
## diabetes 0.0106748 0.0046247 2.308 0.021 *
## icu_stay 0.5097844 0.0057717 88.325 <2e-16 ***
## prior_inpatient_admits_12m 0.0961035 0.0022502 42.709 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3707 on 29995 degrees of freedom
## Multiple R-squared: 0.276, Adjusted R-squared: 0.2759
## F-statistic: 2859 on 4 and 29995 DF, p-value: < 2.2e-16
anova(modhrt_newreduced,modhrtnew)
## Analysis of Variance Table
##
## Model 1: log(length_of_stay_days) ~ age + diabetes + icu_stay + prior_inpatient_admits_12m
## Model 2: log(length_of_stay_days) ~ age + diabetes + hypertension + icu_stay +
## prior_inpatient_admits_12m
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 29995 4122.3
## 2 29994 4121.9 1 0.32907 2.3946 0.1218
The full multiple regression model used age, diabetes, hypertension, icu stay, and prior patient admissions in the past 12 months. Based on the initial MLR model with these predictors, hypertension was not found to be significant predictors of the transformed response in log(length of stay). This variables were then removed and the anova model found that icu stay, prior patient admissions, and age were all significant predictors of the response of log(length of stay). For the anova model, since p = 0.1218 > 0.05 we find that removing the variable hypertension did not create a significant difference between the models and can be removed to effectively simplify the model.