PREDICTING LUNG CAPACITY USING HEIGHT
# Read the lung-capacity data and list its column names.
# attach() is retained because later statements refer to columns such as
# Height, LungCap and Age by bare name.
lungcap <- read.csv(file = "C:/temp/LungCapData2.csv")
attach(lungcap)
names(lungcap)
## [1] "Age" "LungCap" "Height" "Gender" "Smoke"
# Check for missing values. is.null() only tests whether the object itself is
# NULL, which is never the case for a data frame returned by read.csv();
# anyNA() reports whether any cell is NA.
anyNA(lungcap)
## [1] FALSE
Summary Statistics:
# Per-column summary of the data frame.
summary(object = lungcap)
## Age LungCap Height Gender
## Min. : 3.000 Min. : 0.373 Min. :46.00 Length:654
## 1st Qu.: 8.000 1st Qu.: 3.943 1st Qu.:57.00 Class :character
## Median :10.000 Median : 5.643 Median :61.50 Mode :character
## Mean : 9.931 Mean : 5.910 Mean :61.14
## 3rd Qu.:12.000 3rd Qu.: 7.356 3rd Qu.:65.50
## Max. :19.000 Max. :15.379 Max. :74.00
## Smoke
## Length:654
## Class :character
## Mode :character
##
##
##
# Compact display of the column types and the first few values.
str(object = lungcap)
## 'data.frame': 654 obs. of 5 variables:
## $ Age : int 9 8 7 9 9 8 6 6 8 9 ...
## $ LungCap: num 3.12 3.17 3.16 2.67 3.68 ...
## $ Height : num 57 67.5 54.5 53 57 61 58 56 58.5 60 ...
## $ Gender : chr "female" "female" "female" "male" ...
## $ Smoke : chr "no" "no" "no" "no" ...
# Convert the two character columns to factors in one step.
# Note: this changes the data frame only; the copies exposed by attach()
# remain character vectors, so refer to these columns via lungcap$...
categorical_cols <- c("Gender", "Smoke")
lungcap[categorical_cols] <- lapply(lungcap[categorical_cols], as.factor)

# Distributions of the numeric variables.
hist(lungcap$Age)
hist(lungcap$LungCap)
hist(lungcap$Height)

# Counts per gender.
table(lungcap$Gender)
##
## female male
## 318 336
# Counts of smokers and non-smokers.
table(lungcap[["Smoke"]])
##
## no yes
## 589 65
# Exploratory plots of LungCap against each candidate predictor.
# All four plots use the formula interface with data = lungcap, so none of
# them depends on the columns having been attach()ed.
plot(
  LungCap ~ Height,
  data = lungcap,
  xlab = "Height", ylab = "LungCap", main = "LungCap vs. Height"
)
plot(
  LungCap ~ Age,
  data = lungcap,
  xlab = "Age", ylab = "LungCap", main = "LungCap vs. Age"
)
boxplot(
  LungCap ~ Gender,
  data = lungcap,
  xlab = "Gender", ylab = "LungCap", main = "LungCap vs. Gender",
  col = "red"
)
boxplot(
  LungCap ~ Smoke,
  data = lungcap,
  xlab = "Smoke", ylab = "LungCap", main = "LungCap vs. Smoke",
  col = "purple"
)
Correlation:
# Correlation matrix of the numeric variables. Columns are selected by name
# rather than by position so a reordered input file cannot silently change
# which variables are correlated.
lungcapnum <- lungcap[, c("Age", "LungCap", "Height")]
correlationvar <- cor(lungcapnum)
corrplot::corrplot(correlationvar, method = "shade", type = "full")
The correlation between LungCap and Height is higher than the correlation between LungCap and Age. Therefore, this analysis explores models that predict LungCap from Height.
Model 1: LungCap vs Height + error
# Model 1: simple linear regression of LungCap on Height.
# data = lungcap makes the fit independent of the attach()ed columns.
model1 <- lm(LungCap ~ Height, data = lungcap)
summary(model1)
##
## Call:
## lm(formula = LungCap ~ Height, data = lungcap)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2550 -0.7986 -0.0120 0.7342 6.3581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.298036 0.544380 -33.61 <2e-16 ***
## Height 0.395927 0.008865 44.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.292 on 652 degrees of freedom
## Multiple R-squared: 0.7537, Adjusted R-squared: 0.7533
## F-statistic: 1995 on 1 and 652 DF, p-value: < 2.2e-16
# Diagnostic plots for the linear fit.
plot(x = model1)
MODEL2: LungCap vs (Height + Height^2) + error
# Model 2: quadratic polynomial in Height. I() protects ^ so it is treated
# as arithmetic rather than as a formula operator.
# data = lungcap makes the fit independent of the attach()ed columns.
model2 <- lm(LungCap ~ Height + I(Height^2), data = lungcap)
summary(model2)
##
## Call:
## lm(formula = LungCap ~ Height + I(Height^2), data = lungcap)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4031 -0.6878 -0.0076 0.6577 5.9910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.080634 4.509553 3.566 0.000389 ***
## Height -0.750147 0.149566 -5.015 6.83e-07 ***
## I(Height^2) 0.009466 0.001233 7.675 6.07e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.238 on 651 degrees of freedom
## Multiple R-squared: 0.7741, Adjusted R-squared: 0.7734
## F-statistic: 1115 on 2 and 651 DF, p-value: < 2.2e-16
# Diagnostic plots for the quadratic fit.
plot(x = model2)
MODEL3: LungCap vs (Height + Height^2 + Height^3) + error
# Model 3: cubic polynomial in Height.
# data = lungcap makes the fit independent of the attach()ed columns.
model3 <- lm(LungCap ~ Height + I(Height^2) + I(Height^3), data = lungcap)
summary(model3)
##
## Call:
## lm(formula = LungCap ~ Height + I(Height^2) + I(Height^3), data = lungcap)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3885 -0.6900 0.0069 0.6511 5.9936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.293e-01 3.803e+01 -0.017 0.987
## Height 9.179e-02 1.908e+00 0.048 0.962
## I(Height^2) -4.567e-03 3.173e-02 -0.144 0.886
## I(Height^3) 7.739e-05 1.749e-04 0.443 0.658
##
## Residual standard error: 1.239 on 650 degrees of freedom
## Multiple R-squared: 0.7742, Adjusted R-squared: 0.7731
## F-statistic: 742.7 on 3 and 650 DF, p-value: < 2.2e-16
# Diagnostic plots for the cubic fit.
plot(x = model3)
The adjusted R-squared decreases slightly from Model 2 (0.7734) to Model 3 (0.7731), which means that the additional cubic term in Model 3 does not improve the predictive power of the model.
MODEL COMPARISON:
# F-test of the nested models: does the quadratic term improve on Model 1?
anova(model1, model2, test = "F")
## Analysis of Variance Table
##
## Model 1: LungCap ~ Height
## Model 2: LungCap ~ Height + I(Height^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 652 1088.41
## 2 651 998.09 1 90.314 58.907 6.069e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F-test of the nested models: does the cubic term improve on Model 2?
anova(model2, model3, test = "F")
## Analysis of Variance Table
##
## Model 1: LungCap ~ Height + I(Height^2)
## Model 2: LungCap ~ Height + I(Height^2) + I(Height^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 651 998.09
## 2 650 997.79 1 0.30066 0.1959 0.6582
# Overlay the three fitted curves on the LungCap-vs-Height scatter plot.
# anova2 / anova3 are ordinary lm fits (quadratic and cubic), not anova
# tables; the names are kept unchanged. Every call takes its data from
# lungcap explicitly instead of relying on attach()ed columns.
anova2 <- lm(LungCap ~ Height + I(Height^2), data = lungcap)
anova3 <- lm(LungCap ~ Height + I(Height^2) + I(Height^3), data = lungcap)

plot(LungCap ~ Height, data = lungcap)
abline(model1, col = "purple")

# A single fine grid over the observed Height range serves both curves.
Height_2 <- data.frame(
  Height = seq(min(lungcap$Height), max(lungcap$Height), by = 0.01)
)
lines(Height_2$Height, predict(anova2, newdata = Height_2), col = "red")

# As before, Height_2$pred1 ends up holding the cubic-model predictions.
Height_2$pred1 <- predict(anova3, newdata = Height_2)
lines(Height_2$Height, Height_2$pred1, col = "green")

# Label the curves so the colours can be told apart.
legend(
  "topleft",
  legend = c("Linear", "Quadratic", "Cubic"),
  col = c("purple", "red", "green"),
  lty = 1
)
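Conclusion: MODEL 2 is the best of the three models. It fits significantly better than Model 1 (ANOVA p = 6.069e-14), while the cubic term added in Model 3 gives no significant improvement (ANOVA p = 0.6582).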