PREDICTING LUNG CAPACITY USING HEIGHT

# Load the lung-capacity data set from a fixed local path.
lungcap <- read.csv("C:/temp/LungCapData2.csv")

# NOTE(review): attach() puts the columns on the search path; any later code
# that refers to Height, LungCap or Age without `data =` depends on this call.
# Prefer passing `data = lungcap` and dropping attach() once nothing needs it.
attach(lungcap)

# Column names of the loaded data frame.
names(lungcap)
## [1] "Age"     "LungCap" "Height"  "Gender"  "Smoke"

# NOTE(review): is.null() only tests whether the object itself is NULL; it says
# nothing about missing values inside the data. If a missing-value check was
# intended, anyNA(lungcap) is presumably what was meant -- TODO confirm.
is.null(lungcap)
## [1] FALSE

Summary Statistics:

# Per-column summary: five-number summary and mean for the numeric columns;
# Gender and Smoke are still character at this point, so only length/class
# are reported for them.
summary(lungcap)
##       Age            LungCap           Height         Gender         
##  Min.   : 3.000   Min.   : 0.373   Min.   :46.00   Length:654        
##  1st Qu.: 8.000   1st Qu.: 3.943   1st Qu.:57.00   Class :character  
##  Median :10.000   Median : 5.643   Median :61.50   Mode  :character  
##  Mean   : 9.931   Mean   : 5.910   Mean   :61.14                     
##  3rd Qu.:12.000   3rd Qu.: 7.356   3rd Qu.:65.50                     
##  Max.   :19.000   Max.   :15.379   Max.   :74.00                     
##     Smoke          
##  Length:654        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Structure of the data frame: 654 observations of 5 variables, with the
# storage type of each column.
str(lungcap)
## 'data.frame':    654 obs. of  5 variables:
##  $ Age    : int  9 8 7 9 9 8 6 6 8 9 ...
##  $ LungCap: num  3.12 3.17 3.16 2.67 3.68 ...
##  $ Height : num  57 67.5 54.5 53 57 61 58 56 58.5 60 ...
##  $ Gender : chr  "female" "female" "female" "male" ...
##  $ Smoke  : chr  "no" "no" "no" "no" ...
# Gender and Smoke are read in as character; store them as factors so they are
# treated as categorical variables.
lungcap[["Gender"]] <- as.factor(lungcap[["Gender"]])
lungcap[["Smoke"]] <- as.factor(lungcap[["Smoke"]])

# Distributions of the three numeric variables.
hist(lungcap$Age)

hist(lungcap$LungCap)

hist(lungcap$Height)

# Counts per level of each categorical variable.
table(lungcap$Gender)
## 
## female   male 
##    318    336
table(lungcap$Smoke)
## 
##  no yes 
## 589  65
# Exploratory plots of LungCap against each candidate predictor.
# The scatter plots use the formula interface with `data = lungcap`, like the
# boxplots below, so they no longer depend on the columns being attach()ed.
plot(
  LungCap ~ Height,
  data = lungcap,
  xlab = "Height",
  ylab = "LungCap",
  main = "LungCap vs. Height"
)

plot(
  LungCap ~ Age,
  data = lungcap,
  xlab = "Age",
  ylab = "LungCap",
  main = "LungCap vs. Age"
)

# Lung capacity split by the two categorical variables.
boxplot(
  LungCap ~ Gender,
  data = lungcap,
  xlab = "Gender",
  ylab = "LungCap",
  main = "LungCap vs. Gender",
  col = "red"
)

boxplot(
  LungCap ~ Smoke,
  data = lungcap,
  xlab = "Smoke",
  ylab = "LungCap",
  main = "LungCap vs. Smoke",
  col = "purple"
)

Correlation:

# Correlation matrix of the numeric variables.
# Columns are selected by name rather than by position so the selection stays
# correct if the column order of the CSV ever changes.
lungcapnum <- lungcap[, c("Age", "LungCap", "Height")]
correlationvar <- cor(lungcapnum)
corrplot::corrplot(correlationvar, method = "shade", type = "full")

We see that LungCap is more strongly correlated with Height than with Age. Therefore, this analysis explores models that predict LungCap from Height.

Model 1: LungCap vs Height + error

# Model 1: simple linear regression of lung capacity on height.
# `data = lungcap` makes the fit self-contained instead of relying on the
# columns having been attach()ed to the search path.
model1 <- lm(LungCap ~ Height, data = lungcap)
summary(model1)
## 
## Call:
## lm(formula = LungCap ~ Height, data = lungcap)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2550 -0.7986 -0.0120  0.7342  6.3581 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -18.298036   0.544380  -33.61   <2e-16 ***
## Height        0.395927   0.008865   44.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.292 on 652 degrees of freedom
## Multiple R-squared:  0.7537, Adjusted R-squared:  0.7533 
## F-statistic:  1995 on 1 and 652 DF,  p-value: < 2.2e-16
# Regression diagnostic plots for the fitted model.
plot(model1)

Model 2: LungCap vs (Height + Height^2) + error

# Model 2: quadratic polynomial in height. I() protects `^` so it is treated
# as arithmetic squaring rather than as a formula operator.
# `data = lungcap` makes the fit self-contained instead of relying on the
# columns having been attach()ed to the search path.
model2 <- lm(LungCap ~ Height + I(Height^2), data = lungcap)
summary(model2)
## 
## Call:
## lm(formula = LungCap ~ Height + I(Height^2), data = lungcap)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4031 -0.6878 -0.0076  0.6577  5.9910 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.080634   4.509553   3.566 0.000389 ***
## Height      -0.750147   0.149566  -5.015 6.83e-07 ***
## I(Height^2)  0.009466   0.001233   7.675 6.07e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.238 on 651 degrees of freedom
## Multiple R-squared:  0.7741, Adjusted R-squared:  0.7734 
## F-statistic:  1115 on 2 and 651 DF,  p-value: < 2.2e-16
# Regression diagnostic plots for the fitted model.
plot(model2)

Model 3: LungCap vs (Height + Height^2 + Height^3) + error

# Model 3: cubic polynomial in height. I() protects `^` so it is treated as
# arithmetic exponentiation rather than as a formula operator.
# `data = lungcap` makes the fit self-contained instead of relying on the
# columns having been attach()ed to the search path.
model3 <- lm(LungCap ~ Height + I(Height^2) + I(Height^3), data = lungcap)
summary(model3)
## 
## Call:
## lm(formula = LungCap ~ Height + I(Height^2) + I(Height^3), data = lungcap)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3885 -0.6900  0.0069  0.6511  5.9936 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.293e-01  3.803e+01  -0.017    0.987
## Height       9.179e-02  1.908e+00   0.048    0.962
## I(Height^2) -4.567e-03  3.173e-02  -0.144    0.886
## I(Height^3)  7.739e-05  1.749e-04   0.443    0.658
## 
## Residual standard error: 1.239 on 650 degrees of freedom
## Multiple R-squared:  0.7742, Adjusted R-squared:  0.7731 
## F-statistic: 742.7 on 3 and 650 DF,  p-value: < 2.2e-16
# Regression diagnostic plots for the fitted model.
plot(model3)

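We see that the adjusted R-squared value decreases slightly from Model 2 (0.7734) to Model 3 (0.7731), and the cubic term in Model 3 is not statistically significant (p = 0.658). This means that the additional term in Model 3 does not improve the predictive power of the model.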

MODEL COMPARISON:

# Nested F-test: does adding the quadratic term improve on the linear model?
anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: LungCap ~ Height
## Model 2: LungCap ~ Height + I(Height^2)
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    652 1088.41                                  
## 2    651  998.09  1    90.314 58.907 6.069e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Nested F-test: does adding the cubic term improve on the quadratic model?
anova(model2, model3)
## Analysis of Variance Table
## 
## Model 1: LungCap ~ Height + I(Height^2)
## Model 2: LungCap ~ Height + I(Height^2) + I(Height^3)
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    651 998.09                           
## 2    650 997.79  1   0.30066 0.1959 0.6582
# Overlay the three fitted curves on the raw data.

# anova2/anova3 used to be separate refits of exactly the same formulas as
# model2/model3 (they are lm fits, not anova tables). They are kept only as
# aliases so that any existing reference to these names keeps working.
anova2 <- model2
anova3 <- model3

# Formula interface with `data =` so the plot does not depend on attach().
plot(LungCap ~ Height, data = lungcap)

# Model 1 is a straight line, so abline() can draw it from its coefficients.
abline(model1, col = "purple")

# The polynomial fits are curves: evaluate them on one shared fine grid that
# spans the observed heights and draw the predictions as lines.
Height_2 <- data.frame(
  Height = seq(min(lungcap$Height), max(lungcap$Height), by = 0.01)
)
Height_2$pred2 <- predict(model2, newdata = Height_2)
Height_2$pred3 <- predict(model3, newdata = Height_2)
lines(Height_2$Height, Height_2$pred2, col = "red")
lines(Height_2$Height, Height_2$pred3, col = "green")

# Identify the three curves; colours match the calls above.
legend(
  "topleft",
  legend = c("Model 1 (linear)", "Model 2 (quadratic)", "Model 3 (cubic)"),
  col = c("purple", "red", "green"),
  lty = 1
)

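Conclusion: Model 2 is the best of the three models. Adding the quadratic term significantly improves on Model 1 (F = 58.9, p < 0.001), whereas adding the cubic term in Model 3 gives no significant improvement over Model 2 (p = 0.66).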