Childhood Respiratory Disease

Description

FEV (forced expiratory volume) is an index of pulmonary function that measures the volume of air expelled after one second of constant effort. The data contain determinations of FEV on 654 children who were seen in the Childhood Respiratory Disease Study in 1980 in East Boston, Massachusetts. The source description gives the age range as 6-22, but the ages recorded in the data summarised below run from 3 to 19. The data are part of a larger study to follow the change in pulmonary function over time in children.

Variables:

- ID - ID number
- Age - years
- FEV - litres
- Height - inches
- Sex - Male or Female
- Smoker - Non = nonsmoker, Current = current smoker

Read Data

library(knitr)

# Location of the tab-separated FEV data set.
url <- "http://www.statsci.org/data/general/fev.txt"

# Read the data with a header row. Text columns are kept as character so that
# the factor coding below is explicit. TRUE/FALSE are spelled out because the
# T/F shorthands are ordinary variables that can be reassigned.
crd <- read.table(file = url, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Recode Sex as a factor: "Male" -> 0 (first level), "Female" -> 1.
crd$Sex <- factor(crd$Sex, levels = c("Male", "Female"), labels = c(0, 1))
# Recode Smoker as a factor: "Non" -> 0 (first level), "Current" -> 1.
crd$Smoker <- factor(crd$Smoker, levels = c("Non", "Current"), labels = c(0, 1))

# Preview the first rows of the recoded data.
head(crd)
##     ID Age   FEV Height Sex Smoker
## 1  301   9 1.708   57.0   1      0
## 2  451   8 1.724   67.5   1      0
## 3  501   7 1.720   54.5   1      0
## 4  642   9 1.558   53.0   0      0
## 5  901   9 1.895   57.0   0      0
## 6 1701   8 2.336   61.0   1      0
# Render the first six rows of the data as a formatted table.
kable(x = head(crd, n = 6L))
ID Age FEV Height Sex Smoker
301 9 1.708 57.0 1 0
451 8 1.724 67.5 1 0
501 7 1.720 54.5 1 0
642 9 1.558 53.0 0 0
901 9 1.895 57.0 0 0
1701 8 2.336 61.0 1 0
# Per-column summary: quantiles and mean for numeric columns, level counts for
# the Sex and Smoker factors.
summary(object = crd)
##        ID             Age              FEV            Height      Sex    
##  Min.   :  201   Min.   : 3.000   Min.   :0.791   Min.   :46.00   0:336  
##  1st Qu.:15811   1st Qu.: 8.000   1st Qu.:1.981   1st Qu.:57.00   1:318  
##  Median :36071   Median :10.000   Median :2.547   Median :61.50          
##  Mean   :37170   Mean   : 9.931   Mean   :2.637   Mean   :61.14          
##  3rd Qu.:53639   3rd Qu.:12.000   3rd Qu.:3.119   3rd Qu.:65.50          
##  Max.   :90001   Max.   :19.000   Max.   :5.793   Max.   :74.00          
##  Smoker 
##  0:589  
##  1: 65  
##         
##         
##         
## 
# Scatterplot matrix of every column in crd, to inspect the pairwise
# relationships between the variables.
pairs(x = crd, gap = 0.5)

# Quadratic age term (age squared); also used by the age/height model.
age_sq <- crd$Age^2
# Linear model of FEV on squared age, sex and height.
crd.lm <- lm(formula = crd$FEV ~ age_sq + crd$Sex + crd$Height)
# Coefficient estimates and fit statistics for the age/sex/height model.
summary(object = crd.lm)
## 
## Call:
## lm(formula = crd$FEV ~ age_sq + crd$Sex + crd$Height)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.38456 -0.23741  0.00404  0.24694  1.90402 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.110987   0.231037 -17.794  < 2e-16 ***
## age_sq       0.002876   0.000369   7.796 2.55e-14 ***
## crd$Sex1    -0.157471   0.032614  -4.828 1.72e-06 ***
## crd$Height   0.106562   0.004155  25.645  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4082 on 650 degrees of freedom
## Multiple R-squared:  0.7794, Adjusted R-squared:  0.7783 
## F-statistic: 765.3 on 3 and 650 DF,  p-value: < 2.2e-16

The model is as follows:

\[\widehat{FEV} = -4.110987 + 0.002876 \cdot age^2 - 0.157471 \cdot sex + 0.106562 \cdot height\]

# Residual analysis: residuals against fitted values for the age/sex/height
# model, with a horizontal reference line at zero.
plot(x = fitted(crd.lm), y = resid(crd.lm))
abline(h = 0)

# Normal Q-Q plot of the same model's residuals, with its reference line.
full_model_resid <- crd.lm$residuals
qqnorm(full_model_resid)
qqline(full_model_resid)

# Reduced linear model: FEV on squared age and height only (sex dropped).
crd_age_height.lm <- lm(formula = crd$FEV ~ age_sq + crd$Height)
# Coefficient estimates and fit statistics for the age/height model.
summary(object = crd_age_height.lm)
## 
## Call:
## lm(formula = crd$FEV ~ age_sq + crd$Height)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50088 -0.25351 -0.00563  0.24915  1.97563 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.4093808  0.2264014 -19.476  < 2e-16 ***
## age_sq       0.0026498  0.0003722   7.119 2.88e-12 ***
## crd$Height   0.1105878  0.0041400  26.712  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4151 on 651 degrees of freedom
## Multiple R-squared:  0.7715, Adjusted R-squared:  0.7707 
## F-statistic:  1099 on 2 and 651 DF,  p-value: < 2.2e-16
# Residual analysis for the age/height model: residuals against fitted values,
# with a horizontal reference line at zero.
plot(x = fitted(crd_age_height.lm), y = resid(crd_age_height.lm))
abline(h = 0)

# Normal Q-Q plot of the age/height model's residuals, with its reference line.
reduced_model_resid <- crd_age_height.lm$residuals
qqnorm(reduced_model_resid)
qqline(reduced_model_resid)

# Pearson correlation between age and height.
cor(x = crd$Age, y = crd$Height, method = "pearson")
## [1] 0.7919436
# 95% confidence intervals for the coefficients of the age/height model.
# confint() takes the confidence level through `level`; `conf.level` is not one
# of its parameters and would be absorbed by `...` without any effect.
confint(crd_age_height.lm, level = 0.95)
##                    2.5 %       97.5 %
## (Intercept) -4.853945875 -3.964815731
## age_sq       0.001918885  0.003380617
## crd$Height   0.102458504  0.118717094

Summary: The adjusted R-squared is high and the p-values are very small, so about 78% of the variation in FEV can be explained by height, sex and age, or about 77% by height and age alone. There are outliers in the Q-Q plots. The correlation between age and height is high (0.79). The relationship between age, height and FEV appears to be linear, and the residuals appear to be approximately normal.