# Prediction of lung capacity from different factors

# Load the lung-capacity data set: a tab-separated text file with a header row.
# NOTE: the path below is machine-specific; point it at your local copy.
# stringsAsFactors = TRUE is set explicitly so that the text columns are read
# as factors regardless of the R version's default (the default changed to
# FALSE in R 4.0.0).
nd <- read.table(
  "/Users/ravishankar/Desktop/LungCapData.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = TRUE
)
# Preview the first six rows.
head(nd)
##   LungCap Age Height Smoke Gender Caesarean
## 1   6.475   6   62.1    no   male        no
## 2  10.125  18   74.7   yes female        no
## 3   9.550  16   69.7    no female       yes
## 4  11.125  14   71.0    no   male        no
## 5   4.800   5   56.9    no   male        no
## 6   6.225  11   58.7    no female        no
# The rest of the script refers to the columns by bare name (Age, Height,
# LungCap, ...), so the data frame is placed on the search path here.
attach(nd)
# List the column names.
colnames(nd)
## [1] "LungCap"   "Age"       "Height"    "Smoke"     "Gender"    "Caesarean"
## Scatterplot
# Suited to examining the relationship between two numeric variables.
#   las        - orientation of the axis tick labels (1 = always horizontal)
#   xlim, ylim - ranges shown on the x and y axes
#   col        - colour of the plotted points
#   pch        - plotting symbol used for the points
plot(
  x = nd$Age,
  y = nd$Height,
  main = "Scatterplot",
  xlab = "Age",
  ylab = "Height",
  las = 1,
  xlim = c(3, 20),
  ylim = c(40, 85),
  col = 2,
  pch = 5
)

# Overlay the least-squares regression line of Height on Age.
abline(lm(Height ~ Age, data = nd), col = 7)

# Overlay a non-parametric smoother (smoothing spline) to show the shape of
# the relationship without assuming linearity.
#   lty - line type
lines(smooth.spline(nd$Age, nd$Height), lty = 2, col = 1)

# Percentage of observations falling in each Gender category:
# category counts divided by the total number of observations, times 100.
proportion <- with(nd, table(Gender) / length(Gender) * 100)
proportion
## Gender
##   female     male 
## 49.37931 50.62069
## Bar plots - used to describe the relationship between two categorical
## variables (here Gender cross-tabulated against Smoke).
# Stacked bars: one bar per Smoke level, split by Gender.
tab <- table(Gender, Smoke)
barplot(tab, col = c(2, 4), legend.text = TRUE, las = 1)

# Clustered bars: beside = TRUE draws the Gender bars side by side
# within each Smoke level instead of stacking them.
barplot(tab, beside = TRUE, col = c(2, 5), legend.text = TRUE, las = 1)

# Mosaic plot of the same contingency table.
mosaicplot(tab, las = 1)

## Bar charts - used to summarize the distribution of a categorical variable
#   horiz     - draw the bars horizontally
#   names.arg - labels written next to the bars, in the order of the table
count <- table(Gender)
perc <- (count / length(Gender)) * 100
# table() orders the categories alphabetically (female, male), so the short
# labels are derived from the table's own names to guarantee that each bar
# gets the label of the category it represents ("F" for female, "M" for male).
barplot(perc, horiz = TRUE, names.arg = toupper(substr(names(perc), 1, 1)))

# Pie chart of the raw counts, with a box drawn around the plot region.
pie(count, border = TRUE)
box()

## Boxplot - used to summarize the distribution of a numeric variable
# Boxplot without groups. The plotted variable is LungCap, so the y axis is
# labelled accordingly.
boxplot(LungCap, ylab = "Lung Capacity", las = 1)

# Boxplot with groups: one box of LungCap per Gender category.
boxplot(LungCap ~ Gender, ylab = "Lung Capacity", las = 1, xlab = "Gender")

## Histogram - used to summarize the distribution of a numeric variable
# Default histogram (frequencies on the y axis).
hist(LungCap)

# freq = FALSE plots densities instead of counts, so the bar areas sum to 1.
hist(LungCap, freq = FALSE)

# Fix the y-axis range.
hist(LungCap, freq = FALSE, ylim = c(0, 0.2))

# A single number for `breaks` is a suggested number of cells.
hist(LungCap, freq = FALSE, ylim = c(0, 0.2), breaks = 7)

hist(LungCap, freq = FALSE, ylim = c(0, 0.2), breaks = 14)

# A numeric vector for `breaks` gives the exact cell boundaries.
hist(LungCap, freq = FALSE, ylim = c(0, 0.2), breaks = seq(from = 0, to = 16, by = 2))

hist(LungCap, freq = FALSE, ylim = c(0, 0.2), breaks = seq(from = 0, to = 16, by = 4))
# Overlay a kernel density estimate on the last histogram.
lines(density(LungCap), col = 2, lwd = 5)

# Stem-and-leaf plot - a text summary of the distribution of a numeric
# variable, best suited to small data sets.
# Shown here for the lung capacity of the female subjects only.
FLungCap <- with(nd, LungCap[Gender == "female"])
stem(FLungCap)
## 
##   The decimal point is at the |
## 
##    0 | 5
##    1 | 0135689
##    2 | 0033456777789999
##    3 | 0122457788999999
##    4 | 012333344555556666677777899
##    5 | 0000122222334466666777778999
##    6 | 000111111122222222233345555556666667777777788888999999
##    7 | 000123334444444445555666667778888888999999
##    8 | 000000001111122222333333444444555556666666666777777888888888899
##    9 | 0000000011122223333344455556666777788888999999
##   10 | 000011111222334445555666777778899
##   11 | 00111223556678888
##   12 | 1222479
##   13 | 1
library(car)
## Warning: package 'car' was built under R version 3.1.3
# Iteration 1: regress lung capacity on age and smoking status.
model.1 <- lm(formula = LungCap ~ Age + Smoke, data = nd)
summary(model.1)
## 
## Call:
## lm(formula = LungCap ~ Age + Smoke, data = nd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8559 -1.0289 -0.0363  1.0083  4.1995 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.08572    0.18299   5.933 4.61e-09 ***
## Age          0.55540    0.01438  38.628  < 2e-16 ***
## Smokeyes    -0.64859    0.18676  -3.473 0.000546 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.514 on 722 degrees of freedom
## Multiple R-squared:  0.6773, Adjusted R-squared:  0.6764 
## F-statistic: 757.5 on 2 and 722 DF,  p-value: < 2.2e-16
# Iteration 2: add gender to the predictors of iteration 1.
model.2 <- lm(formula = LungCap ~ Age + Smoke + Gender, data = nd)
summary(model.2)
## 
## Call:
## lm(formula = LungCap ~ Age + Smoke + Gender, data = nd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5948 -0.9324  0.0426  0.9497  4.2676 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.52933    0.18187   2.911  0.00372 ** 
## Age          0.55793    0.01355  41.178  < 2e-16 ***
## Smokeyes    -0.56692    0.17616  -3.218  0.00135 ** 
## Gendermale   1.02026    0.10616   9.611  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.427 on 721 degrees of freedom
## Multiple R-squared:  0.7139, Adjusted R-squared:  0.7127 
## F-statistic: 599.7 on 3 and 721 DF,  p-value: < 2.2e-16
# Iteration 3: full model - additionally include birth by Caesarean section
# and height.
model.3 <- lm(
  formula = LungCap ~ Age + Smoke + Gender + Caesarean + Height,
  data = nd
)
summary(model.3)
## 
## Call:
## lm(formula = LungCap ~ Age + Smoke + Gender + Caesarean + Height, 
##     data = nd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3388 -0.7200  0.0444  0.7093  3.0172 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -11.32249    0.47097 -24.041  < 2e-16 ***
## Age            0.16053    0.01801   8.915  < 2e-16 ***
## Smokeyes      -0.60956    0.12598  -4.839 1.60e-06 ***
## Gendermale     0.38701    0.07966   4.858 1.45e-06 ***
## Caesareanyes  -0.21422    0.09074  -2.361   0.0185 *  
## Height         0.26411    0.01006  26.248  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.02 on 719 degrees of freedom
## Multiple R-squared:  0.8542, Adjusted R-squared:  0.8532 
## F-statistic: 842.8 on 5 and 719 DF,  p-value: < 2.2e-16