# Prediction of Lung Capacity from Different Factors
# Load the tab-separated lung-capacity data set; the first line of the file
# supplies the column names.
nd <- read.table(
  file = "/Users/ravishankar/Desktop/LungCapData.txt",
  header = TRUE,
  sep = "\t"
)
# Preview the first rows of the data.
head(nd)
## LungCap Age Height Smoke Gender Caesarean
## 1 6.475 6 62.1 no male no
## 2 10.125 18 74.7 yes female no
## 3 9.550 16 69.7 no female yes
## 4 11.125 14 71.0 no male no
## 5 4.800 5 56.9 no male no
## 6 6.225 11 58.7 no female no
# Put the columns of nd on the search path so the code below can refer to
# them by bare name (Age, Height, ...).
attach(what = nd)
# List the column names of the data set.
names(nd)
## [1] "LungCap" "Age" "Height" "Smoke" "Gender" "Caesarean"
## Scatter plot
# Appropriate for examining the relationship between two numeric variables.
# las = 1 keeps the axis tick labels horizontal; xlim / ylim fix the axis
# ranges; col sets the colour and pch the plotting symbol.
plot(
  x = nd$Age, y = nd$Height,
  main = "Scatterplot", xlab = "Age", ylab = "Height",
  xlim = c(3, 20), ylim = c(40, 85),
  las = 1, col = 2, pch = 5
)
# Overlay the least-squares regression line of Height on Age.
abline(lm(Height ~ Age, data = nd), col = 7)
# Overlay a non-parametric smoother (smoothing spline) to show the shape of
# the relationship; lty = 2 draws it as a dashed line.
lines(smooth.spline(x = nd$Age, y = nd$Height), lty = 2, col = 1)
# Percentage of observations in each Gender category.
proportion <- with(nd, table(Gender) / length(Gender) * 100)
proportion
## Gender
## female male
## 49.37931 50.62069
## Bar plots - describe the relationship between two categorical variables
# Cross-tabulate Gender (rows) against Smoke (columns).
tab <- with(nd, table(Gender, Smoke))
# Stacked bars: one bar per Smoke level, split by Gender.
barplot(tab, col = c(2, 4), legend.text = TRUE, las = 1)
# Clustered bars: beside = TRUE places the Gender bars side by side.
barplot(tab, beside = TRUE, col = c(2, 5), legend.text = TRUE, las = 1)
# Mosaic plot of the same contingency table.
mosaicplot(tab, las = 1)
## Bar charts - summarize the distribution of a categorical variable
# Counts and percentages of each Gender category.
count <- with(nd, table(Gender))
perc <- with(nd, table(Gender) / length(Gender) * 100)
# horiz = TRUE draws horizontal bars; names.arg sets the label of each bar.
# The labels are derived from the table's own category names so that each bar
# is labelled with the initial of the category it actually represents (the
# table is ordered female, male, so hard-coding c("M", "F") swaps the labels).
barplot(perc, horiz = TRUE, names.arg = toupper(substr(names(perc), 1, 1)))
# Pie chart of the counts, with a box drawn around the plot region.
pie(count, border = TRUE)
box()
## Box plots - summarize the distribution of a numeric variable
# Box plot of lung capacity for the whole sample. The plotted variable is
# LungCap, so the y-axis is labelled as lung capacity rather than "Age".
boxplot(nd$LungCap, ylab = "Lung Capacity", las = 1)
# Box plot of lung capacity split by gender.
boxplot(
  LungCap ~ Gender,
  data = nd,
  xlab = "Gender", ylab = "Lung Capacity", las = 1
)
## Histograms - summarize the distribution of a numeric variable
# Evaluated inside with() so the bare column name is used for the default
# title and x-axis label.
with(nd, {
  # Default histogram of counts.
  hist(LungCap)
  # Density scale instead of counts.
  hist(LungCap, probability = TRUE)
  # Fix the y-axis range so the later variants are comparable.
  hist(LungCap, probability = TRUE, ylim = c(0, 0.2))
  # Vary the binning: suggested number of cells, then explicit break points.
  hist(LungCap, probability = TRUE, ylim = c(0, 0.2), breaks = 7)
  hist(LungCap, probability = TRUE, ylim = c(0, 0.2), breaks = 14)
  hist(
    LungCap,
    probability = TRUE, ylim = c(0, 0.2),
    breaks = seq(from = 0, to = 16, by = 2)
  )
  hist(
    LungCap,
    probability = TRUE, ylim = c(0, 0.2),
    breaks = seq(from = 0, to = 16, by = 4)
  )
  # Overlay a kernel density estimate on the last histogram.
  lines(density(LungCap), col = 2, lwd = 5)
})
# Stem-and-leaf plot - summarizes the distribution of a numeric variable for
# small data sets. Shown here for the lung capacity of the female subjects.
FLungCap <- with(nd, LungCap[Gender == "female"])
stem(FLungCap)
##
## The decimal point is at the |
##
## 0 | 5
## 1 | 0135689
## 2 | 0033456777789999
## 3 | 0122457788999999
## 4 | 012333344555556666677777899
## 5 | 0000122222334466666777778999
## 6 | 000111111122222222233345555556666667777777788888999999
## 7 | 000123334444444445555666667778888888999999
## 8 | 000000001111122222333333444444555556666666666777777888888888899
## 9 | 0000000011122223333344455556666777788888999999
## 10 | 000011111222334445555666777778899
## 11 | 00111223556678888
## 12 | 1222479
## 13 | 1
library(car)
## Warning: package 'car' was built under R version 3.1.3
# Iteration 1: linear model of lung capacity on age and smoking status.
model.1 <- lm(formula = LungCap ~ Age + Smoke, data = nd)
summary(model.1)
##
## Call:
## lm(formula = LungCap ~ Age + Smoke, data = nd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8559 -1.0289 -0.0363 1.0083 4.1995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08572 0.18299 5.933 4.61e-09 ***
## Age 0.55540 0.01438 38.628 < 2e-16 ***
## Smokeyes -0.64859 0.18676 -3.473 0.000546 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.514 on 722 degrees of freedom
## Multiple R-squared: 0.6773, Adjusted R-squared: 0.6764
## F-statistic: 757.5 on 2 and 722 DF, p-value: < 2.2e-16
# Iteration 2: add gender to the predictors of iteration 1.
model.2 <- lm(formula = LungCap ~ Age + Smoke + Gender, data = nd)
summary(model.2)
##
## Call:
## lm(formula = LungCap ~ Age + Smoke + Gender, data = nd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5948 -0.9324 0.0426 0.9497 4.2676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.52933 0.18187 2.911 0.00372 **
## Age 0.55793 0.01355 41.178 < 2e-16 ***
## Smokeyes -0.56692 0.17616 -3.218 0.00135 **
## Gendermale 1.02026 0.10616 9.611 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.427 on 721 degrees of freedom
## Multiple R-squared: 0.7139, Adjusted R-squared: 0.7127
## F-statistic: 599.7 on 3 and 721 DF, p-value: < 2.2e-16
# Iteration 3: add Caesarean birth and height to the predictors of
# iteration 2.
model.3 <- lm(
  formula = LungCap ~ Age + Smoke + Gender + Caesarean + Height,
  data = nd
)
summary(model.3)
##
## Call:
## lm(formula = LungCap ~ Age + Smoke + Gender + Caesarean + Height,
## data = nd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3388 -0.7200 0.0444 0.7093 3.0172
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.32249 0.47097 -24.041 < 2e-16 ***
## Age 0.16053 0.01801 8.915 < 2e-16 ***
## Smokeyes -0.60956 0.12598 -4.839 1.60e-06 ***
## Gendermale 0.38701 0.07966 4.858 1.45e-06 ***
## Caesareanyes -0.21422 0.09074 -2.361 0.0185 *
## Height 0.26411 0.01006 26.248 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.02 on 719 degrees of freedom
## Multiple R-squared: 0.8542, Adjusted R-squared: 0.8532
## F-statistic: 842.8 on 5 and 719 DF, p-value: < 2.2e-16