R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# dplyr supplies the data-manipulation verbs (e.g. filter()) used for cleaning.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# ggplot2 is used for the visualizations.
library(ggplot2)
# stringsAsFactors = TRUE keeps the text columns (Smoke, Gender, Caesarean) as
# factors on R >= 4.0 too, where read.csv() otherwise returns character
# columns; the class() output recorded below expects a factor.
LungCapData <- read.csv("LungCapData.csv", stringsAsFactors = TRUE)
# View() opens an interactive data viewer, so only call it in an interactive
# session and skip it when the document is rendered.
if (interactive()) {
  View(LungCapData)
}
# NOTE(review): attach() is retained only for backward compatibility with code
# that refers to columns by bare name. The statements in this section use
# explicit LungCapData$col references instead; prefer that style everywhere.
attach(LungCapData)
summary(LungCapData$LungCap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.507   6.150   8.000   7.863   9.800  14.675
head(LungCapData)
##   LungCap Age Height Smoke Gender Caesarean
## 1   6.475   6   62.1    no   male        no
## 2  10.125  18   74.7   yes female        no
## 3   9.550  16   69.7    no female       yes
## 4  11.125  14   71.0    no   male        no
## 5   4.800   5   56.9    no   male        no
## 6   6.225  11   58.7    no female        no
class(LungCapData$Gender)
## [1] "factor"
# Stand-alone factor copy of the Gender column (a no-op conversion when the
# column is already a factor).
Gender <- as.factor(LungCapData$Gender)
levels(Gender)
## [1] "female" "male"
# Two-way contingency table: smoking status (rows) by gender (columns).
tab <- table(LungCapData[["Smoke"]], LungCapData[["Gender"]])
print(tab)
##      
##       female male
##   no     314  334
##   yes     44   33
# Bar charts of smoking status split by gender.

# Side-by-side bars: one bar per gender within each smoking status.
ggplot(data = LungCapData, mapping = aes(x = Smoke, fill = Gender)) +
  geom_bar(position = "dodge")

# Bars stacked and rescaled to full height: gender share within each status.
ggplot(data = LungCapData, mapping = aes(x = Smoke, fill = Gender)) +
  geom_bar(position = "fill")

# Dot plot of lung capacity, restricted to rows with LungCap of at least 10.
lung <- dplyr::filter(LungCapData, LungCap >= 10)
ggplot(data = lung, mapping = aes(x = LungCap)) +
  geom_dotplot(dotsize = 0.5)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of lung capacity.
class(LungCapData$LungCap)
## [1] "numeric"
# Fail fast if the column is not numeric instead of coercing it silently:
# as.numeric() on a factor would return the level codes, not the values.
stopifnot(is.numeric(LungCapData$LungCap))
# bins = 30 is the ggplot2 default made explicit, so the figure is unchanged
# and the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(LungCapData, aes(x = LungCap)) +
  geom_histogram(bins = 30, color = "blue")

# Density plots.

# Overall distribution of height.
ggplot(LungCapData, aes(x = Height)) +
  geom_density()

# Height distribution in one panel per Caesarean level. The facet variable is
# named directly so it is looked up in the plot's data; using
# `LungCapData$Caesarean` here would bypass the data passed to ggplot().
ggplot(LungCapData, aes(x = Height)) +
  geom_density() +
  facet_wrap(~Caesarean)

# Age distribution, overlaid and semi-transparent, filled by Caesarean level.
ggplot(LungCapData, aes(x = Age, fill = Caesarean)) +
  geom_density(alpha = 0.4)

# Pie chart of the number of observations per gender.
count <- table(LungCapData[["Gender"]])
pie(x = count, main = "Percentage ratio between Male & Female")

# Box plots of lung capacity by age. geom_boxplot() needs a discrete grouping
# variable on x; with a continuous x it draws a single box and warns
# "Continuous x aesthetic -- did you forget aes(group=...)?". Age is therefore
# converted to a factor and lung capacity is the summarized variable.
ggplot(LungCapData, aes(x = factor(Age), y = LungCap)) +
  geom_boxplot() +
  xlab("Age")

# Lung capacity for males and females side by side. Columns are referenced
# through the data frame rather than through attach()ed bare names, and the
# boxes are labelled (they would otherwise be drawn as "1" and "2").
men <- LungCapData$Gender == "male"
women <- LungCapData$Gender == "female"
boxplot(
  LungCapData$LungCap[men], LungCapData$LungCap[women],
  names = c("male", "female"),
  ylab = "Lung Capacity"
)

library(rpart)




# Keep only the response (LungCap) and the single predictor (Height). Selecting
# by name instead of by position c(1, 3) survives a change in column order.
LungCapData <- LungCapData[, c("LungCap", "Height")]

# Fitting a decision tree model
# The formula uses bare column names together with `data =`. With terms written
# as `LungCapData$Height`, predict() cannot match them against `newdata` and
# silently evaluates them on the global data frame instead.
dt <- rpart(
  LungCap ~ Height,
  data = LungCapData,
  control = rpart.control(minsplit = 3)
)
# Prediction input: the column must be named Height to match the formula.
new <- data.frame(Height = LungCapData$Height)

pd <- predict(dt, newdata = new)
LungCapData$pd <- pd
library(ggplot2)
# Observed values as points, fitted tree predictions as a line.
ggplot(LungCapData, aes(x = Height)) +
  geom_point(aes(y = LungCap), color = 3) +
  geom_line(aes(y = pd), color = 2) +
  ggtitle("Decision Tree Regression Model") +
  xlab("Height") +
  ylab("Lung Capacity") +
  theme(plot.title = element_text(hjust = 0.5))

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Random forest regression of lung capacity on height.
# Seed the RNG so the fitted forest (and the plots below) are reproducible.
set.seed(123)
# Fixes relative to the previous call:
#  * the argument is `ntree`; the misspelt `ntrees` was silently ignored,
#  * bare column names plus `data =` let predict() actually use `newdata`,
#  * TRUE is spelled out (T is an ordinary, reassignable variable).
rf <- randomForest(
  LungCap ~ Height,
  data = LungCapData,
  proximity = TRUE,
  importance = TRUE,
  ntree = 500
)
# Select the predictor by name; single-bracket indexing keeps a data frame.
pd_rf <- predict(rf, newdata = LungCapData["Height"])

LungCapData$pd_rf <- pd_rf

# Observed values as points, random forest predictions as a line.
ggplot(LungCapData, aes(x = Height)) +
  geom_point(aes(y = LungCap), color = 3) +
  geom_line(aes(y = pd_rf), color = 4) +
  theme_bw() +
  xlab("Height") +
  ylab("Lung Capacity") +
  ggtitle("Random Forest Regression Model")

library(randomForest)

# The forest fitted above is reused here. Refitting the identical model a
# second time (unseeded) produced a different forest from the one whose
# predictions are stored in LungCapData$pd_rf and plotted earlier, and the
# recomputed predictions were never used.
# plot() on a randomForest object draws the error against the number of trees,
# so the title refers to the number of trees rather than to "tree size".
plot(rf, main = "The effect of the number of trees", col = 4)