This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#for cleaning process
library(ggplot2)
#for visualization
# ---- Load and inspect the data ----
# stringsAsFactors = TRUE turns the text columns into factors on every R
# version; since R 4.0 read.csv() would otherwise return them as character,
# and the factor output recorded below would no longer hold.
LungCapData <- read.csv("LungCapData.csv", stringsAsFactors = TRUE)

# View() opens a data viewer, so it is only called in an interactive session.
if (interactive()) {
  View(LungCapData)
}

# Columns are always referenced through the data frame (no attach()), so a
# later change to LungCapData can never be hidden by a stale attached copy.
summary(LungCapData$LungCap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.507 6.150 8.000 7.863 9.800 14.675
head(LungCapData)
## LungCap Age Height Smoke Gender Caesarean
## 1 6.475 6 62.1 no male no
## 2 10.125 18 74.7 yes female no
## 3 9.550 16 69.7 no female yes
## 4 11.125 14 71.0 no male no
## 5 4.800 5 56.9 no male no
## 6 6.225 11 58.7 no female no
class(LungCapData$Gender)
## [1] "factor"
levels(LungCapData$Gender)
## [1] "female" "male"

# ---- Categorical variables ----
# 2-way contingency table: Smoke (rows) by Gender (columns)
tab <- table(LungCapData$Smoke, LungCapData$Gender)
tab
##
## female male
## no 314 334
## yes 44 33

# Side-by-side bar chart (counts), then stacked-to-100% bar chart (shares)
ggplot(LungCapData, aes(x = Smoke, fill = Gender)) +
  geom_bar(position = "dodge")
ggplot(LungCapData, aes(x = Smoke, fill = Gender)) +
  geom_bar(position = "fill")

# ---- Distribution of lung capacity ----
# Dot plot restricted to the subjects with LungCap of at least 10.
# binwidth is set explicitly instead of relying on the default of 30 bins.
lung <- LungCapData %>%
  filter(LungCap >= 10)
ggplot(lung, aes(x = LungCap)) +
  geom_dotplot(binwidth = 0.15, dotsize = 0.5)

# Histogram of all LungCap values; the column is already numeric, so no
# coercion is needed before plotting.
class(LungCapData$LungCap)
## [1] "numeric"
ggplot(LungCapData, aes(x = LungCap)) +
  geom_histogram(binwidth = 0.5, color = "blue")

# ---- Density plots ----
ggplot(LungCapData, aes(x = Height)) +
  geom_density()
# Facet by the bare column name; `LungCapData$Caesarean` inside the facet
# formula bypasses the plot's data and breaks if the data are filtered.
ggplot(LungCapData, aes(x = Height)) +
  geom_density() +
  facet_wrap(~Caesarean)
ggplot(LungCapData, aes(x = Age, fill = Caesarean)) +
  geom_density(alpha = 0.4)

# ---- Gender split ----
count <- table(LungCapData$Gender)
pie(count, main = "Percentage ratio between Male & Female")

# ---- Box plots ----
# LungCap is continuous, so geom_boxplot() needs an explicit grouping;
# cut_width() bins LungCap into intervals of width 1 and draws one box of Age
# per interval (this removes the "did you forget aes(group=...)" warning).
ggplot(LungCapData, aes(x = LungCap, y = Age)) +
  geom_boxplot(aes(group = cut_width(LungCap, 1)))

# Base-graphics comparison of lung capacity for men and women.
men <- LungCapData$Gender == "male"
women <- LungCapData$Gender == "female"
boxplot(
  LungCapData$LungCap[men],
  LungCapData$LungCap[women],
  names = c("male", "female"),
  ylab = "Lung capacity"
)
library(rpart)

# Keep only the response (LungCap) and the single predictor (Height).
# Selecting by name protects against a change in the CSV's column order.
LungCapData <- LungCapData[, c("LungCap", "Height")]

# Fitting a decision tree model
# The formula uses bare column names resolved through `data`. Writing
# `LungCapData$LungCap ~ LungCapData$Height` makes predict() ignore `newdata`
# and silently re-read the global data frame instead.
dt <- rpart(
  LungCap ~ Height,
  data = LungCapData,
  control = rpart.control(minsplit = 3)
)

# Predict on the training heights; the column must be called Height so that
# it matches the predictor named in the formula.
new <- data.frame(Height = LungCapData$Height)
pd <- predict(dt, newdata = new)
LungCapData$pd <- pd

library(ggplot2)
# Observed values as points, fitted tree predictions as a line.
ggplot(LungCapData, aes(x = Height)) +
  geom_point(aes(y = LungCap), color = 3) +
  geom_line(aes(y = pd), color = 2) +
  ggtitle("Decision Tree Regression Model") +
  xlab("Height") +
  ylab("Lung Capacity") +
  theme(plot.title = element_text(hjust = 0.5))
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine

# Random forests are fitted with random resampling; fixing the seed makes the
# knitted document reproducible from run to run.
set.seed(123)

# Fit LungCap on Height.
# - Bare column names + `data` so that predict() honours `newdata`.
# - The argument is `ntree`; the misspelt `ntrees` is swallowed by `...` and
#   ignored, so the requested number of trees never reached the fit.
# - TRUE instead of T, which is an ordinary variable that can be reassigned.
rf <- randomForest(
  LungCap ~ Height,
  data = LungCapData,
  proximity = TRUE,
  importance = TRUE,
  ntree = 500
)

# Predict on the training heights. Single-bracket indexing by name keeps a
# one-column data frame (a negative positional index would drop to a vector).
pd_rf <- predict(rf, newdata = LungCapData["Height"])
LungCapData$pd_rf <- pd_rf

# Observed values as points, random forest predictions as a line.
ggplot(LungCapData, aes(x = Height)) +
  geom_point(aes(y = LungCap), color = 3) +
  geom_line(aes(y = pd_rf), color = 4) +
  theme_bw() +
  xlab("Height") +
  ylab("Lung Capacity") +
  ggtitle("Random Forest Regression Model")
library(randomForest)
# Plot the forest's error against the number of trees grown.
# The forest `rf` fitted earlier is reused rather than refitted: a second fit
# would be a different random forest from the one whose predictions were
# plotted, and the re-computed predictions were never used.
plot(rf, main = "The effect of tree size", col = 4)