**Decision Tree Model**
This demo uses a dataset called lungcap, available from “this page”.
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
# Load the lung-capacity data from GitHub; the file is semicolon-delimited.
df <- read.csv(
  file = "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/LungCapData.csv",
  sep = ";"
)
# Preview the first rows to check that the columns were parsed correctly.
head(df)
## LungCap Age Height Smoke Gender Caesarean
## 1 6.475 6 62.1 no male no
## 2 10.125 18 74.7 yes female no
## 3 9.550 16 69.7 no female yes
## 4 11.125 14 71.0 no male no
## 5 4.800 5 56.9 no male no
## 6 6.225 11 58.7 no female no
Fitting a decision tree model
library(rpart)
# Keep only the response (LungCap) and the single predictor (Height).
# Selecting by name instead of position keeps this step valid even if the
# column order of the CSV changes.
df <- df[, c("LungCap", "Height")]
# Fit a regression tree of lung capacity on height.
# The formula uses bare column names resolved through `data = df`. Writing
# `df$LungCap ~ df$Height` ties the model to the global `df`, so predict()
# would ignore whatever is passed as `newdata`.
# minsplit = 3: a node needs at least 3 observations before a split is tried.
dt <- rpart(LungCap ~ Height, data = df, control = rpart.control(minsplit = 3))
Predict the result
# Build the prediction input with a column named exactly like the predictor,
# so that predict() really evaluates the tree on `new`.
new <- data.frame(Height = df$Height)
pd <- predict(dt, newdata = new)
# Store the fitted values next to the observations for plotting.
df$pd <- pd
Visualize the result
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Scatter plot of the observations with the tree's fitted values overlaid.
# Columns are referenced through `data = df` rather than `df$...` inside
# aes(), so the layers stay tied to the data frame they are drawn from.
ggplot(df, aes(x = Height)) +
  geom_point(aes(y = LungCap), color = 3) +
  geom_line(aes(y = pd), color = 2) +
  ggtitle("Decision Tree Regression Model") +
  xlab("Height") +
  ylab("Lung Capacity") +
  # theme_bw() is a complete theme: it must come BEFORE the theme() tweak,
  # otherwise it replaces the centred-title setting.
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
**Random Forest Model**
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit a random forest regression of lung capacity on height.
# - Bare column names + `data = df` keep the model independent of the global
#   `df`, so predict() honours `newdata`.
# - The number of trees is controlled by `ntree`; the original `ntrees` is not
#   an argument of randomForest() and was passed along through `...`.
# - TRUE is spelled out because `T` is an ordinary, reassignable variable.
rf <- randomForest(
  LungCap ~ Height,
  data = df,
  proximity = TRUE,
  importance = TRUE,
  ntree = 500
)
Predicting the result
# df["Height"] stays a one-column data frame with the predictor's name;
# df[, -c(1, 3)] would collapse to an unnamed vector.
pd_rf <- predict(rf, newdata = df["Height"])
# Adding this new predicted value to df
df$pd_rf <- pd_rf
Visualizing the result
# Scatter plot of the observations with the random forest predictions overlaid.
# Columns are referenced through `data = df` rather than `df$...` inside
# aes(), so the layers stay tied to the data frame they are drawn from.
ggplot(df, aes(x = Height)) +
  geom_point(aes(y = LungCap), color = 3) +
  geom_line(aes(y = pd_rf), color = 4) +
  theme_bw() +
  xlab("Height") +
  ylab("Lung Capacity") +
  ggtitle("Random Forest Regression Model")
Checking the effect of the number of trees on the model error
# Draw the fitted forest's diagnostic plot with a custom title, in colour 4.
plot(
  rf,
  col = 4,
  main = "The effect of tree size"
)
Logistic Regression Model
# Load the Social Network Ads data set (comma-separated) from GitHub.
social_network <- read.csv(
  file = "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/Social_Network_Ads.csv"
)
# Discard the first two columns; only the remaining ones are used below.
social_network <- social_network[, -(1:2)]
# Preview the first rows of the tidied data.
head(social_network)
## Age EstimatedSalary Purchased
## 1 19 19000 0
## 2 35 20000 0
## 3 26 43000 0
## 4 27 57000 0
## 5 19 76000 0
## 6 27 58000 0
Splitting training and testing sets
library(caTools)
## Warning: package 'caTools' was built under R version 3.2.5
# Fix the RNG seed so the random split below is reproducible between runs.
set.seed(123)
# Logical flag per row: TRUE for rows assigned to the training portion (75%).
split <- sample.split(social_network$Purchased, SplitRatio = 0.75)
# TRUE/FALSE are spelled out because `T`/`F` are ordinary variables that can
# be reassigned.
training_set <- subset(social_network, split == TRUE)
test_set <- subset(social_network, split == FALSE)
Applying feature scaling
# Standardise the two predictors (columns 1 and 2).
# The centring and scaling constants are estimated on the training set only
# and then reused for the test set. Scaling the test set with its own mean
# and sd would put the two sets on different scales, so a model fitted on the
# training data would be evaluated on inconsistently transformed inputs.
train_scaled <- scale(training_set[, 1:2])
test_set[, 1:2] <- scale(
  test_set[, 1:2],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
training_set[, 1:2] <- train_scaled
head(training_set)
## Age EstimatedSalary Purchased
## 1 -1.8176303 -1.4935873 0
## 2 -0.2447755 -1.4643128 0
## 4 -1.0312029 -0.3811546 0
## 5 -1.8176303 0.1750618 0
## 6 -1.0312029 -0.3518800 0
## 8 -0.5396858 2.3413782 1
Building a logistic regression model
# Logistic regression of Purchased on every other column of the training set.
logit_model <- glm(
  formula = Purchased ~ .,
  family = binomial,
  data = training_set
)
# Coefficient table and deviance statistics of the fitted model.
summary(logit_model)
##
## Call:
## glm(formula = Purchased ~ ., family = binomial, data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8979 -0.5680 -0.1714 0.4076 2.3108
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1206 0.1908 -5.872 4.30e-09 ***
## Age 2.3123 0.3009 7.685 1.53e-14 ***
## EstimatedSalary 1.1925 0.2078 5.739 9.51e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 390.89 on 299 degrees of freedom
## Residual deviance: 217.80 on 297 degrees of freedom
## AIC: 223.8
##
## Number of Fisher Scoring iterations: 6
Predicting the test_set
# Predicted probabilities for the test rows; column 3 is left out of the
# predictor data passed to predict().
pred_logit <- predict(
  logit_model,
  newdata = test_set[, -3],
  type = "response"
)
# Turn probabilities into 0/1 labels with a 0.5 cut-off.
Purchased <- ifelse(pred_logit > 0.5, 1, 0)
# Keep the predicted label alongside the observed one.
test_set[["Purchased_pred"]] <- Purchased
head(test_set)
## Age EstimatedSalary Purchased Purchased_pred
## 3 -1.0655117 -0.7605644 0 0
## 7 -0.9778152 0.4429532 0 0
## 13 -1.5916903 0.5016613 0 0
## 15 -1.7670831 0.3842450 0 0
## 16 -0.8024224 0.3255368 0 0
## 19 0.6884170 -1.2008757 1 0
Evaluating the logistic regression model
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.2.5
# Confusion matrix and accuracy statistics for the test-set predictions.
# confusionMatrix() requires factors that share the same levels, so both the
# predicted and the observed labels are converted explicitly. Fixing the
# levels to 0 and 1 also keeps the table 2 x 2 when one class is never
# predicted.
cm <- confusionMatrix(
  data = factor(test_set$Purchased_pred, levels = c(0, 1)),
  reference = factor(test_set$Purchased, levels = c(0, 1))
)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 61 9
## 1 3 27
##
## Accuracy : 0.88
## 95% CI : (0.7998, 0.9364)
## No Information Rate : 0.64
## P-Value [Acc > NIR] : 5.703e-08
##
## Kappa : 0.7297
## Mcnemar's Test P-Value : 0.1489
##
## Sensitivity : 0.9531
## Specificity : 0.7500
## Pos Pred Value : 0.8714
## Neg Pred Value : 0.9000
## Prevalence : 0.6400
## Detection Rate : 0.6100
## Detection Prevalence : 0.7000
## Balanced Accuracy : 0.8516
##
## 'Positive' Class : 0
##
Another way of obtaining a confusion matrix for the logistic regression model
# Cross-tabulate observed labels (rows) against predicted labels (columns).
tb <- table(
  test_set[["Purchased"]],
  test_set[["Purchased_pred"]]
)
tb
##
## 0 1
## 0 61 3
## 1 9 27
Visualizing the result
# install.packages("ElemStatLearn")
library(ElemStatLearn)
## Warning: package 'ElemStatLearn' was built under R version 3.2.5
# Plot the logistic model's decision regions together with the training data.
set <- training_set
# Grid over both predictors: observed range padded by 1 on each side, step 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# The grid columns must carry the predictor names used when fitting the model.
colnames(grid_set) <- c("Age", "EstimatedSalary")
# Predicted probability at every grid point, thresholded at 0.5 into 0/1.
prob_set <- predict(logit_model, newdata = grid_set, type = "response")
y_grid <- ifelse(prob_set > 0.5, 1, 0)
# Empty frame with the two predictors on the axes.
plot(
  set[, -3],
  xlab = "Age",
  ylab = "Estimated Salary",
  main = "Logistic Regression (training set)",
  xlim = range(X1),
  ylim = range(X2)
)
# Decision boundary: contour of the 0/1 grid reshaped to length(X1) x length(X2).
contour(
  X1, X2,
  matrix(as.numeric(y_grid), length(X1), length(X2)),
  add = TRUE
)
# Background shading: green where the model predicts 1, blue where it predicts 0.
points(grid_set, col = ifelse(y_grid == 1, "springgreen3", "blue"), pch = ".")
# Observations: yellow fill where column 3 equals 1, red otherwise.
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "yellow", "red3"))
# Yellow points mark observations with class `1`; red points mark class `0`
library(ElemStatLearn)
# Plot the logistic model's decision regions together with the test data.
# Column 4 of test_set is dropped so that column 3 is the last one left.
set <- test_set[, -4]
# Grid over both predictors: observed range padded by 1 on each side, step 0.01.
X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
# The grid columns must carry the predictor names used when fitting the model.
colnames(grid_set) <- c("Age", "EstimatedSalary")
# Predicted probability at every grid point, thresholded at 0.5 into 0/1.
prob_set <- predict(logit_model, newdata = grid_set, type = "response")
y_grid <- ifelse(prob_set > 0.5, 1, 0)
# Empty frame with the two predictors on the axes.
plot(
  set[, -3],
  xlab = "Age",
  ylab = "Estimated Salary",
  main = "Logistic Regression (Test set)",
  xlim = range(X1),
  ylim = range(X2)
)
# Decision boundary: contour of the 0/1 grid reshaped to length(X1) x length(X2).
contour(
  X1, X2,
  matrix(as.numeric(y_grid), length(X1), length(X2)),
  add = TRUE
)
# Background shading: green where the model predicts 1, blue where it predicts 0.
points(grid_set, col = ifelse(y_grid == 1, "springgreen3", "blue"), pch = ".")
# Observations: yellow fill where column 3 equals 1, red otherwise.
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "yellow", "red3"))
K-NN Classifier
We will use the same dataset as in the logistic regression model
# Build the K-NN model and predict the test set
library(caTools)
# Fix the RNG seed so the random split below is reproducible between runs.
set.seed(123)
# Logical flag per row: TRUE for rows assigned to the training half (50%).
split <- sample.split(social_network$Purchased, SplitRatio = 0.5)
# TRUE/FALSE are spelled out because `T`/`F` are ordinary variables that can
# be reassigned.
training_set <- subset(social_network, split == TRUE)
test_set <- subset(social_network, split == FALSE)
Building K-NN model
library(class)
# k-NN classifies by distance between observations, so the predictors must be
# on comparable scales; otherwise EstimatedSalary (tens of thousands) swamps
# Age (tens). Standardise with constants estimated on the training set and
# reuse them for the test set.
knn_train_x <- scale(training_set[, -3])
knn_test_x <- scale(
  test_set[, -3],
  center = attr(knn_train_x, "scaled:center"),
  scale = attr(knn_train_x, "scaled:scale")
)
# Predicted class of every test row from its 5 nearest training neighbours.
knn_model <- knn(
  train = knn_train_x,
  test = knn_test_x,
  cl = training_set$Purchased,
  k = 5
)