R Notebook

**Decision Tree Model**

This demo uses a dataset called LungCap, taken from “this page”.

library(rpart)

## Warning: package 'rpart' was built under R version 3.2.5

# Load the lung-capacity data; the fields in this file are separated by ";".
df <- read.csv(
  file = "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/LungCapData.csv",
  sep = ";"
)

# Show the first rows to check that the columns were parsed as expected.
head(df)

##   LungCap Age Height Smoke Gender Caesarean
## 1   6.475   6   62.1    no   male        no
## 2  10.125  18   74.7   yes female        no
## 3   9.550  16   69.7    no female       yes
## 4  11.125  14   71.0    no   male        no
## 5   4.800   5   56.9    no   male        no
## 6   6.225  11   58.7    no female        no

Fitting a decision tree model

library(rpart)
# We will work with a subset of this data: the response (LungCap) and a
# single predictor (Height), selected by name rather than by position.

df <- df[, c("LungCap", "Height")]
# Fitting a decision tree model. The formula uses bare column names that are
# resolved through `data = df`; writing `df$LungCap ~ df$Height` would tie the
# model to the global `df` and make predict() ignore whatever is in `newdata`.

dt <- rpart(LungCap ~ Height, data = df, control = rpart.control(minsplit = 3))

Predict the result

# Predict the result. The column in `newdata` must carry the predictor's
# name (Height) so that predict() can match it to the model term.

new <- data.frame(Height = df$Height)

pd <- predict(dt, newdata = new)

# create a new variable holding the tree's predictions

df$pd <- pd

Visualize the result

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.5

# Observed lung capacity against height, with the tree's predictions drawn as
# a line. theme_bw() is a complete theme and resets any earlier theme()
# settings, so it is applied first and the title centring afterwards.
ggplot(data = df, mapping = aes(x = Height)) +
  geom_point(aes(y = LungCap), color = 3) +
  geom_line(aes(y = pd), color = 2) +
  ggtitle("Decision Tree Regression Model") +
  xlab("Height") +
  ylab("Lung Capacity") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

**Random Forest Model**

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.2.5

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Fit the forest with bare column names resolved through `data = df`, so that
# predict() really uses the rows passed as `newdata`. The argument is spelled
# `ntree`; a misspelt `ntrees` is swallowed by `...` and has no effect.
rf <- randomForest(
  LungCap ~ Height,
  data = df,
  ntree = 500,
  proximity = TRUE,
  importance = TRUE
)

Predicting the result

# Pass the predictor as a one-column data frame named Height; negative
# indexing down to a single column would collapse it to an unnamed vector.
pd_rf <- predict(rf, newdata = df["Height"])

# Adding this new predicted value to df

df$pd_rf <- pd_rf

Visualizing the result

# Observed lung capacity against height, with the forest's predictions drawn
# as a line on top of the points.
ggplot(data = df, mapping = aes(x = Height)) +
  geom_point(mapping = aes(y = LungCap), color = 3) +
  geom_line(mapping = aes(y = pd_rf), color = 4) +
  labs(
    x = "Height",
    y = "Lung Capacity",
    title = "Random Forest Regression Model"
  ) +
  theme_bw()

Checking the effect of the number of trees on the model error

# Plot the fitted forest object with a custom title and line colour.
plot(
  rf,
  col = 4,
  main = "The effect of tree size"
)

Logistic Regression Model

# Read the social-network advertising data (read.csv's default "," separator).
social_network <- read.csv(
  file = "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/Social_Network_Ads.csv"
)

# Tidying up dataset: discard the first two columns and keep the rest.

social_network <- social_network[, -(1:2)]


head(social_network)

##   Age EstimatedSalary Purchased
## 1  19           19000         0
## 2  35           20000         0
## 3  26           43000         0
## 4  27           57000         0
## 5  19           76000         0
## 6  27           58000         0

Splitting training and testing sets

library(caTools)

## Warning: package 'caTools' was built under R version 3.2.5

# Fix the RNG seed so that the random split, and every number derived from
# it further down, can be reproduced on a re-run.
set.seed(123)

# Logical vector with one flag per row: TRUE for roughly 75% of the rows.
split <- sample.split(social_network$Purchased, SplitRatio = 0.75)

# Rows flagged TRUE form the training set; the remaining rows the test set.
training_set <- subset(social_network, split == TRUE)

test_set <- subset(social_network, split == FALSE)

Applying feature scaling

# Standardise the two predictor columns. The centring and scaling constants
# are estimated on the training set only and then reused for the test set, so
# that test rows are expressed on the same scale the model is fitted on.
# (Scaling the test set with its own mean/sd would shift it relative to the
# training data.)
train_scaled <- scale(training_set[, 1:2])

training_set[, 1:2] <- train_scaled

test_set[, 1:2] <- scale(
  test_set[, 1:2],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

head(training_set)

##          Age EstimatedSalary Purchased
## 1 -1.8176303      -1.4935873         0
## 2 -0.2447755      -1.4643128         0
## 4 -1.0312029      -0.3811546         0
## 5 -1.8176303       0.1750618         0
## 6 -1.0312029      -0.3518800         0
## 8 -0.5396858       2.3413782         1

Building a logistic regression model

# Logistic regression of Purchased on every other column of the training set.
logit_model <- glm(
  formula = Purchased ~ .,
  family = binomial,
  data = training_set
)

# Coefficient table and deviance summary of the fitted model.
summary(logit_model)

## 
## Call:
## glm(formula = Purchased ~ ., family = binomial, data = training_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8979  -0.5680  -0.1714   0.4076   2.3108  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.1206     0.1908  -5.872 4.30e-09 ***
## Age               2.3123     0.3009   7.685 1.53e-14 ***
## EstimatedSalary   1.1925     0.2078   5.739 9.51e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 390.89  on 299  degrees of freedom
## Residual deviance: 217.80  on 297  degrees of freedom
## AIC: 223.8
## 
## Number of Fisher Scoring iterations: 6

Predicting the test_set

# Predicted probabilities for the test rows; the third column (the response)
# is left out of the data handed to predict().
pred_logit <- predict(
  object = logit_model,
  newdata = test_set[, -3],
  type = "response"
)

# Turn the probabilities into 0/1 labels with a 0.5 cut-off.
Purchased <- ifelse(pred_logit > 0.5, 1, 0)

# Store the predicted labels next to the observed ones.
test_set$Purchased_pred <- Purchased

head(test_set)

##           Age EstimatedSalary Purchased Purchased_pred
## 3  -1.0655117      -0.7605644         0              0
## 7  -0.9778152       0.4429532         0              0
## 13 -1.5916903       0.5016613         0              0
## 15 -1.7670831       0.3842450         0              0
## 16 -0.8024224       0.3255368         0              0
## 19  0.6884170      -1.2008757         1              0

Evaluating the logistic regression model

library(caret)

## Warning: package 'caret' was built under R version 3.2.5

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.2.5

# confusionMatrix() expects factors with identical levels, so both the
# predictions and the reference labels are converted with explicit 0/1
# levels. Fixing the levels also keeps both classes in the table even if one
# of them is never predicted.
class_levels <- c(0, 1)

cm <- confusionMatrix(
  data = factor(test_set$Purchased_pred, levels = class_levels),
  reference = factor(test_set$Purchased, levels = class_levels)
)

cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 61  9
##          1  3 27
##                                           
##                Accuracy : 0.88            
##                  95% CI : (0.7998, 0.9364)
##     No Information Rate : 0.64            
##     P-Value [Acc > NIR] : 5.703e-08       
##                                           
##                   Kappa : 0.7297          
##  Mcnemar's Test P-Value : 0.1489          
##                                           
##             Sensitivity : 0.9531          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.8714          
##          Neg Pred Value : 0.9000          
##              Prevalence : 0.6400          
##          Detection Rate : 0.6100          
##    Detection Prevalence : 0.7000          
##       Balanced Accuracy : 0.8516          
##                                           
##        'Positive' Class : 0               
##

Another way of obtaining confusion matrix for logistic regression model

# Cross-tabulate the labels: rows are the observed Purchased values, columns
# the predicted ones.
tb <- table(test_set[["Purchased"]], test_set[["Purchased_pred"]])

tb

##    
##      0  1
##   0 61  3
##   1  9 27

Visualizing the result

# install.packages("ElemStatLearn")

library(ElemStatLearn)

## Warning: package 'ElemStatLearn' was built under R version 3.2.5

# Decision regions of the logistic model, drawn with the training points.
set <- training_set

# Axis grids: one unit beyond the observed range of each predictor, in steps
# of 0.01.
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)

X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)

# Every (X1, X2) combination, named like the model's predictors so that
# predict() can use it as newdata.
grid_set <- expand.grid(X1, X2)

names(grid_set) <- c("Age", "EstimatedSalary")

# Predicted probability at each grid point, thresholded at 0.5.
prob_set <- predict(logit_model, newdata = grid_set, type = "response")

y_grid <- ifelse(prob_set > 0.5, 1, 0)

# Empty-ish scatter frame first, then boundary, coloured regions and points.
plot(
  set[, -3],
  main = "Logistic Regression (training set)",
  xlab = "Age",
  ylab = "Estimated Salary",
  xlim = range(X1),
  ylim = range(X2)
)

contour(
  X1,
  X2,
  matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
  add = TRUE
)

points(grid_set, pch = ".", col = ifelse(y_grid == 1, "springgreen3", "blue"))

points(set, pch = 21, bg = ifelse(set[, 3] == 1, "yellow", "red3"))

# Yellow points are observations with class `1`; red points are class `0`

library(ElemStatLearn)


# Decision regions of the logistic model, drawn with the test points. The
# fourth column of test_set is dropped so that column 3 is the observed label.
set <- test_set[, -4]

# Axis grids: one unit beyond the observed range of each predictor, in steps
# of 0.01.
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)

X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)

# Every (X1, X2) combination, named like the model's predictors so that
# predict() can use it as newdata.
grid_set <- expand.grid(X1, X2)

names(grid_set) <- c("Age", "EstimatedSalary")

# Predicted probability at each grid point, thresholded at 0.5.
prob_set <- predict(logit_model, newdata = grid_set, type = "response")

y_grid <- ifelse(prob_set > 0.5, 1, 0)

# Scatter frame first, then boundary, coloured regions and points.
plot(
  set[, -3],
  main = "Logistic Regression (Test set)",
  xlab = "Age",
  ylab = "Estimated Salary",
  xlim = range(X1),
  ylim = range(X2)
)

contour(
  X1,
  X2,
  matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
  add = TRUE
)

points(grid_set, pch = ".", col = ifelse(y_grid == 1, "springgreen3", "blue"))

points(set, pch = 21, bg = ifelse(set[, 3] == 1, "yellow", "red3"))

K-NN Classifier

We will use the same dataset as in the logistic regression model.

# build K-NN model and predicting the test set
library(caTools)

# Fix the RNG seed so that the random split can be reproduced on a re-run.
set.seed(123)

# Logical vector with one flag per row: TRUE for roughly half of the rows.
split <- sample.split(social_network$Purchased, SplitRatio = 0.5)

training_set <- subset(social_network, split == TRUE)

test_set <- subset(social_network, split == FALSE)

# K-NN classifies by distance, and EstimatedSalary (tens of thousands) would
# swamp Age (tens) on the raw scale. Standardise both predictors using the
# training set's mean and sd, and apply the same constants to the test set.
knn_scaled <- scale(training_set[, 1:2])

training_set[, 1:2] <- knn_scaled

test_set[, 1:2] <- scale(
  test_set[, 1:2],
  center = attr(knn_scaled, "scaled:center"),
  scale = attr(knn_scaled, "scaled:scale")
)

Building K-NN model

library(class)

# Classify every test row from its 5 nearest training rows. The third column
# (the label) is removed from both feature sets and supplied separately as
# the class vector for the training rows.
knn_model <- knn(
  train = training_set[, -3],
  test = test_set[, -3],
  cl = training_set$Purchased,
  k = 5
)