library(rpart)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(carData)
library(reshape2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v purrr 0.3.4
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x car::recode() masks dplyr::recode()
## x purrr::some() masks car::some()
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Load dataset ----
# Work on a copy of the built-in iris data frame.
IR <- iris
# View() opens a spreadsheet-style viewer, so only call it when a user is
# actually at the console; batch runs skip it.
if (interactive()) {
  View(IR)
}
# The data frame is deliberately NOT attach()ed: every later step reaches the
# columns through `IR` itself (`data = IR`, `IR$Species`), so putting a second
# copy of the columns on the search path would only invite name masking.
names(IR)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
summary(IR)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Long-format copy of the numeric measurements: drop column 5 (Species) and
# stack the remaining columns into variable/value pairs.
df3 <- IR[, -5] %>%
  as.data.frame() %>%
  melt()
## No id variables; using all as measure variables
# ---- CART method ------------------------------------------------------------
# Grow a classification tree for Species from all other columns of IR, then
# print its complexity-parameter (CP) table.
fit <- rpart(formula = Species ~ ., data = IR)
printcp(fit)
##
## Classification tree:
## rpart(formula = Species ~ ., data = IR)
##
## Variables actually used in tree construction:
## [1] Petal.Length Petal.Width
##
## Root node error: 100/150 = 0.66667
##
## n= 150
##
## CP nsplit rel error xerror xstd
## 1 0.50 0 1.00 1.19 0.049592
## 2 0.44 1 0.50 0.71 0.061150
## 3 0.01 2 0.06 0.09 0.029086
# Print the fitted tree: one line per node giving the split rule, the number
# of observations, the loss, the predicted class and the class probabilities.
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
# No pruning is applied here: in the recorded printcp(fit) table the lowest
# cross-validated error (xerror) is on the last row, i.e. the tree exactly as
# fitted, so `fit` is plotted unpruned.
# Plot the model
plot(fit)
# `pos` must be one of 1, 2, 3 or 4 (below, left, above, right of the point).
# The previous value 2.5 is not a documented position; 2 places each label to
# the left of its node. xpd = TRUE lets labels extend past the plot region.
text(fit, cex = 0.9, xpd = TRUE, pos = 2)

# Resubstitution check: predict the class of every row of IR, the same data
# `fit` was trained on. This is NOT a held-out test estimate; the train/test
# evaluation is done separately with `fit1`.
pred <- predict(fit, newdata = IR, type = "class")
# Original columns side by side with the predicted class.
p_frame <- data.frame(IR, pred)
# View() opens a spreadsheet-style viewer; skip it in batch runs.
if (interactive()) {
  View(p_frame)
}
# Data inspection: four randomly drawn rows. slice_sample() is the current
# dplyr replacement for the superseded sample_n().
slice_sample(IR, n = 4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 4.4 3.0 1.3 0.2 setosa
## 2 5.7 2.6 3.5 1.0 versicolor
## 3 6.8 2.8 4.8 1.4 versicolor
## 4 4.4 2.9 1.4 0.2 setosa
# Training and testing fit ----
# Seed the RNG BEFORE the random partition. Seeding only after
# createDataPartition() left the train/test split (and therefore the tree and
# the accuracy) different on every run.
# NOTE: the console output recorded further down came from an unseeded split,
# so a re-run may not reproduce those exact numbers.
set.seed(123)
# Row indices for the training set: 80% of the rows, drawn using Species as
# the outcome; list = FALSE returns them as a matrix usable for row indexing.
training_samples <- createDataPartition(IR$Species, p = 0.8, list = FALSE)
train.data <- IR[training_samples, ]
test.data <- IR[-training_samples, ]
# Build model: classification tree fitted on the training rows only.
fit1 <- rpart(Species ~ ., data = train.data, method = "class")
fit1
## n= 120
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 120 80 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.6 40 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.6 80 40 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Length< 4.75 37 1 versicolor (0.00000000 0.97297297 0.02702703) *
## 7) Petal.Length>=4.75 43 4 virginica (0.00000000 0.09302326 0.90697674) *
# Plot the tree fitted on the training data.
# xpd = NA clips drawing to the device region rather than the plot region, so
# node labels near the edges are not cut off. par() returns the previous
# setting, which is restored afterwards so the change does not leak into any
# later plot.
old_par <- par(xpd = NA)
plot(fit1)
text(fit1, digits = 3)
par(old_par)

# Make predictions: class labels for the held-out test rows from the tree
# fitted on the training rows.
predicted_class <- predict(fit1, newdata = test.data, type = "class")
head(predicted_class)
## 1 8 9 10 15 22
## setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
# Compute model accuracy: the share of test rows whose predicted class equals
# the true Species.
(predicted_class == test.data$Species) %>%
  mean()
## [1] 0.9333333
# In this recorded run the model classified 93.33% of the held-out test rows correctly.