#Introduction In this case we have a dataset (iris) with four continuous columns and one categorical column. In this classification problem we seek to classify the flowers by species using their known sepal and petal attributes, that is, the width and length of each.
#Objective To understand the distribution of the continuous flower attributes from the descriptive statistics, and to judge whether each attribute is approximately normally distributed or skewed by comparing its mean with its median.
To classify the flowers into the three species categories from the four attributes using a decision tree algorithm, training the model on one section of the dataset and testing its classification performance on another section of the dataset.
Loading the required packages for the task
require(rpart)
## Loading required package: rpart
require(rpart.plot)
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 4.2.1
require(mlbench)
## Loading required package: mlbench
require(caret)
## Loading required package: caret
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.1
## Loading required package: lattice
require(pROC)
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggplot2)
Loading the data and transforming the variable of interest into a factor. Extracting descriptive statistics of the variables.
# Load the built-in iris data and keep a working copy named `mydata`.
data("iris")
mydata <- iris
# Preview the first rows.
head(mydata)
# Coerce the Species column to a factor.
mydata[["Species"]] <- as.factor(mydata[["Species"]])
# Descriptive statistics for every column.
summary(mydata)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
scatter plot of Sepal Length and Sepal width
# Sepal length against sepal width, coloured by species, with jittered points.
ggplot(
  data = mydata,
  mapping = aes(x = Sepal.Width, y = Sepal.Length, colour = Species)
) +
  geom_point(position = "jitter")
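Relationship between Petal Length and Sepal Length for each species, with a smoothed trend line (the default `geom_smooth()` fit, which is loess here rather than a straight line)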
# Petal length against sepal length, coloured by species: jittered points
# with the default geom_smooth() fit overlaid for each species.
ggplot(
  data = mydata,
  mapping = aes(x = Sepal.Length, y = Petal.Length, colour = Species)
) +
  geom_point(position = "jitter") +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Correlation test between Sepal Width and Petal Width, run separately within each species
# Correlation test of sepal width against petal width, setosa rows only.
with(
  mydata[mydata$Species == "setosa", ],
  cor.test(Sepal.Width, Petal.Width)
)
##
## Pearson's product-moment correlation
##
## data: Sepal.Width and Petal.Width
## t = 1.6581, df = 48, p-value = 0.1038
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0487543 0.4800023
## sample estimates:
## cor
## 0.232752
# Correlation test of sepal width against petal width, versicolor rows only.
with(
  mydata[mydata$Species == "versicolor", ],
  cor.test(Sepal.Width, Petal.Width)
)
##
## Pearson's product-moment correlation
##
## data: Sepal.Width and Petal.Width
## t = 6.1523, df = 48, p-value = 1.467e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4730884 0.7953482
## sample estimates:
## cor
## 0.6639987
# Correlation test of sepal width against petal width, virginica rows only.
with(
  mydata[mydata$Species == "virginica", ],
  cor.test(Sepal.Width, Petal.Width)
)
##
## Pearson's product-moment correlation
##
## data: Sepal.Width and Petal.Width
## t = 4.4187, df = 48, p-value = 5.648e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3050368 0.7098315
## sample estimates:
## cor
## 0.537728
##Decision Tree model
Splitting into train and test sets (approximately 75% / 25%)
# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)
# Draw a group label (1 or 2) for every row. Each row independently gets
# label 1 with probability 0.75 and label 2 with probability 0.25, so the
# resulting split is only approximately 75/25, not exact.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
ind <- sample(2, nrow(mydata), replace = TRUE, prob = c(0.75, 0.25))
# Rows labelled 1 form the training set; rows labelled 2 form the test set.
train <- mydata[ind == 1, ]
test <- mydata[ind == 2, ]
Training the decision tree model
# Fit a tree for Species using every other column of `train` as a predictor.
tree <- rpart(formula = Species ~ ., data = train)
# Draw the fitted tree.
rpart.plot(tree)
# Print the complexity-parameter table of the fit.
printcp(tree)
##
## Classification tree:
## rpart(formula = Species ~ ., data = train)
##
## Variables actually used in tree construction:
## [1] Petal.Length Petal.Width
##
## Root node error: 77/118 = 0.65254
##
## n= 118
##
## CP nsplit rel error xerror xstd
## 1 0.51948 0 1.000000 1.155844 0.060738
## 2 0.42857 1 0.480519 0.545455 0.067546
## 3 0.01000 2 0.051948 0.090909 0.033326
prediction and assessment on the train set
# Predicted class labels for the training rows, then a confusion matrix
# against the true species.
p <- predict(tree, newdata = train, type = "class")
confusionMatrix(data = p, reference = train$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 40 3
## virginica 0 1 34
##
## Overall Statistics
##
## Accuracy : 0.9661
## 95% CI : (0.9155, 0.9907)
## No Information Rate : 0.3475
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9491
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.000 0.9756 0.9189
## Specificity 1.000 0.9610 0.9877
## Pos Pred Value 1.000 0.9302 0.9714
## Neg Pred Value 1.000 0.9867 0.9639
## Prevalence 0.339 0.3475 0.3136
## Detection Rate 0.339 0.3390 0.2881
## Detection Prevalence 0.339 0.3644 0.2966
## Balanced Accuracy 1.000 0.9683 0.9533
prediction and assessment on the test set
# Predicted class labels for the held-out test rows, then a confusion matrix
# against the true species.
p1 <- predict(tree, newdata = test, type = "class")
confusionMatrix(data = p1, reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 2
## virginica 0 0 11
##
## Overall Statistics
##
## Accuracy : 0.9375
## 95% CI : (0.7919, 0.9923)
## No Information Rate : 0.4062
## P-Value [Acc > NIR] : 3.355e-10
##
## Kappa : 0.9062
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8462
## Specificity 1.0000 0.9130 1.0000
## Pos Pred Value 1.0000 0.8182 1.0000
## Neg Pred Value 1.0000 1.0000 0.9048
## Prevalence 0.3125 0.2812 0.4062
## Detection Rate 0.3125 0.2812 0.3438
## Detection Prevalence 0.3125 0.3438 0.3438
## Balanced Accuracy 1.0000 0.9565 0.9231
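#conclusion The attributes are continuous. From the descriptive statistics, the sepal measurements have means close to their medians (5.843 vs 5.800 for length, 3.057 vs 3.000 for width), which is consistent with roughly symmetric distributions, whereas the petal measurements have means noticeably below their medians (3.758 vs 4.350 for length, 1.199 vs 1.300 for width), which suggests skewed rather than normal distributions. The Decision Tree classifier performs well on both the train set (accuracy 0.9661) and the test set (accuracy 0.9375).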