iris classification

Load the required libraries that will be used for the classification and data analysis.

library(psych)          # provides pairs.panels() for the scatter-plot matrix
library(randomForest)   # provides randomForest(), the classification algorithm

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:psych':
## 
##     outlier

library(tidyverse) # for data wrangling

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## %+%():     ggplot2, psych
## alpha():   ggplot2, psych
## combine(): dplyr, randomForest
## filter():  dplyr, stats
## lag():     dplyr, stats
## margin():  ggplot2, randomForest

Load the iris dataset from R's built-in datasets package.

# Take a working copy of the built-in iris data frame (namespace-qualified, so
# no data() call is needed), then print per-column summary statistics.
iris_data <- datasets::iris
summary(iris_data)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

Check the data type of each column to better understand how many discrete and continuous variables are in the data frame. In R we use str() for this.

# str() lists each column's type together with its first few values.
str(iris_data)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

From the output above, we see that Sepal.Length, Sepal.Width, Petal.Length, and Petal.Width are all numeric, while Species is a factor (categorical).

Plot a scatter-plot matrix to get a better understanding of the correlations in the data.

# Scatter-plot matrix of every column against every other; gap = 0 removes
# the spacing between the panels.
psych::pairs.panels(x = iris_data, gap = 0)

# Pearson correlation matrix of the four numeric measurements (columns 1-4;
# column 5, Species, is a factor and is left out).
cor(iris_data[, 1:4])

##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000

# Pearson correlation test: is the sepal-length / petal-length correlation
# different from zero?
cor.test(iris_data$Sepal.Length, iris_data$Petal.Length)

## 
##  Pearson's product-moment correlation
## 
## data:  iris_data$Sepal.Length and iris_data$Petal.Length
## t = 21.646, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8270363 0.9055080
## sample estimates:
##       cor 
## 0.8717538

# Pearson correlation test: is the sepal-length / sepal-width correlation
# different from zero?
cor.test(iris_data$Sepal.Length, iris_data$Sepal.Width)

## 
##  Pearson's product-moment correlation
## 
## data:  iris_data$Sepal.Length and iris_data$Sepal.Width
## t = -1.4403, df = 148, p-value = 0.1519
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.27269325  0.04351158
## sample estimates:
##        cor 
## -0.1175698

# Pearson correlation test: is the petal-length / petal-width correlation
# different from zero?
cor.test(iris_data$Petal.Length, iris_data$Petal.Width)

## 
##  Pearson's product-moment correlation
## 
## data:  iris_data$Petal.Length and iris_data$Petal.Width
## t = 43.387, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9490525 0.9729853
## sample estimates:
##       cor 
## 0.9628654

# Hmisc supplies rcorr(). Attaching it this late masks dplyr's combine(),
# src() and summarize(), randomForest's combine() and psych's describe()
# (see the messages below), so call those with an explicit pkg:: prefix if
# they are needed afterwards.
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following object is masked from 'package:psych':
## 
##     describe

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

# Correlation matrix together with the matching matrix of p-values, computed
# on the four numeric measurements converted to a matrix.
rcorr(as.matrix(iris_data[, 1:4]))

##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length         1.00       -0.12         0.87        0.82
## Sepal.Width         -0.12        1.00        -0.43       -0.37
## Petal.Length         0.87       -0.43         1.00        0.96
## Petal.Width          0.82       -0.37         0.96        1.00
## 
## n= 150 
## 
## 
## P
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length              0.1519      0.0000       0.0000     
## Sepal.Width  0.1519                   0.0000       0.0000     
## Petal.Length 0.0000       0.0000                   0.0000     
## Petal.Width  0.0000       0.0000      0.0000

Model

# Split the data into predictors and response for randomForest's x/y interface.
M <- iris_data[, -5]  # the four numeric measurements (all columns but the 5th)
L <- iris_data[, 5]   # Species, a 3-level factor, so a classification forest

# A random forest draws bootstrap samples and random feature subsets, so fix
# the RNG seed to make the OOB error, confusion matrix and importances
# repeatable from run to run.
# NOTE: the output recorded below was produced without a seed, so a re-run may
# differ from it by a few misclassified flowers.
set.seed(42)

# Train the model with the package defaults and print its summary.
iris_rf <- randomForest(M, L)
iris_rf

## 
## Call:
##  randomForest(x = M, y = L) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.67%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         47         3        0.06
## virginica       0          4        46        0.08

# Per-feature importance matrix stored on the fitted forest (MeanDecreaseGini
# here); `[[` extracts the component by its exact name.
iris_rf[["importance"]]

##              MeanDecreaseGini
## Sepal.Length        10.064307
## Sepal.Width          2.724958
## Petal.Length        40.177059
## Petal.Width         46.253903

iris classification

Tajudeen Abdulazeez

October 22, 2018

Load the required libraries that will be used for the classification and data analysis.

Load the iris dataset from R's built-in datasets package.

Plot a scatter-plot matrix to get a better understanding of the correlations in the data.

Model