iris_n.knit

Iris data | EDA | Decision Tree | Random Forest | SVM

library(ggplot2) # Data visualization
library(plotly) # Interactive data visualizations

## Warning: package 'plotly' was built under R version 4.3.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(psych) # Will be used for correlation visualizations

## Warning: package 'psych' was built under R version 4.3.3

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(rattle) # Graphing decision trees

## Warning: package 'rattle' was built under R version 4.3.3

## Loading required package: tibble

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(caret) # Machine learning

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: lattice

# Load the built-in iris data set into the workspace.
data(iris)

# Preview the first six rows to check the column layout.
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Column-wise summary: quartiles and mean for each measurement, counts for
# each species.
summary(object = iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

## Exploratory data analysis ----
# Scatter-plot / correlation matrix drawn with pairs.panels() from the psych
# package.

pairs.panels(
  iris[, 1:4],  # The four numeric measurement columns.
  main = "Correlation matrix of Iris data",  # Plot title.
  scale = TRUE,  # Scale the correlation labels by correlation strength.
  hist.col = "grey85",  # Fill colour of the diagonal histograms.
  pch = 21,  # Plotting symbol 21: a circle that is filled with `bg`.
  # One fill colour per Species level, looked up through the factor codes.
  bg = c("mediumseagreen", "orange2", "mediumpurple1")[iris$Species]
)

#3Dplot

library(plotly)
library(magrittr)

# Interactive 3D scatter plot of three measurements, coloured by species.
plot_ly(
  data = iris,
  type = "scatter3d",  # 3D scatter trace ...
  mode = "markers",  # ... drawn as individual markers.
  x = ~Sepal.Length,
  y = ~Petal.Length,
  z = ~Petal.Width,
  color = ~Species  # One colour per species.
) %>%
  layout(
    # Human-readable axis titles for the 3D scene.
    scene = list(
      xaxis = list(title = "Sepal length"),
      yaxis = list(title = "Petal length"),
      zaxis = list(title = "Petal width")
    )
  )

##### Box plots ----

# Draw a box plot of one iris measurement, grouped and filled by species.
#
# measure: name of a numeric column of `iris`, given as a string.
# label:   axis label for that measurement; its lower-cased form is also
#          used inside the plot title.
#
# Returns the ggplot object; calling the function at top level prints it.
plot_species_boxplot <- function(measure, label) {
  ggplot(
    data = iris,
    # `.data[[measure]]` looks the column up by its string name.
    mapping = aes(x = Species, y = .data[[measure]], fill = Species)
  ) +
    geom_boxplot() +
    scale_fill_brewer(palette = "Dark2") +  # Box fill colours.
    theme_light() +
    labs(
      title = paste("Box plot of", tolower(label), "for each species"),
      x = "Species",
      y = label
    )
}

# One plot per measurement, in the same order as before.
plot_species_boxplot("Sepal.Width", "Sepal width")

plot_species_boxplot("Sepal.Length", "Sepal length")

plot_species_boxplot("Petal.Width", "Petal width")

plot_species_boxplot("Petal.Length", "Petal length")

#### Split data into train and test ----
# Fix the random seed so the partition below is reproducible.
set.seed(222)

# Row indices of the training set, sampled with Species as the outcome.
train_index <- createDataPartition(
  y = iris$Species,  # Dependent variable.
  times = 1,  # Create a single partition.
  p = 0.7,  # 70% of the rows go to training, 30% to testing.
  list = FALSE  # Return the indices as a matrix rather than a list.
)

# Rows listed in train_index form the training data ...
train_data <- iris[train_index, ]
# ... and every remaining row forms the test data.
test_data <- iris[-train_index, ]


# Goal: predict which species (setosa, versicolor, virginica) each iris
# flower belongs to.
#### Decision tree ----
# Resampling scheme: 10-fold cross-validation, keeping the predictions.
fitControl <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = TRUE
)

# Fit the model with train() from the caret package; method = "rpart"
# selects a decision tree.
dt_model <- train(
  Species ~ .,  # Species as outcome, all other columns as predictors.
  data = train_data,  # Fit on the training rows only.
  method = "rpart",  # Decision tree model.
  trControl = fitControl  # Use the cross-validation settings above.
)

# Confusion matrix averaged over the cross-validation resamples.
confusionMatrix(dt_model)

## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##             Reference
## Prediction   setosa versicolor virginica
##   setosa       33.3        0.0       0.0
##   versicolor    0.0       29.5       5.7
##   virginica     0.0        3.8      27.6
##                             
##  Accuracy (average) : 0.9048

# Predict the species of the test rows with the fitted decision tree.
pred_dt <- predict(dt_model, newdata = test_data)
# Test accuracy: share of test rows whose prediction matches the true species.
accuracy_dt <- mean(test_data$Species == pred_dt)

# The average accuracy was 90.48% on resamples of the training data. The steps
# below check whether the test rows are predicted correctly or incorrectly.

# Check the importance of each feature in the decision tree model.

# Variable importance scores of the fitted model.
dt_importance <- varImp(dt_model)

# Plot the scores. ggplot() on a varImp object uses caret's own method, which
# derives the aesthetics and geometry from the importance table, so no
# `mapping` is passed. No box-plot layer is added either: there is a single
# score per feature, so a box would collapse to a line.
ggplot(dt_importance) +
  labs(title = "Variable importance: Decision tree model") +  # Title
  theme_light()  # Theme

# Draw the fitted tree with fancyRpartPlot() from the rattle package to show
# how the model arrives at its predictions. `sub` is set to an empty string.

fancyRpartPlot(dt_model[["finalModel"]], sub = "")

# PREDICTION: Decision tree model
# Reuse the test-set predictions already stored in pred_dt rather than calling
# predict() a second time on the same model and data.

prediction_dt <- pred_dt

# Cross-tabulate predicted against true species for the test data.
table(prediction_dt, test_data$Species) %>%
  prop.table() %>%  # Counts -> proportions of all test rows.
  round(digits = 2)  # Round to 2 decimal places.

##              
## prediction_dt setosa versicolor virginica
##    setosa       0.33       0.00      0.00
##    versicolor   0.00       0.31      0.00
##    virginica    0.00       0.02      0.33

# 10-fold cross-validation settings, passed to train() via trControl below.

fitControl <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = TRUE
)

###### Random forest ----
# Fit the model.
rf_model <- train(
  Species ~ .,  # Species as outcome, all other columns as predictors.
  data = train_data,  # Fit on the training rows only.
  method = "rf",  # Random forest.
  trControl = fitControl  # Cross-validation settings.
)


# Variable importance scores of the fitted random forest.
rf_importance <- varImp(rf_model)

# Plot the scores. ggplot() on a varImp object uses caret's own method, which
# derives the aesthetics and geometry from the importance table, so no
# `mapping` is passed. No box-plot layer is added either: there is a single
# score per feature, so a box would collapse to a line.
ggplot(rf_importance) +
  labs(title = "Variable importance: Random forest model") +  # Title
  theme_light()  # Theme

# Confusion matrix averaged over the cross-validation resamples.
confusionMatrix(rf_model)

## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##             Reference
## Prediction   setosa versicolor virginica
##   setosa       33.3        0.0       0.0
##   versicolor    0.0       29.5       2.9
##   virginica     0.0        3.8      30.5
##                             
##  Accuracy (average) : 0.9333

# Prediction: Random forest model
# Apply rf_model to the test data, i.e. the rows that were held out of
# train_data and therefore not used to fit the model.

prediction_rf <- predict(rf_model, newdata = test_data)
# Cross-tabulate predicted against true species for the test data.
table(prediction_rf, test_data$Species) %>%
  prop.table() %>%  # Counts -> proportions of all test rows.
  round(digits = 2)  # Round to 2 decimal places.

##              
## prediction_rf setosa versicolor virginica
##    setosa       0.33       0.00      0.00
##    versicolor   0.00       0.33      0.00
##    virginica    0.00       0.00      0.33

# Reuse the test-set predictions stored in prediction_rf rather than calling
# predict() again, so the table above and the accuracy share one result.
pred_rf <- prediction_rf
# Test accuracy: share of test rows whose prediction matches the true species.
accuracy_rf <- mean(pred_rf == test_data$Species)

##### SVM ----

# 10-fold cross-validation settings, keeping the predictions.
fitControl <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = TRUE
)

# Fit the model.
svm_model <- train(
  Species ~ .,  # Species as outcome, all other columns as predictors.
  data = train_data,  # Fit on the training rows only.
  method = "svmLinear",  # Support vector machine with a linear kernel.
  trControl = fitControl  # Cross-validation settings.
)

# Confusion matrix averaged over the cross-validation resamples.
confusionMatrix(svm_model)

## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##             Reference
## Prediction   setosa versicolor virginica
##   setosa       33.3        0.0       0.0
##   versicolor    0.0       30.5       1.0
##   virginica     0.0        2.9      32.4
##                             
##  Accuracy (average) : 0.9619

# Variable importance scores of the fitted support vector machine.
svm_importance <- varImp(svm_model)

# Plot the scores. ggplot() on a varImp object uses caret's own method, which
# derives the aesthetics and geometry from the importance table, so neither a
# `mapping` nor an extra box-plot layer is supplied.
ggplot(svm_importance) +
  labs(title = "Variable importance: Support vector machine model") +  # Title
  theme_light()  # Theme

# Predict the species of the test rows with the fitted SVM.
prediction_svm <- predict(svm_model, newdata = test_data)

# Cross-tabulate predicted against true species for the test data.
table(prediction_svm, test_data$Species) %>%
  prop.table() %>%  # Counts -> proportions of all test rows.
  round(digits = 2)  # Round to 2 decimal places.

##               
## prediction_svm setosa versicolor virginica
##     setosa       0.33       0.00      0.00
##     versicolor   0.00       0.33      0.00
##     virginica    0.00       0.00      0.33

# Reuse the test-set predictions stored in prediction_svm rather than calling
# predict() again on the same model and data.
pred_SVM <- prediction_svm

# Test accuracy: share of test rows whose prediction matches the true species.
accuracy_SVM <- mean(pred_SVM == test_data$Species)

# Compare results: one row per model with its accuracy on the test data.
results <- local({
  # Pair each model label with its accuracy; local() keeps this helper
  # vector out of the workspace.
  accuracy_by_model <- c(
    "Decision Tree" = accuracy_dt,
    "Random Forest" = accuracy_rf,
    "SVM" = accuracy_SVM
  )
  data.frame(
    Model = names(accuracy_by_model),
    Accuracy = unname(accuracy_by_model)
  )
})
results

##           Model  Accuracy
## 1 Decision Tree 0.9777778
## 2 Random Forest 1.0000000
## 3           SVM 1.0000000