#install.packages("caret")

#install.packages("caret", dependencies=c("Depends", "Suggests"))

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Load the built-in iris data and work on a copy called `dataset`.
data(iris)
dataset <- iris

# Alternative (manual) procedure: read the data from a CSV file in the
# local directory instead.
# filename <- "iris.csv"
# dataset <- read.csv(filename, header = FALSE)

# Set the column names explicitly; this is required when the data are read
# from the header-less CSV above.
colnames(dataset) <- c(
  "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"
)

# Fix the random seed so that the random train/validation split below is the
# same on every run.
set.seed(7)

# validation_index: row indices of the 80% of the rows used for training,
# drawn per Species level.
validation_index <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)
# Hold back the remaining 20% of the rows for the final validation.
validation <- dataset[-validation_index, ]
# Keep only the selected 80% for training and testing the models.
dataset <- dataset[validation_index, ]

# Size of the training data: number of rows and number of columns
dim(dataset)

## [1] 120   5

# Preview the leading rows of the data (six rows)
head(dataset, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
## 7          4.6         3.4          1.4         0.3  setosa

# Class labels present in the target column
levels(dataset[["Species"]])

## [1] "setosa"     "versicolor" "virginica"

# Class distribution: absolute counts next to their percentage share
percentage <- 100 * prop.table(table(dataset[["Species"]]))
cbind(freq = table(dataset[["Species"]]), percentage = percentage)

##            freq percentage
## setosa       40   33.33333
## versicolor   40   33.33333
## virginica    40   33.33333

# Summary statistics for every column
summary(dataset)

##   Sepal.Length   Sepal.Width     Petal.Length    Petal.Width          Species  
##  Min.   :4.30   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :40  
##  1st Qu.:5.10   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:40  
##  Median :5.80   Median :3.000   Median :4.250   Median :1.300   virginica :40  
##  Mean   :5.81   Mean   :3.058   Mean   :3.729   Mean   :1.189                  
##  3rd Qu.:6.40   3rd Qu.:3.225   3rd Qu.:5.100   3rd Qu.:1.800                  
##  Max.   :7.90   Max.   :4.400   Max.   :6.700   Max.   :2.500

# Split the training data into the input attributes (columns 1-4) and the
# output class (column 5).
x <- dataset[, 1:4]
y <- dataset[, 5]

# Draw one boxplot per input attribute, side by side on a single image.
# par() returns the previous settings; they are restored after the loop so
# the 1x4 layout does not leak into the plots that follow.
old_par <- par(mfrow = c(1, 4))
for (i in seq_along(x)) {
  # Title each panel with the name of the column that is actually plotted.
  boxplot(x[, i], main = names(x)[i])
}
par(old_par)

# Bar chart of the class breakdown
plot(y)

# Scatterplot matrix of the input attributes with ellipses
featurePlot(x = x, y = y, plot = "ellipse")

# Box-and-whisker plot of every attribute
featurePlot(x = x, y = y, plot = "box")

# Density plot of every attribute by class value; the x and y axes both use
# the "free" relation
free_axis <- list(relation = "free")
scales <- list(x = free_axis, y = free_axis)
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Evaluate every algorithm with 10-fold cross-validation and compare the
# models on the "Accuracy" metric.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# The seed is reset to the same value before each fit, so every model starts
# from the same random-number state.

# a) Linear algorithm: LDA
set.seed(7)
fit.lda <- train(
  Species ~ .,
  data = dataset, method = "lda", metric = metric, trControl = control
)

# b) Nonlinear algorithms: CART
set.seed(7)
fit.cart <- train(
  Species ~ .,
  data = dataset, method = "rpart", metric = metric, trControl = control
)

# b) Nonlinear algorithms: kNN
set.seed(7)
fit.knn <- train(
  Species ~ .,
  data = dataset, method = "knn", metric = metric, trControl = control
)

# c) Advanced algorithms: SVM (radial kernel)
set.seed(7)
fit.svm <- train(
  Species ~ .,
  data = dataset, method = "svmRadial", metric = metric, trControl = control
)

# c) Advanced algorithms: Random Forest
set.seed(7)
fit.rf <- train(
  Species ~ .,
  data = dataset, method = "rf", metric = metric, trControl = control
)

# Gather the resampling results of all five fitted models and summarize
# their accuracy
results <- resamples(
  list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf)
)
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, cart, knn, svm, rf 
## Number of resamples: 10 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## lda  0.8333333 1.0000000 1.0000000 0.9750000 1.0000000    1    0
## cart 0.8333333 0.9166667 0.9166667 0.9166667 0.9166667    1    0
## knn  0.9166667 0.9166667 1.0000000 0.9666667 1.0000000    1    0
## svm  0.8333333 0.9166667 0.9583333 0.9500000 1.0000000    1    0
## rf   0.8333333 0.9166667 0.9583333 0.9500000 1.0000000    1    0
## 
## Kappa 
##       Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## lda  0.750   1.000 1.0000 0.9625   1.000    1    0
## cart 0.750   0.875 0.8750 0.8750   0.875    1    0
## knn  0.875   0.875 1.0000 0.9500   1.000    1    0
## svm  0.750   0.875 0.9375 0.9250   1.000    1    0
## rf   0.750   0.875 0.9375 0.9250   1.000    1    0

# Compare the accuracy of the models in a dot plot
dotplot(results)

# Print the summary of the LDA fit, the model treated as best
print(fit.lda)

## Linear Discriminant Analysis 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results:
## 
##   Accuracy  Kappa 
##   0.975     0.9625

# Estimate the skill of LDA on the held-back validation rows: predict their
# class, then cross-tabulate the predictions against the true species
predictions <- predict(fit.lda, newdata = validation)
confusionMatrix(data = predictions, reference = validation$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         0
##   virginica       0          0        10
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8843, 1)
##     No Information Rate : 0.3333     
##     P-Value [Acc > NIR] : 4.857e-15  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           1.0000
## Specificity                 1.0000            1.0000           1.0000
## Pos Pred Value              1.0000            1.0000           1.0000
## Neg Pred Value              1.0000            1.0000           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3333
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            1.0000           1.0000

# install.packages("caret")

# install.packages("caret", dependencies = c("Depends", "Suggests"))

library(caret)

# attach the iris dataset to the environment
data(iris)
# rename the dataset
dataset <- iris

# define the filename-manual procedure
# filename <- "iris.csv"
# load the CSV file from the local directory
# dataset <- read.csv(filename, header = FALSE)

# set the column names in the dataset
colnames(dataset) <- c(
  "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"
)

# fix the random seed so the random split below is the same on every run
set.seed(7)

# validation_index
# create a list of 80% of the rows in the original dataset we can use for
# training
validation_index <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)
# select 20% of the data for validation
validation <- dataset[-validation_index, ]
# use the remaining 80% of data for training and testing the models
dataset <- dataset[validation_index, ]

# dimensions of dataset
dim(dataset)

# take a peek at the first rows of the data (head() shows six by default)
head(dataset)

# list the levels for the class
levels(dataset$Species)

# summarize the class distribution: counts and percentage share per class
percentage <- prop.table(table(dataset$Species)) * 100
cbind(freq = table(dataset$Species), percentage = percentage)

# summarize attribute distributions
summary(dataset)

# split input (columns 1-4) and output (column 5)
x <- dataset[, 1:4]
y <- dataset[, 5]

# boxplot for each attribute on one image
# par() returns the previous settings; they are restored after the loop so
# the 1x4 layout does not leak into the plots that follow.
old_par <- par(mfrow = c(1, 4))
for (i in seq_along(x)) {
  # title each panel with the name of the column that is actually plotted
  boxplot(x[, i], main = names(x)[i])
}
par(old_par)

# barplot for class breakdown
plot(y)

# scatterplot matrix
featurePlot(x = x, y = y, plot = "ellipse")

# box and whisker plots for each attribute
featurePlot(x = x, y = y, plot = "box")

# density plots for each attribute by class value
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Run algorithms using 10-fold cross validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# The seed is reset to the same value before each fit, so every model starts
# from the same random-number state.

# a) linear algorithms
set.seed(7)
fit.lda <- train(Species ~ ., data = dataset, method = "lda", metric = metric, trControl = control)
# b) nonlinear algorithms
# CART
set.seed(7)
fit.cart <- train(Species ~ ., data = dataset, method = "rpart", metric = metric, trControl = control)
# kNN
set.seed(7)
fit.knn <- train(Species ~ ., data = dataset, method = "knn", metric = metric, trControl = control)
# c) advanced algorithms
# SVM
set.seed(7)
fit.svm <- train(Species ~ ., data = dataset, method = "svmRadial", metric = metric, trControl = control)
# Random Forest
set.seed(7)
fit.rf <- train(Species ~ ., data = dataset, method = "rf", metric = metric, trControl = control)

# summarize accuracy of models
results <- resamples(list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf))
summary(results)

# compare accuracy of models
dotplot(results)

# summarize Best Model
print(fit.lda)

# estimate skill of LDA on the validation dataset
predictions <- predict(fit.lda, validation)
confusionMatrix(predictions, validation$Species)

## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Week2ML

Lisa Szydziak

3/3/2022