# Load the built-in iris data set and switch its column names to lower case,
# so the rest of the script can refer to e.g. `species` and `petal.width`.
data(iris)
colnames(iris) <- tolower(colnames(iris))
colnames(iris)
## [1] "sepal.length" "sepal.width" "petal.length" "petal.width"
## [5] "species"

# Quick look at the structure, the first rows and the overall dimensions.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ sepal.length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ sepal.width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ petal.length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ petal.width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris, n = 6L)
## sepal.length sepal.width petal.length petal.width species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
dim(iris)
## [1] 150 5
suppressMessages(library(caret))

# Fix the RNG seed so that the random train/test split and the
# cross-validation folds are the same on every run.
# NOTE: the outputs recorded in this script came from an unseeded run, so the
# resampling figures and the confusion matrix may differ slightly from them.
set.seed(123)

# Hold out 30% of the rows for testing; the split is made on `species`.
index <- createDataPartition(y = iris$species, p = 0.7, list = FALSE)
train <- iris[index, ]
test <- iris[-index, ]
dim(train)
## [1] 105 5
dim(test)
## [1] 45 5

# Fit an LDA classifier with 10-fold cross-validation.
# The data frame `train` has the same name as caret's train() function, so the
# call is namespace-qualified to make clear which one is meant.
lda.fit <- caret::train(
  species ~ .,
  data = train,
  method = "lda",
  trControl = trainControl(method = "cv", number = 10)
)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Print the fitted caret model: sample/predictor/class counts, the resampling
# scheme and the cross-validated accuracy and kappa.
lda.fit
## Linear Discriminant Analysis
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 95, 95, 95, 94, 93, 93, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9705556 0.9556818
##
##
# List the components stored in the fitted object (priors, group means,
# scaling, ...) with their length, class and mode.
summary(lda.fit)
## Length Class Mode
## prior 3 -none- numeric
## counts 3 -none- numeric
## means 12 -none- numeric
## scaling 8 -none- numeric
## lev 3 -none- character
## svd 2 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## xNames 4 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 3 -none- character
# Classify the held-out observations with the fitted model.
pred.species <- predict(lda.fit, newdata = test)

# Confusion matrix: rows are predicted classes, columns are the true classes.
table(pred.species, test$species)
##
## pred.species setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 1
## virginica 0 0 14

# Share of correctly classified test observations, in percent (2 decimals).
pred.accuracy <- round(100 * mean(pred.species == test$species), digits = 2)
pred.accuracy
## [1] 97.78
# Plot of the true species in the test set.
# qplot() is deprecated in ggplot2, and passing `cex = 2` to it maps the
# constant as an aesthetic (adding a spurious size legend). The point size is
# therefore set as a fixed value in geom_point() instead.
ggplot(test, aes(x = petal.width, y = sepal.width, colour = species)) +
  geom_point(size = 2)

# Plot of prediction correctness: `pred.right` is TRUE where the predicted
# species equals the true species and FALSE for misclassified observations.
test$pred.right <- pred.species == test$species
ggplot(test, aes(x = petal.width, y = sepal.width, colour = pred.right)) +
  geom_point(size = 2)
# Red points are the observations misclassified by the LDA model.