library(ggplot2)
library(lattice)
library(AppliedPredictiveModeling)
library(caret)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the built-in iris data set and print per-column summary statistics.
data(iris)
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Preview the first six rows of the data set.
head(x = iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Number of observations in the full data set.
dim(iris)[1L]
## [1] 150
# Fix the RNG seed so the random steps that follow are reproducible.
set.seed(seed = 9999)
# Visualise the four numeric features against each other, grouped by species
# (caret's "ellipse" feature plot), with a semi-transparent lattice theme.
transparentTheme(trans = 0.4)
featurePlot(
  x = iris[, c(
    "Sepal.Length", "Sepal.Width",
    "Petal.Length", "Petal.Width"
  )],
  y = iris[["Species"]],
  plot = "ellipse",
  auto.key = list(columns = 3)
)

# Split into train and test data sets: createDataPartition() draws 80% of the
# rows (sampling on Species) for training; the remaining rows form the test set.
trainIndex <- createDataPartition(
  y = iris$Species,
  times = 1,
  p = 0.8,
  list = FALSE
)
# NOTE: the variable `train` shares its name with caret::train(). Calls such
# as train(...) still work because R skips non-function bindings when
# resolving a function call.
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]
nrow(train)
## [1] 120
nrow(test)
## [1] 30
# Resampling scheme passed to every train() call in this script:
# 10-fold cross-validation, repeated 5 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
# Decision tree (caret method "rpart") predicting Species from all other
# columns; predictors are centered and scaled before fitting.
dt.fit <- train(
  Species ~ .,
  data = train,
  method = "rpart",
  preProcess = c("center", "scale"),
  trControl = fitControl
)
print(dt.fit)
## CART
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00 0.9583333 0.9375
## 0.45 0.7716667 0.6575
## 0.50 0.3333333 0.0000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# Score the held-out test rows with the decision tree and compare the
# predicted classes with the true species.
predictions <- predict(dt.fit, newdata = test)
confusionMatrix(data = predictions, reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 1
## virginica 0 1 9
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 0.9000
## Specificity 1.0000 0.9500 0.9500
## Pos Pred Value 1.0000 0.9000 0.9000
## Neg Pred Value 1.0000 0.9500 0.9500
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3000
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9250 0.9250
# Plot the variable importance scores of the decision tree model.
plot(x = varImp(dt.fit))

# k-nearest-neighbours classifier predicting Species from all other columns;
# predictors are centered and scaled before fitting.
knn.fit <- train(
  Species ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = fitControl
)
print(knn.fit)
## k-Nearest Neighbors
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9583333 0.9375
## 7 0.9616667 0.9425
## 9 0.9600000 0.9400
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
# Score the held-out test rows with the k-NN model and compare the predicted
# classes with the true species.
predictions <- predict(knn.fit, newdata = test)
confusionMatrix(data = predictions, reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 8 1
## virginica 0 2 9
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.7347, 0.9789)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 1.665e-10
##
## Kappa : 0.85
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8000 0.9000
## Specificity 1.0000 0.9500 0.9000
## Pos Pred Value 1.0000 0.8889 0.8182
## Neg Pred Value 1.0000 0.9048 0.9474
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.2667 0.3000
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.8750 0.9000
# Plot the variable importance scores of the k-NN model.
plot(x = varImp(knn.fit))

# Random forest classifier predicting Species from all other columns;
# predictors are centered and scaled before fitting.
rf.fit <- train(
  Species ~ .,
  data = train,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = fitControl
)
print(rf.fit)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9533333 0.930
## 3 0.9566667 0.935
## 4 0.9566667 0.935
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Score the held-out test rows with the random forest and compare the
# predicted classes with the true species.
predictions <- predict(rf.fit, newdata = test)
confusionMatrix(data = predictions, reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 1
## virginica 0 1 9
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 0.9000
## Specificity 1.0000 0.9500 0.9500
## Pos Pred Value 1.0000 0.9000 0.9000
## Neg Pred Value 1.0000 0.9500 0.9500
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3000
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9250 0.9250
# Plot the variable importance scores of the random forest model.
plot(x = varImp(rf.fit))

# Create a linear regression model that predicts Sepal.Length from the
# remaining measurements and the species, fitted on the full iris data set.
#
# The columns are supplied through `data = iris` instead of `iris$...` terms
# inside the formula. With `iris$...` terms, predict(model, newdata = ...)
# cannot find the predictors in `newdata` and falls back to the original 150
# rows. The fitted coefficients are unchanged; only their labels differ.
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = iris
)
summary(model)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width +
## Species, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79424 -0.21874 0.00899 0.20255 0.73103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.17127 0.27979 7.760 1.43e-12 ***
## Sepal.Width 0.49589 0.08607 5.761 4.87e-08 ***
## Petal.Length 0.82924 0.06853 12.101 < 2e-16 ***
## Petal.Width -0.31516 0.15120 -2.084 0.03889 *
## Speciesversicolor -0.72356 0.24017 -3.013 0.00306 **
## Speciesvirginica -1.02350 0.33373 -3.067 0.00258 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3068 on 144 degrees of freedom
## Multiple R-squared: 0.8673, Adjusted R-squared: 0.8627
## F-statistic: 188.3 on 5 and 144 DF, p-value: < 2.2e-16
# Draw the standard diagnostic plots for the fitted linear model.
plot(model)



