library(ggplot2)
library(lattice)
library(AppliedPredictiveModeling)
library(caret)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Load the built-in iris data set into the workspace and print per-column
# summary statistics (five-number summary plus mean for the numeric columns,
# level counts for Species).
data(iris)
summary(object = iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Preview the first six observations.
head(x = iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Number of observations in the full data set.
dim(iris)[1L]
## [1] 150
# Fix the random seed so the random steps later in the script (data
# partitioning, resampling) are reproducible.
set.seed(9999)

# Visualise the four numeric measurements, grouped by species, using caret's
# "ellipse" feature plot. The lattice theme is made semi-transparent first and
# the legend is laid out in three columns.
transparentTheme(trans = 0.4)
featurePlot(
  plot = "ellipse",
  x = iris[, 1:4],
  y = iris$Species,
  auto.key = list(columns = 3)
)

# Split the data into a training set (80 %) and a test set (20 %).
# createDataPartition() samples on the Species outcome; list = FALSE returns
# the selected row indices as a matrix that can be used directly for indexing.
trainIndex <- createDataPartition(
  y = iris$Species,
  times = 1,
  p = 0.8,
  list = FALSE
)
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]

# Sanity check on the sizes of the two partitions.
nrow(train)
## [1] 120
nrow(test)
## [1] 30
# Resampling scheme passed to train(): 10-fold cross-validation, repeated
# 5 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
# Decision tree (caret method "rpart"): predict Species from all other
# columns of the training set. Predictors are centred and scaled, and the
# model is tuned with the resampling scheme in fitControl.
dt.fit <- train(
  Species ~ .,
  data = train,
  method = "rpart",
  preProcess = c("center", "scale"),
  trControl = fitControl
)

# Show the resampling results and the selected tuning parameter.
print(dt.fit)
## CART 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa 
##   0.00  0.9583333  0.9375
##   0.45  0.7716667  0.6575
##   0.50  0.3333333  0.0000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# Classify the held-out test rows with the fitted tree and compare the
# predicted classes against the true species.
predictions <- predict(dt.fit, newdata = test)
confusionMatrix(data = predictions, reference = test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0          9         1
##   virginica       0          1         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.7793, 0.9918)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 8.747e-12       
##                                           
##                   Kappa : 0.9             
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9000           0.9000
## Specificity                 1.0000            0.9500           0.9500
## Pos Pred Value              1.0000            0.9000           0.9000
## Neg Pred Value              1.0000            0.9500           0.9500
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3000           0.3000
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9250           0.9250
## Confusion Matrix and Statistics

# Plot the variable importance scores caret reports for the tree model.
dt_importance <- varImp(dt.fit)
plot(dt_importance)

# k-nearest neighbours (caret method "knn"): same formula, pre-processing and
# resampling scheme as the other classifiers in this script.
knn.fit <- train(
  Species ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = fitControl
)

# Show the resampling results and the selected tuning parameter.
print(knn.fit)
## k-Nearest Neighbors 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa 
##   5  0.9583333  0.9375
##   7  0.9616667  0.9425
##   9  0.9600000  0.9400
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
# Classify the held-out test rows with the fitted kNN model and compare the
# predicted classes against the true species.
predictions <- predict(knn.fit, newdata = test)
confusionMatrix(data = predictions, reference = test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0          8         1
##   virginica       0          2         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.7347, 0.9789)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 1.665e-10       
##                                           
##                   Kappa : 0.85            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8000           0.9000
## Specificity                 1.0000            0.9500           0.9000
## Pos Pred Value              1.0000            0.8889           0.8182
## Neg Pred Value              1.0000            0.9048           0.9474
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.2667           0.3000
## Detection Prevalence        0.3333            0.3000           0.3667
## Balanced Accuracy           1.0000            0.8750           0.9000
## Confusion Matrix and Statistics


# Plot the variable importance scores caret reports for the kNN model.
knn_importance <- varImp(knn.fit)
plot(knn_importance)

# Random forest (caret method "rf"): same formula, pre-processing and
# resampling scheme as the other classifiers in this script.
rf.fit <- train(
  Species ~ .,
  data = train,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = fitControl
)

# Show the resampling results and the selected tuning parameter.
print(rf.fit)
## Random Forest 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa
##   2     0.9533333  0.930
##   3     0.9566667  0.935
##   4     0.9566667  0.935
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
## Random Forest

# Classify the held-out test rows with the fitted random forest and compare
# the predicted classes against the true species.
predictions <- predict(rf.fit, newdata = test)
confusionMatrix(data = predictions, reference = test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0          9         1
##   virginica       0          1         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.7793, 0.9918)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 8.747e-12       
##                                           
##                   Kappa : 0.9             
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9000           0.9000
## Specificity                 1.0000            0.9500           0.9500
## Pos Pred Value              1.0000            0.9000           0.9000
## Neg Pred Value              1.0000            0.9500           0.9500
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3000           0.3000
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9250           0.9250
## Confusion Matrix and Statistics


# Plot the variable importance scores caret reports for the random forest.
rf_importance <- varImp(rf.fit)
plot(rf_importance)

# Create a linear regression model that predicts Sepal.Length from the other
# three measurements and the species, fitted on the full iris data set.
#
# The formula uses bare column names together with `data = iris`. Writing the
# terms as `iris$Sepal.Width` etc. fits the same model, but the variables are
# then looked up in the global `iris` object rather than in the data passed to
# the model, so predict(model, newdata = ...) silently ignores `newdata`, and
# every coefficient name carries an `iris$` prefix. The estimates themselves
# are unchanged.
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = iris
)
summary(model)
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + 
##     Species, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79424 -0.21874  0.00899  0.20255  0.73103 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
## Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
## Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
## Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
## Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
## Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3068 on 144 degrees of freedom
## Multiple R-squared:  0.8673, Adjusted R-squared:  0.8627 
## F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16

# Standard diagnostic plots for the fitted linear model.
plot(model)