R Notebook

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

library(rsconnect)
library(rattle)

## Loading required package: tibble

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(ggplot2)
library(caret)

## Loading required package: lattice

library(rpart)
library(rpart.plot)
library(corrplot)

## corrplot 0.92 loaded

library(corrplot)
library(RColorBrewer)
library(lattice)
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:rattle':
## 
##     importance

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(gbm)

## Loaded gbm 2.1.8.1

# Fix the RNG state before any random step of the analysis.
set.seed(222)

# Locations of the labelled training file and the quiz file.
url_train <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url_quiz  <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Both files are parsed the same way: surrounding whitespace is stripped and
# both "NA" and empty fields are read as missing values.
read_pml <- function(address) {
  read.csv(url(address), strip.white = TRUE, na.strings = c("NA", ""))
}
data_train <- read_pml(url_train)
data_quiz  <- read_pml(url_quiz)

dim(data_train)

## [1] 19622   160

dim(data_quiz)

## [1]  20 160

# Split the labelled data 75/25 into a training set and a hold-out test set,
# sampling row indices with caret's createDataPartition() on the outcome
# column `classe`.
in_train <- createDataPartition(
  y = data_train[["classe"]],
  p = 0.75,
  list = FALSE
)
train_set <- data_train[in_train, ]
test_set  <- data_train[-in_train, ]

dim(train_set)

## [1] 14718   160

dim(test_set)

## [1] 4904  160

# Drop near-zero-variance columns. The column indices are computed on the
# training split only and then applied to both splits so their columns stay
# aligned.
nzv_var <- nearZeroVar(train_set)
# Guard the negative index: when no column is flagged, nzv_var is integer(0)
# and `df[, -integer(0)]` would select zero columns instead of all of them.
if (length(nzv_var) > 0) {
  train_set <- train_set[, -nzv_var]
  test_set <- test_set[, -nzv_var]
}
dim(train_set)

## [1] 14718   120

dim(test_set)

## [1] 4904  120

# Flag columns whose share of missing values in the training split exceeds
# 95%, then drop them from both splits. colMeans() over the logical is.na()
# matrix always yields a named numeric vector, unlike sapply(), whose return
# type depends on its input.
na_var <- colMeans(is.na(train_set)) > 0.95
train_set <- train_set[, !na_var]
test_set <- test_set[, !na_var]
dim(train_set)

## [1] 14718    59

dim(test_set)

## [1] 4904   59

# Drop the first five remaining columns, by position, from both splits.
# NOTE(review): presumably these are row-id / user / timestamp columns rather
# than sensor readings -- confirm against names(train_set).
id_cols <- seq_len(5)
train_set <- train_set[, -id_cols]
test_set  <- test_set[, -id_cols]

dim(train_set)

## [1] 14718    54

dim(test_set)

## [1] 4904   54

# Correlation matrix of the predictors, drawn as the lower triangle with
# circles and ordered by first principal component ("FPC").
# The outcome `classe` is excluded by name rather than by a hard-coded column
# position, so this keeps working if the number of retained columns changes.
predictor_cols <- names(train_set) != "classe"
corr_matrix <- cor(train_set[, predictor_cols])
# Text-label arguments are `tl.*` (letter "l"); the former `t1.*` spelling was
# not recognised by corrplot and only produced graphical-parameter warnings.
corrplot(corr_matrix, order = "FPC", method = "circle", type = "lower",
         tl.cex = 0.6, tl.col = rgb(0, 0, 0))

## Warning in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt =
## tl.srt, : "t1.cex" is not a graphical parameter

## Warning in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt =
## tl.srt, : "t1.col" is not a graphical parameter

## Warning in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col =
## tl.col, : "t1.cex" is not a graphical parameter

## Warning in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col =
## tl.col, : "t1.col" is not a graphical parameter

## Warning in title(title, ...): "t1.cex" is not a graphical parameter

## Warning in title(title, ...): "t1.col" is not a graphical parameter

### Decision Tree Model:
# Fit a classification tree of `classe` on all remaining predictors and draw it.
set.seed(2222)
fit_decision_tree <- rpart(
  formula = classe ~ .,
  data = train_set,
  method = "class"
)
fancyRpartPlot(fit_decision_tree)

# Predict class labels for the hold-out set and tabulate them against the
# true labels.
predict_decision_tree <- predict(
  fit_decision_tree,
  newdata = test_set,
  type = "class"
)
conf_matrix_decision_tree <- confusionMatrix(
  data = predict_decision_tree,
  reference = factor(test_set[["classe"]])
)
conf_matrix_decision_tree

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1238  218   37   76   36
##          B   41  547   28   30   19
##          C    8   53  688  114   38
##          D   70   91   50  518  111
##          E   38   40   52   66  697
## 
## Overall Statistics
##                                           
##                Accuracy : 0.752           
##                  95% CI : (0.7397, 0.7641)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.685           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8875   0.5764   0.8047   0.6443   0.7736
## Specificity            0.8954   0.9702   0.9474   0.9215   0.9510
## Pos Pred Value         0.7713   0.8226   0.7636   0.6167   0.7805
## Neg Pred Value         0.9524   0.9052   0.9583   0.9296   0.9491
## Prevalence             0.2845   0.1935   0.1743   0.1639   0.1837
## Detection Rate         0.2524   0.1115   0.1403   0.1056   0.1421
## Detection Prevalence   0.3273   0.1356   0.1837   0.1713   0.1821
## Balanced Accuracy      0.8914   0.7733   0.8760   0.7829   0.8623

# Plot the decision tree's confusion table, with the hold-out accuracy
# (rounded to 4 decimals) in the title.
# NOTE(review): `col` receives the numeric by-class statistics matrix, not a
# colour vector -- confirm this colouring is intentional.
tree_accuracy <- round(conf_matrix_decision_tree$overall["Accuracy"], 4)
plot(
  conf_matrix_decision_tree$table,
  col = conf_matrix_decision_tree$byClass,
  main = paste("Decision Tree Model: Predictive Accuracy =", tree_accuracy)
)

# Gradient boosting model trained through caret, resampled with 5-fold
# cross-validation repeated twice. The seed is reset so the resampling is
# reproducible.
set.seed(2222)
ctrl_GBM <- trainControl(method = "repeatedcv", number = 5, repeats = 2)
fit_GBM <- train(
  classe ~ .,
  data = train_set,
  method = "gbm",
  trControl = ctrl_GBM,
  verbose = FALSE
)
fit_GBM[["finalModel"]]

## A gradient boosted model with multinomial loss function.
## 150 iterations were performed.
## There were 53 predictors of which 52 had non-zero influence.

# Score the GBM on the hold-out set and compare with the true labels.
predict_GBM <- predict(fit_GBM, newdata = test_set)
conf_matrix_GBM <- confusionMatrix(
  data = predict_GBM,
  reference = factor(test_set[["classe"]])
)
conf_matrix_GBM

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1393    7    0    1    0
##          B    2  929    3    4    2
##          C    0   11  842   12    1
##          D    0    2   10  784    9
##          E    0    0    0    3  889
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9863          
##                  95% CI : (0.9827, 0.9894)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9827          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9986   0.9789   0.9848   0.9751   0.9867
## Specificity            0.9977   0.9972   0.9941   0.9949   0.9993
## Pos Pred Value         0.9943   0.9883   0.9723   0.9739   0.9966
## Neg Pred Value         0.9994   0.9950   0.9968   0.9951   0.9970
## Prevalence             0.2845   0.1935   0.1743   0.1639   0.1837
## Detection Rate         0.2841   0.1894   0.1717   0.1599   0.1813
## Detection Prevalence   0.2857   0.1917   0.1766   0.1642   0.1819
## Balanced Accuracy      0.9981   0.9881   0.9894   0.9850   0.9930

# Random forest trained through caret with the same resampling scheme as the
# GBM: 5-fold cross-validation repeated twice, with the seed reset beforehand.
set.seed(2222)
ctrl_RF <- trainControl(method = "repeatedcv", number = 5, repeats = 2)
fit_RF <- train(
  classe ~ .,
  data = train_set,
  method = "rf",
  trControl = ctrl_RF,
  verbose = FALSE
)
fit_RF[["finalModel"]]

## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry, verbose = FALSE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 27
## 
##         OOB estimate of  error rate: 0.24%
## Confusion matrix:
##      A    B    C    D    E  class.error
## A 4183    1    0    0    1 0.0004778973
## B    8 2836    3    1    0 0.0042134831
## C    0    6 2561    0    0 0.0023373588
## D    0    0    7 2404    1 0.0033167496
## E    0    1    0    7 2698 0.0029563932

# Score the random forest on the hold-out set in the same way as the decision
# tree and the GBM, so the model used for the final predictions has a
# comparable out-of-sample accuracy on record.
predict_RF <- predict(fit_RF, newdata = test_set)
conf_matrix_RF <- confusionMatrix(predict_RF, factor(test_set$classe))
conf_matrix_RF

# Apply the random forest to the 20 quiz cases.
predict_quiz <- as.data.frame(predict(fit_RF, newdata = data_quiz))
predict_quiz

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.