This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(rsconnect)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(ggplot2)
library(caret)
## Loading required package: lattice
library(rpart)
library(rpart.plot)
library(corrplot)
## corrplot 0.92 loaded
library(corrplot)
library(RColorBrewer)
library(lattice)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:rattle':
##
## importance
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gbm)
## Loaded gbm 2.1.8.1
# Fix the RNG state so that the random train/test partition below is repeatable.
set.seed(222)

# Remote locations of the labelled data and of the quiz cases.
url_train <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url_quiz <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Read both CSV files; the literal string "NA" and empty fields are both
# treated as missing values.
data_train <- read.csv(
  file = url(url_train),
  na.strings = c("NA", ""),
  strip.white = TRUE
)
data_quiz <- read.csv(
  file = url(url_quiz),
  na.strings = c("NA", ""),
  strip.white = TRUE
)
dim(data_train)
## [1] 19622 160
dim(data_quiz)
## [1] 20 160

# Partition the labelled data on the outcome `classe`: 75% of the rows go to
# the training split, the remaining rows form the held-out test split.
in_train <- createDataPartition(y = data_train$classe, p = 0.75, list = FALSE)
train_set <- data_train[in_train, ]
test_set <- data_train[-in_train, ]
dim(train_set)
## [1] 14718 160
dim(test_set)
## [1] 4904 160
# Step 1: drop near-zero-variance columns. They are identified on the training
# split only and then removed from both splits so the columns stay aligned.
nzv_var <- nearZeroVar(train_set)
# Guard: when nothing is flagged nzv_var is integer(0), and indexing with
# -integer(0) would select zero columns instead of keeping all of them.
if (length(nzv_var) > 0) {
  train_set <- train_set[, -nzv_var, drop = FALSE]
  test_set <- test_set[, -nzv_var, drop = FALSE]
}
dim(train_set)
## [1] 14718 120
dim(test_set)
## [1] 4904 120

# Step 2: drop columns in which more than 95% of the training values are NA.
# vapply() guarantees a plain numeric vector regardless of the input shape.
na_var <- vapply(train_set, function(x) mean(is.na(x)), numeric(1)) > 0.95
train_set <- train_set[, !na_var, drop = FALSE]
test_set <- test_set[, !na_var, drop = FALSE]
dim(train_set)
## [1] 14718 59
dim(test_set)
## [1] 4904 59

# Step 3: drop the first five remaining columns by position.
# NOTE(review): presumably these are row-id / user / timestamp bookkeeping
# columns rather than sensor readings -- confirm against names(train_set).
train_set <- train_set[, -seq_len(5), drop = FALSE]
test_set <- test_set[, -seq_len(5), drop = FALSE]
dim(train_set)
## [1] 14718 54
dim(test_set)
## [1] 4904 54
# Pairwise correlations between the predictors. The outcome `classe` is
# excluded by name instead of by a hard-coded column position, so this keeps
# working if the number of retained columns changes.
corr_matrix <- cor(train_set[, names(train_set) != "classe", drop = FALSE])

# Lower-triangle circle plot, variables ordered by first principal component.
# The text-label arguments are `tl.cex` / `tl.col` (letter "l" for "label");
# spelling them with the digit one makes corrplot forward them to the base
# graphics calls, which ignore them and emit "not a graphical parameter"
# warnings.
corrplot(corr_matrix, order = "FPC", method = "circle", type = "lower",
         tl.cex = 0.6, tl.col = rgb(0, 0, 0))
### Decision Tree Model:
# Fit a single classification tree on all retained predictors.
set.seed(2222)
fit_decision_tree <- rpart(classe ~ ., data = train_set, method = "class")
fancyRpartPlot(fit_decision_tree)

# Evaluate on the held-out split. The reference factor reuses the prediction's
# level set so both factors agree even if some class were absent from test_set.
predict_decision_tree <- predict(fit_decision_tree, newdata = test_set, type = "class")
conf_matrix_decision_tree <- confusionMatrix(
  predict_decision_tree,
  factor(test_set$classe, levels = levels(predict_decision_tree))
)
conf_matrix_decision_tree
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1238 218 37 76 36
## B 41 547 28 30 19
## C 8 53 688 114 38
## D 70 91 50 518 111
## E 38 40 52 66 697
##
## Overall Statistics
##
## Accuracy : 0.752
## 95% CI : (0.7397, 0.7641)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.685
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8875 0.5764 0.8047 0.6443 0.7736
## Specificity 0.8954 0.9702 0.9474 0.9215 0.9510
## Pos Pred Value 0.7713 0.8226 0.7636 0.6167 0.7805
## Neg Pred Value 0.9524 0.9052 0.9583 0.9296 0.9491
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2524 0.1115 0.1403 0.1056 0.1421
## Detection Prevalence 0.3273 0.1356 0.1837 0.1713 0.1821
## Balanced Accuracy 0.8914 0.7733 0.8760 0.7829 0.8623

# Mosaic plot of the confusion table, one colour per class. The per-class
# statistics in `$byClass` are numbers below 1, not colour specifications, so
# they must not be used as the fill colours; a qualitative palette is used
# instead.
class_colours <- brewer.pal(n = ncol(conf_matrix_decision_tree$table), name = "Set2")
plot(conf_matrix_decision_tree$table, col = class_colours,
     main = paste("Decision Tree Model: Predictive Accuracy =",
                  round(conf_matrix_decision_tree$overall["Accuracy"], 4)))
### Gradient Boosting Model:
# Re-seed so the cross-validation resamples are repeatable.
set.seed(2222)

# Resampling scheme: 5-fold cross-validation, repeated twice.
ctrl_GBM <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2
)

# Train the boosted model on every retained predictor; verbose = FALSE keeps
# the per-iteration fitting log out of the output.
fit_GBM <- train(
  classe ~ .,
  data = train_set,
  method = "gbm",
  trControl = ctrl_GBM,
  verbose = FALSE
)
fit_GBM$finalModel
## A gradient boosted model with multinomial loss function.
## 150 iterations were performed.
## There were 53 predictors of which 52 had non-zero influence.

# Score the held-out split and compare against the true classes.
predict_GBM <- predict(fit_GBM, newdata = test_set)
conf_matrix_GBM <- confusionMatrix(
  data = predict_GBM,
  reference = factor(test_set$classe)
)
conf_matrix_GBM
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1393 7 0 1 0
## B 2 929 3 4 2
## C 0 11 842 12 1
## D 0 2 10 784 9
## E 0 0 0 3 889
##
## Overall Statistics
##
## Accuracy : 0.9863
## 95% CI : (0.9827, 0.9894)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9827
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9986 0.9789 0.9848 0.9751 0.9867
## Specificity 0.9977 0.9972 0.9941 0.9949 0.9993
## Pos Pred Value 0.9943 0.9883 0.9723 0.9739 0.9966
## Neg Pred Value 0.9994 0.9950 0.9968 0.9951 0.9970
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2841 0.1894 0.1717 0.1599 0.1813
## Detection Prevalence 0.2857 0.1917 0.1766 0.1642 0.1819
## Balanced Accuracy 0.9981 0.9881 0.9894 0.9850 0.9930
### Random Forest Model:
# Re-seed so the cross-validation resamples and the forest are repeatable.
set.seed(2222)
# Same resampling scheme as the boosted model: 5-fold CV repeated twice.
ctrl_RF <- trainControl(method = "repeatedcv", number = 5, repeats = 2)
# `verbose = FALSE` is forwarded unchanged to randomForest(), as the recorded
# call below shows.
fit_RF <- train(classe ~ ., data = train_set, method = "rf",
                trControl = ctrl_RF, verbose = FALSE)
fit_RF$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry, verbose = FALSE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 27
##
## OOB estimate of error rate: 0.24%
## Confusion matrix:
## A B C D E class.error
## A 4183 1 0 0 1 0.0004778973
## B 8 2836 3 1 0 0.0042134831
## C 0 6 2561 0 0 0.0023373588
## D 0 0 7 2404 1 0.0033167496
## E 0 1 0 7 2698 0.0029563932

# Evaluate the forest on the held-out split, exactly as done for the decision
# tree and the boosted model, so that all three models are compared on the same
# data before one of them is used for the quiz predictions.
predict_RF <- predict(fit_RF, newdata = test_set)
conf_matrix_RF <- confusionMatrix(predict_RF, factor(test_set$classe))
conf_matrix_RF

# Apply the random forest to the quiz cases.
predict_quiz <- as.data.frame(predict(fit_RF, newdata = data_quiz))
predict_quiz
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.