In this project, we use data from accelerometers on the belt, forearm, arm, and dumbbell of six participants to predict the manner in which they performed the exercise.
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Source locations of the labelled training data and of the unlabelled test cases.
training.url <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test.cases.url <- "http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# Download a CSV file from `url` and read it into a data frame.
#
# url:       address of the CSV file to fetch.
# nastrings: character vector of strings that read.csv() should treat as NA
#            (defaults to "NA", which is read.csv()'s own default).
#
# Returns the data frame produced by read.csv(). Stops with an error when
# download.file() reports a non-zero status.
downloadcsv <- function(url, nastrings = "NA") {
  temp <- tempfile()
  # Registered before any I/O so the temporary file is removed even when the
  # download or the parse fails part-way through.
  on.exit(unlink(temp), add = TRUE)
  status <- download.file(url, temp, method = "curl")
  if (status != 0) {
    stop("download failed (exit status ", status, "): ", url, call. = FALSE)
  }
  read.csv(temp, na.strings = nastrings)
}
# Fetch both data sets; empty cells, "NA" and "#DIV/0!" are all read as missing.
train <- downloadcsv(
  url = training.url,
  nastrings = c("", "NA", "#DIV/0!")
)
test <- downloadcsv(
  url = test.cases.url,
  nastrings = c("", "NA", "#DIV/0!")
)
# Number of rows and columns in the downloaded training data.
dim(train)
## [1] 19622 160
# Number of observations for each value of the outcome column `classe`.
table(train$classe)
##
## A B C D E
## 5580 3797 3422 3216 3607
## Validation and Selection
# Hold-out split: 80% of the rows are used for fitting, the rest for
# validation. The fixed seed makes the partition reproducible.
set.seed(12345)
trainset <- createDataPartition(train$classe, p = 0.8, list = FALSE)
Training <- train[trainset, ]
Validation <- train[-trainset, ]

# Drop near-zero-variance predictors. nearZeroVar() returns column positions;
# an empty result has to be skipped, because Training[, -integer(0)] would
# select zero columns instead of keeping all of them.
nonzerocol <- nearZeroVar(Training)
if (length(nonzerocol) > 0) {
  Training <- Training[, -nonzerocol, drop = FALSE]
}

# Count the usable (neither NA nor empty-string) values in every column.
# vapply() guarantees exactly one integer per column.
countlength <- vapply(
  Training,
  function(x) sum(!(is.na(x) | x == "")),
  integer(1)
)

# Columns that are filled in for fewer than 60% of the rows.
nullCol <- names(countlength[countlength < 0.6 * nrow(Training)])

# Row identifiers, user names, timestamps and window bookkeeping columns,
# excluded from the predictors.
descriptcol <- c("X", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2",
                 "cvtd_timestamp", "new_window", "num_window")
excludecolumns <- c(descriptcol, nullCol)
Training <- Training[, !names(Training) %in% excludecolumns, drop = FALSE]

# `ntree` (not `ntrees`) is the argument randomForest() recognises; a
# misspelled name is swallowed by `...` and the default of 500 trees is used
# silently. The forest size is therefore stated explicitly here.
rfModel <- randomForest(
  as.factor(classe) ~ .,
  data = Training,
  importance = TRUE,
  ntree = 500
)
# Predict on the Training rows and tabulate predictions against `classe`.
ptraining <- predict(rfModel, newdata = Training)
# Shared label set so both factors are built with identical levels.
u1 <- union(ptraining, Training$classe)
t1 <- table(
  factor(ptraining, levels = u1),
  factor(Training$classe, levels = u1)
)
print(confusionMatrix(t1))
## Confusion Matrix and Statistics
##
##
## A B C D E
## A 4464 0 0 0 0
## B 0 3038 0 0 0
## C 0 0 2738 0 0
## D 0 0 0 2573 0
## E 0 0 0 0 2886
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9998, 1)
## No Information Rate : 0.2843
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 1.0000 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000
## Prevalence 0.2843 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2843 0.1935 0.1744 0.1639 0.1838
## Detection Prevalence 0.2843 0.1935 0.1744 0.1639 0.1838
## Balanced Accuracy 1.0000 1.0000 1.0000 1.0000 1.0000
# Predict on the Validation rows and tabulate predictions against `classe`.
pvalidation <- predict(rfModel, newdata = Validation)
# Shared label set so both factors are built with identical levels.
u2 <- union(pvalidation, Validation$classe)
t2 <- table(
  factor(pvalidation, levels = u2),
  factor(Validation$classe, levels = u2)
)
print(confusionMatrix(t2))
## Confusion Matrix and Statistics
##
##
## A B C D E
## A 1116 2 0 0 0
## B 0 757 2 0 0
## C 0 0 682 7 0
## D 0 0 0 635 0
## E 0 0 0 1 721
##
## Overall Statistics
##
## Accuracy : 0.9969
## 95% CI : (0.9947, 0.9984)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9961
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 0.9974 0.9971 0.9876 1.0000
## Specificity 0.9993 0.9994 0.9978 1.0000 0.9997
## Pos Pred Value 0.9982 0.9974 0.9898 1.0000 0.9986
## Neg Pred Value 1.0000 0.9994 0.9994 0.9976 1.0000
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2845 0.1930 0.1738 0.1619 0.1838
## Detection Prevalence 0.2850 0.1935 0.1756 0.1619 0.1840
## Balanced Accuracy 0.9996 0.9984 0.9975 0.9938 0.9998
# Predict the class of every case in the downloaded test set and show the result.
ptest <- predict(rfModel, newdata = test)
print(ptest)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
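Based on the results above, the random forest model, evaluated by hold-out cross-validation on the 20% of the training data that was set aside, achieves an accuracy of approximately 99.7% (95% CI: 0.9947 to 0.9984), i.e. an estimated out-of-sample error of about 0.3%.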