Prediction Writeup

Synopsis

In this project, we use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants to predict the manner in which they performed the exercise.

Load libraries

library(caret)

## Warning: package 'caret' was built under R version 4.1.2

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.1.2

## Loading required package: lattice

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.1.2

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

Load the data and mark missing values as NA

# Locations of the labelled training data and of the 20 unlabelled test cases.
# HTTPS is used so that the downloaded files cannot be tampered with in transit.
training.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test.cases.url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# Download a CSV file and read it into a data frame.
#
# Args:
#   url:       Location of the CSV file.
#   nastrings: Character vector of strings to treat as missing; passed to
#              read.csv() as `na.strings`.
#   method:    Download method handed to download.file(). Defaults to "curl",
#              the method this function has always used.
#
# Returns:
#   The data frame produced by read.csv().
#
# Stops with an error when download.file() reports a non-zero status.
downloadcsv <- function(url, nastrings, method = "curl") {
    temp <- tempfile()
    # Remove the temporary file even if the download or the parse fails.
    on.exit(unlink(temp), add = TRUE)
    # External methods such as "curl" signal failure through a non-zero
    # return status rather than an R error, so check it explicitly.
    status <- download.file(url, temp, method = method)
    if (status != 0) {
        stop("download failed (status ", status, "): ", url, call. = FALSE)
    }
    read.csv(temp, na.strings = nastrings)
}
# Read both data sets; empty cells, literal "NA" and spreadsheet division
# errors ("#DIV/0!") are all treated as missing values.
train <- downloadcsv(
    url = training.url,
    nastrings = c("", "NA", "#DIV/0!")
)
test <- downloadcsv(
    url = test.cases.url,
    nastrings = c("", "NA", "#DIV/0!")
)
# Rows and columns of the labelled data.
dim(train)

## [1] 19622   160

# Number of observations per exercise class (the outcome to predict).
table(train[["classe"]])

## 
##    A    B    C    D    E 
## 5580 3797 3422 3216 3607

Validation and Selection

# Hold out 20% of the labelled data for validation. The seed makes the split
# (and everything random that follows) reproducible.
set.seed(12345)
trainset <- createDataPartition(train$classe, p = 0.8, list = FALSE)
Training <- train[trainset, ]
Validation <- train[-trainset, ]

# Drop near-zero-variance predictors. Despite its name, `nonzerocol` holds the
# indices of the columns that nearZeroVar() flags for removal.
nonzerocol <- nearZeroVar(Training)
# Negative indexing with an empty vector selects zero columns, so only subset
# when at least one column was flagged.
if (length(nonzerocol) > 0) {
    Training <- Training[, -nonzerocol, drop = FALSE]
}

# Number of usable (neither NA nor empty-string) values in each column.
countlength <- vapply(Training, function(x) {
    sum(!(is.na(x) | x == ""))
}, integer(1))
# Columns in which fewer than 60% of the rows carry a usable value.
nullCol <- names(countlength[countlength < 0.6 * nrow(Training)])
# Bookkeeping columns (row id, user, timestamps, window markers) that are
# excluded from the predictors.
descriptcol <- c("X", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2",
    "cvtd_timestamp", "new_window", "num_window")
excludecolumns <- c(descriptcol, nullCol)
Training <- Training[, !names(Training) %in% excludecolumns]

Creating models

# Fit a random forest that predicts `classe` from every remaining column.
# Note: the forest-size argument of randomForest() is `ntree`. A misspelling
# such as `ntrees` matches no formal argument, is swallowed by `...` and is
# silently ignored, leaving the default of 500 trees in effect. The value is
# stated explicitly here; 500 is the setting the reported results were
# produced with.
rfModel <- randomForest(as.factor(classe) ~ ., data = Training,
    importance = TRUE, ntree = 500)
# Predictions on the data the model was fitted to (resubstitution); these are
# optimistic and only serve as a sanity check.
ptraining <- predict(rfModel, Training)

Validate and test the model

# Confusion matrix on the training data (rows: predicted, columns: actual).
# Both vectors are re-coded onto one shared label set so the table is square.
u1 <- union(ptraining, Training$classe)
t1 <- table(
    factor(ptraining, levels = u1),
    factor(Training$classe, levels = u1),
    dnn = c("", "")
)
print(confusionMatrix(t1))

## Confusion Matrix and Statistics
## 
##    
##        A    B    C    D    E
##   A 4464    0    0    0    0
##   B    0 3038    0    0    0
##   C    0    0 2738    0    0
##   D    0    0    0 2573    0
##   E    0    0    0    0 2886
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9998, 1)
##     No Information Rate : 0.2843     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2843   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2843   0.1935   0.1744   0.1639   0.1838
## Detection Prevalence   0.2843   0.1935   0.1744   0.1639   0.1838
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000

# Out-of-sample check: predict the held-out validation rows and tabulate
# predicted (rows) against actual (columns) classes on a shared label set.
pvalidation <- predict(rfModel, newdata = Validation)
u2 <- union(pvalidation, Validation$classe)
t2 <- table(
    factor(pvalidation, levels = u2),
    factor(Validation$classe, levels = u2),
    dnn = c("", "")
)
print(confusionMatrix(t2))

## Confusion Matrix and Statistics
## 
##    
##        A    B    C    D    E
##   A 1116    2    0    0    0
##   B    0  757    2    0    0
##   C    0    0  682    7    0
##   D    0    0    0  635    0
##   E    0    0    0    1  721
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9969          
##                  95% CI : (0.9947, 0.9984)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9961          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9974   0.9971   0.9876   1.0000
## Specificity            0.9993   0.9994   0.9978   1.0000   0.9997
## Pos Pred Value         0.9982   0.9974   0.9898   1.0000   0.9986
## Neg Pred Value         1.0000   0.9994   0.9994   0.9976   1.0000
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2845   0.1930   0.1738   0.1619   0.1838
## Detection Prevalence   0.2850   0.1935   0.1756   0.1619   0.1840
## Balanced Accuracy      0.9996   0.9984   0.9975   0.9938   0.9998

# Predict the class of each of the unlabelled test cases and show the result.
ptest <- predict(rfModel, newdata = test)
print(ptest)

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E

Conclusions

Based on the results above, the random forest model, evaluated on a 20% hold-out validation set (a single train/validation split rather than k-fold cross-validation), achieves an accuracy of approximately 99.7%, i.e. an estimated out-of-sample error of about 0.3%.