Practical Machine Learning Final Project

Data

The training data for this project are available here:

https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv

The test data are available here:

https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv

The data for this project come from this source: http://web.archive.org/web/20161224072740/http://groupware.les.inf.puc-rio.br/har. If you use the document you create for this class for any purpose, please cite the authors, as they have been very generous in allowing their data to be used for this kind of assignment.

library(caTools)

## Warning: package 'caTools' was built under R version 4.0.5

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.0.5

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

library(caret)

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.0.5

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the raw training data.
# NOTE(review): the hard-coded setwd() ties this script to one machine; it is
# kept because later steps read and write files relative to this directory.
setwd("C:\\Users\\localadmin\\Desktop\\ML")
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
dataset <- read.csv("pml-training.csv", header = TRUE)

# Report the number of rows and columns of the raw data.
dim(dataset)

## [1] 19622   160

# Seed the random number generator so that random steps are repeatable.
set.seed(1234)
# Discard the first seven columns and report the remaining dimensions.
dataset <- dataset[, -seq_len(7)]
dim(dataset)

## [1] 19622   153

#dataset <- dataset %>% select_if(~ sum(is.na(.)) <= 2)
# Keep only the columns in which fewer than half of the values are NA.
na_per_column <- colSums(is.na(dataset))
dataset <- dataset[, na_per_column < nrow(dataset) * 0.5]
dim(dataset)

## [1] 19622    86

# Keep only the rows that contain no NA in any column.
rows_without_na <- complete.cases(dataset)
dataset <- dataset[rows_without_na, ]
dim(dataset)

## [1] 19622    86

# Compute caret's per-column variance metrics; they are used to eliminate
# zero-variance and near-zero-variance columns.
x <- nearZeroVar(dataset, saveMetrics = TRUE)
str(x, vec.len = 2)

## 'data.frame':    86 obs. of  4 variables:
##  $ freqRatio    : num  1.1 1.04 ...
##  $ percentUnique: num  6.78 9.38 ...
##  $ zeroVar      : logi  FALSE FALSE FALSE ...
##  $ nzv          : logi  FALSE FALSE FALSE ...

# Keep only the columns that are not flagged as zero-variance.
dataset <- dataset[!x[["zeroVar"]]]
dim(dataset)

## [1] 19622    86

# Recompute the variance metrics on the current set of columns.
x <- nearZeroVar(dataset, saveMetrics = TRUE)
str(x, vec.len = 2)

## 'data.frame':    86 obs. of  4 variables:
##  $ freqRatio    : num  1.1 1.04 ...
##  $ percentUnique: num  6.78 9.38 ...
##  $ zeroVar      : logi  FALSE FALSE FALSE ...
##  $ nzv          : logi  FALSE FALSE FALSE ...

# Keep only the columns that are not flagged as near-zero-variance.
dataset <- dataset[!x[["nzv"]]]
dim(dataset)

## [1] 19622    53

# At this point we have 53 variables.

# Save the cleaned data to disk.
write.csv(dataset, file = "data.csv")

# Reload the cleaned data from that file.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
dataset <- read.csv("data.csv", header = TRUE)
# Count the NAs in the reloaded data to double-check that there are none.
sum(is.na(dataset))

## [1] 0

# Drop the first column of the reloaded data.
# NOTE(review): presumably the row-name column that write.csv() adds by
# default -- confirm.
dataset <- dataset[, -1L]
# classe is read in as character, but it is a categorical variable.
dataset[["classe"]] <- factor(dataset[["classe"]])
# We have 52 predictors. For classification problems the number of variables
# tried at each split should be close to sqrt(52), so we are going to use 7.


# Divide the data into a training set and a testing set (ratio 7/10),
# splitting on the classe labels.
split <- sample.split(dataset$classe, SplitRatio = 7 / 10)
training_set <- subset(dataset, split)
testing_set <- subset(dataset, !split)

# Fit a random forest through caret with a fixed mtry of 7 and
# trainControl(method = "none").
rf_control <- trainControl(method = "none")
rf_grid <- expand.grid(.mtry = 7)
rf_mod <- train(
  classe ~ .,
  data = training_set,
  method = "rf",
  metric = "Accuracy",
  trControl = rf_control,
  tuneGrid = rf_grid
)

# Measure the accuracy of the model on the testing set.
rf_pred <- predict(rf_mod, newdata = testing_set)
confusionMatrix(data = rf_pred, reference = testing_set$classe)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1673    2    0    0    0
##          B    0 1134    7    0    0
##          C    0    3 1019   14    0
##          D    0    0    1  950    2
##          E    1    0    0    1 1080
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9947          
##                  95% CI : (0.9925, 0.9964)
##     No Information Rate : 0.2844          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9933          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9994   0.9956   0.9922   0.9845   0.9982
## Specificity            0.9995   0.9985   0.9965   0.9994   0.9996
## Pos Pred Value         0.9988   0.9939   0.9836   0.9969   0.9982
## Neg Pred Value         0.9998   0.9989   0.9984   0.9970   0.9996
## Prevalence             0.2844   0.1935   0.1745   0.1639   0.1838
## Detection Rate         0.2842   0.1926   0.1731   0.1614   0.1835
## Detection Prevalence   0.2845   0.1938   0.1760   0.1619   0.1838
## Balanced Accuracy      0.9995   0.9971   0.9944   0.9919   0.9989

# Finally, predict the results for the quiz from the provided test file.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
dataset <- read.csv("pml-testing.csv", header = TRUE)
pred <- predict(rf_mod, newdata = dataset)
print(pred)

##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E

Practical Machine Learning Final Project

Illya Bjazevic

6/6/2021

Data