The training data for this project are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
The test data are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
The data for this project come from this source: http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har. If you use the document you create for this class for any purpose, please cite them, as they have been very generous in allowing their data to be used for this kind of assignment.
library(caTools)
## Warning: package 'caTools' was built under R version 4.0.5
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(caret)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.5
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Switch to the author's data folder when it exists on this machine;
# otherwise stay in the current working directory so the script can still be
# run from whichever folder holds the CSV files.
data_dir <- "C:\\Users\\localadmin\\Desktop\\ML"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# Load the raw training data; the first line of the file holds column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dataset <- read.csv("pml-training.csv", header = TRUE)
dim(dataset)
## [1] 19622 160
# Fix the RNG seed before any random step further down the script.
set.seed(1234)
# Discard the first seven columns
# (presumably bookkeeping fields rather than sensor readings; confirm).
dataset <- dataset[, -seq_len(7)]
dim(dataset)
## [1] 19622 153
# dataset <- dataset %>% select_if(~ sum(is.na(.)) <= 2)
# Keep only the columns in which fewer than half of the values are NA.
max_na <- nrow(dataset) * 0.5
na_counts <- colSums(is.na(dataset))
dataset <- dataset[, na_counts < max_na]
dim(dataset)
## [1] 19622 86
# Discard any row that still contains an NA.
has_all_values <- complete.cases(dataset)
dataset <- dataset[has_all_values, ]
dim(dataset)
## [1] 19622 86
# Remove zero-variance and near-zero-variance columns.
# First pass: compute caret's per-column metrics and drop the columns
# flagged as zeroVar.
x <- nearZeroVar(dataset, saveMetrics = TRUE)
str(x, vec.len = 2)
## 'data.frame': 86 obs. of 4 variables:
## $ freqRatio : num 1.1 1.04 ...
## $ percentUnique: num 6.78 9.38 ...
## $ zeroVar : logi FALSE FALSE FALSE ...
## $ nzv : logi FALSE FALSE FALSE ...
dataset <- dataset[, !x[["zeroVar"]], drop = FALSE]
dim(dataset)
## [1] 19622 86
# Second pass: recompute the metrics on what is left and drop the columns
# flagged as nzv.
x <- nearZeroVar(dataset, saveMetrics = TRUE)
str(x, vec.len = 2)
## 'data.frame': 86 obs. of 4 variables:
## $ freqRatio : num 1.1 1.04 ...
## $ percentUnique: num 6.78 9.38 ...
## $ zeroVar : logi FALSE FALSE FALSE ...
## $ nzv : logi FALSE FALSE FALSE ...
dataset <- dataset[, !x[["nzv"]], drop = FALSE]
dim(dataset)
## [1] 19622 53
# At this point we have 53 variables.
# Round-trip the cleaned data through data.csv. row.names = FALSE keeps the
# row names out of the file, so no extra index column comes back on reload
# and nothing has to be dropped by position afterwards.
write.csv(dataset, file = "data.csv", row.names = FALSE)
dataset <- read.csv("data.csv", header = TRUE)
# Double-check that the reloaded dataset contains no NAs.
sum(is.na(dataset))
## [1] 0
# classe is read back as chr, but it is a categorical variable.
dataset$classe <- factor(dataset$classe)
# There are 52 predictors. For classification problems mtry should be close
# to sqrt(52), so we are going to use 7.
# Split the data on classe: rows flagged TRUE (ratio 7/10) form the
# training set, the remaining rows form the testing set.
split <- sample.split(dataset$classe, SplitRatio = 7 / 10)
training_set <- dataset[split, ]
testing_set <- dataset[!split, ]
# Fit one random forest on the training set: resampling is switched off
# (method = "none") and the tuning grid holds the single value mtry = 7.
rf_control <- trainControl(method = "none")
rf_grid <- expand.grid(.mtry = 7)
rf_mod <- train(
  classe ~ .,
  data = training_set,
  method = "rf",
  metric = "Accuracy",
  trControl = rf_control,
  tuneGrid = rf_grid
)
# Measure the accuracy of the model on the held-out testing set.
rf_pred <- predict(rf_mod, newdata = testing_set)
confusionMatrix(data = rf_pred, reference = testing_set$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1673 2 0 0 0
## B 0 1134 7 0 0
## C 0 3 1019 14 0
## D 0 0 1 950 2
## E 1 0 0 1 1080
##
## Overall Statistics
##
## Accuracy : 0.9947
## 95% CI : (0.9925, 0.9964)
## No Information Rate : 0.2844
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9933
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9994 0.9956 0.9922 0.9845 0.9982
## Specificity 0.9995 0.9985 0.9965 0.9994 0.9996
## Pos Pred Value 0.9988 0.9939 0.9836 0.9969 0.9982
## Neg Pred Value 0.9998 0.9989 0.9984 0.9970 0.9996
## Prevalence 0.2844 0.1935 0.1745 0.1639 0.1838
## Detection Rate 0.2842 0.1926 0.1731 0.1614 0.1835
## Detection Prevalence 0.2845 0.1938 0.1760 0.1619 0.1838
## Balanced Accuracy 0.9995 0.9971 0.9944 0.9919 0.9989
# Finally, predict the results for the quiz.
# Note: this reuses the name `dataset`, replacing the cleaned training data
# with the raw quiz data.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dataset <- read.csv("pml-testing.csv", header = TRUE)
pred <- predict(rf_mod, newdata = dataset)
print(pred)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E