# Download the labelled training file and the unlabelled test file.
# Both empty strings and the literal text "NA" are read as missing values.
traindata <- read.csv(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
  na.strings = c("", "NA")
)
testdata <- read.csv(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
  na.strings = c("", "NA")
)
# head(traindata)
# head(testdata)
# names(traindata) == names(testdata)
Remove the columns in which more than 50% of the values are NA. Remove unnecessary variables.
# Drop the first seven columns from both data sets.
# NOTE(review): presumably these are row ids, user names and timestamps with
# no predictive value -- confirm against the data dictionary.
traindata <- traindata[, -(1:7)]
testdata <- testdata[, -(1:7)]
# Drop the last column of the test file as well.
# NOTE(review): presumably the row identifier that stands in for `classe`
# in the test file -- verify.
testdata <- testdata[, -ncol(testdata)]
# Number of remaining training columns that contain no missing values.
sum(colSums(is.na(traindata)) == 0)
## [1] 53
# Names of the training columns in which more than half of the values are
# missing. colMeans() over the logical NA mask yields the per-column NA
# fraction in one vectorised pass, so no vector has to be grown in a loop.
badcol <- names(traindata)[colMeans(is.na(traindata)) > 0.5]
# badcol
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Remove the mostly-missing columns (named in `badcol`) from both data sets.
# all_of() marks `badcol` explicitly as an external character vector of
# column names: the selection can no longer be confused with a column that
# happens to be called "badcol", and tidyselect's "external vector is
# ambiguous" note is not emitted. all_of() errors if a listed column is absent.
traindata <- select(traindata, -all_of(badcol))
testdata <- select(testdata, -all_of(badcol))
# Fix the RNG seed so the random split below is reproducible.
set.seed(2021)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Split the labelled data on `classe`: the rows indexed by `inTrain` (p = 0.7)
# are used to fit the models, the remaining rows are held out for evaluation.
inTrain <- createDataPartition(y = traindata$classe, p = 0.7, list = FALSE)
training <- traindata[inTrain, ]
testing <- traindata[-inTrain, ]
dim(training)
## [1] 13737 53
dim(testing)
## [1] 5885 53
## Loading required package: foreach
## Loading required package: iterators
# Resampling setup shared by all four models: 5-fold cross-validation, no
# per-iteration logging. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
ctrl <- trainControl(
  method = "cv", number = 5, verboseIter = FALSE, allowParallel = TRUE
)

# Each model below is fitted on `training` with every other column as a
# predictor of `classe`, then scored by its accuracy on the held-out `testing`
# rows.

# Random forest.
modrf <- train(classe ~ ., data = training, method = "rf", trControl = ctrl)
predrf <- predict(modrf, testing)
confusionMatrix(predrf, factor(testing$classe))$overall["Accuracy"]
## Accuracy
## 0.9943925

# Linear discriminant analysis.
modlda <- train(classe ~ ., data = training, method = "lda", trControl = ctrl)
predlda <- predict(modlda, testing)
confusionMatrix(predlda, factor(testing$classe))$overall["Accuracy"]
## Accuracy
## 0.7099405

# Single classification tree (rpart).
modrpart <- train(
  classe ~ ., data = training, method = "rpart", trControl = ctrl
)
predrpart <- predict(modrpart, testing)
confusionMatrix(predrpart, factor(testing$classe))$overall["Accuracy"]
## Accuracy
## 0.4992353

# Gradient boosting; verbose = FALSE is forwarded to the fitting routine to
# suppress its progress output.
modgbm <- train(
  classe ~ ., data = training, method = "gbm", trControl = ctrl,
  verbose = FALSE
)
predgbm <- predict(modgbm, testing)
confusionMatrix(predgbm, factor(testing$classe))$overall["Accuracy"]
## Accuracy
## 0.9614274
Comparing the accuracy of the four models, the random forest model predicts best.
# Apply the random forest model to the cleaned test file and show the
# predicted class for each of its rows.
finalpred <- predict(modrf, newdata = testdata)
print(finalpred)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E