We will use the caret machine learning library. The doMC library provides the parallel backend so that work can be spread across several cores.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Register doMC as the parallel backend, using 4 cores.
registerDoMC(cores = 4)
The data contains placeholder strings such as NA and #DIV/0!, as well as empty fields. We will read the datasets and treat all of these values as NA.
# Load the labelled training data and the data set to be predicted.
# The strings "NA", "#DIV/0!" and empty fields are all read as NA.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# text columns to factors by default in R 4.0.0, and the outcome column
# `classe` must be a factor for the confusionMatrix() calls later in the script.
na_tokens <- c("NA", "#DIV/0!", "")
training <- read.csv("pml-training.csv", na.strings = na_tokens,
                     stringsAsFactors = TRUE)
predictDataset <- read.csv("pml-testing.csv", na.strings = na_tokens,
                           stringsAsFactors = TRUE)
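We will remove the first five columns. These are the row index, the user name and the three timestamp columns, which identify a record rather than describe the measured movement.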
The next step is to fill in the missing values: in every integer or numeric column, each NA is replaced by the mean of that column's observed values.
# Impute missing values: in every integer or numeric column, replace each NA
# with the mean of that column's observed (non-NA) values.
# is.numeric() is TRUE for both integer and double columns and always returns a
# single TRUE/FALSE, so it is safe as an `if` condition (unlike
# `class(x) %in% ...`, whose length follows the length of the class vector).
for (colName in names(training)) {
  column <- training[[colName]]
  if (is.numeric(column)) {
    # The mean is a double, so integer columns become double after this step.
    column[is.na(column)] <- mean(column, na.rm = TRUE)
    training[[colName]] <- column
  }
}
Now, let’s remove the columns where variance is near zero.
# Drop the columns that caret's nearZeroVar() flags as near-zero variance.
# nearZeroVar() returns the positions of the flagged columns. When nothing is
# flagged the result is empty, and indexing with -integer(0) would select zero
# columns instead of keeping them all, so only subset when there is something
# to remove. drop = FALSE keeps the result a data frame in every case.
nzv_cols <- nearZeroVar(training)
if (length(nzv_cols) > 0) {
  training <- training[, -nzv_cols, drop = FALSE]
}
# Split the labelled data: createDataPartition() picks 60% of the rows
# (p = 0.6) based on the outcome column; those rows are used to fit the model
# and the remaining rows are held out for evaluation.
# NOTE(review): no set.seed() call is visible before this point, so the split
# presumably differs from run to run - confirm.
outcome <- training[["classe"]]
index <- createDataPartition(y = outcome, p = 0.6, list = FALSE)
trainingData <- training[index, ]
testingData <- training[-index, ]
We will train the model with the Random Forest algorithm, using 3-fold cross-validation.
# Resampling scheme: cross-validation with 3 folds.
train_control <- trainControl(method = "cv", number = 3)

# Fit a random forest ("rf") predicting `classe` from all other columns of the
# training partition, resampled according to train_control.
model <- train(
  classe ~ .,
  data = trainingData,
  method = "rf",
  trControl = train_control
)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## Loading required namespace: e1071
First we will make predictions on the training partition and then on the held-out testing partition. Comparing the predictions with the true classes gives the in-sample and out-of-sample error rates.
# In-sample error: predict the rows the model was fitted on and report
# 1 - accuracy taken from the confusion matrix.
prediction <- predict(model, trainingData, type = "raw")
# Stored as in_sample_cm rather than `c`, so that base::c() is not masked for
# code that looks the name up as a value (e.g. do.call(c, ...)).
in_sample_cm <- confusionMatrix(prediction, trainingData$classe)
print("In Sample Error Rate")
## [1] "In Sample Error Rate"
print(1 - in_sample_cm[["overall"]][["Accuracy"]])
## [1] 0
# Out-of-sample error: predict the held-out rows and report 1 - accuracy taken
# from the confusion matrix.
prediction <- predict(model, testingData, type = "raw")
# Stored as out_of_sample_cm rather than `c`, so that base::c() is not masked
# for code that looks the name up as a value (e.g. do.call(c, ...)).
out_of_sample_cm <- confusionMatrix(prediction, testingData$classe)
print("Out of Sample Error Rate")
## [1] "Out of Sample Error Rate"
print(1 - out_of_sample_cm[["overall"]][["Accuracy"]])
## [1] 0.003058884
# Generate the submission files: predict the class of every row in
# predictDataset and write each prediction to its own text file, containing
# only the predicted label (no quotes, row names or header).
prediction <- predict(model, predictDataset, type = "raw")
# Iterate over however many predictions there are instead of assuming exactly
# 20, so no "NA" files are written and no predictions are silently skipped.
for (i in seq_along(prediction)) {
  # NOTE(review): sep = "_" produces names such as "problem_1_.txt"; confirm
  # whether the underscore before ".txt" is intended.
  fileName <- paste("problem", i, ".txt", sep = "_")
  write.table(prediction[i], file = fileName, quote = FALSE,
              row.names = FALSE, col.names = FALSE)
}