Using devices such as Jawbone Up, Nike FuelBand, and Fitbit it is now possible to collect a large amount of data about personal activity relatively inexpensively. These type of devices are part of the quantified self movement – a group of enthusiasts who take measurements about themselves regularly to improve their health, to find patterns in their behavior, or because they are tech geeks. One thing that people regularly do is quantify how much of a particular activity they do, but they rarely quantify how well they do it.
In this project, the goal will be to use data from accelerometers on the belt, forearm, arm, and dumbell of 6 participants. They were asked to perform barbell lifts correctly and incorrectly in 5 different ways. More information is available from the website here: http://groupware.les.inf.puc-rio.br/har (see the section on the Weight Lifting Exercise Dataset).
The training data for this project are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
The test data are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
The data for this project come from this source: http://groupware.les.inf.puc-rio.br/har.
The data can be downloaded using the below R script.
downloadFiles<-function(
dataURL="", destF="t.csv"
){
if(!file.exists(destF)){
download.file(dataURL, destF, method="curl")
}else{
message("data already downloaded.")
}
}
trainURL<-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
testURL <-"https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
downloadFiles(trainURL, "pml-training.csv")
## data already downloaded.
downloadFiles(testURL, "pml-test.csv")
## data already downloaded.
training <- read.csv("pml-training.csv",na.strings=c("NA",""))
testing <-read.csv("pml-test.csv",na.strings=c("NA",""))
First look of training data
dim(training)
## [1] 19622 160
#str(training)
The outcome is “classe” variable
table(training$classe)
##
## A B C D E
## 5580 3797 3422 3216 3607
The train dataset has 160 variables 19622 observations.
var <- names(training)[apply(training,2,function(x) table(is.na(x))[1]==19622)]
train2<- training[,var]
test2 <- testing[,c(var[-length(var)],names(testing)[length(testing)])] # test dataset no classe variable
Only considering numeric variable from HAR sensor
removeIndex <- grep("timestamp|X|user_name|new_window|num_window",names(train2))
train3 <- train2[,-c(removeIndex, length(train2))]
test3 <- test2[,-c(removeIndex, length(test2))]
removing zero covariates
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
nzv <- nearZeroVar(train3, saveMetrics=TRUE)
nzv
## freqRatio percentUnique zeroVar nzv
## roll_belt 1.102 6.7781 FALSE FALSE
## pitch_belt 1.036 9.3772 FALSE FALSE
## yaw_belt 1.058 9.9735 FALSE FALSE
## total_accel_belt 1.063 0.1478 FALSE FALSE
## gyros_belt_x 1.059 0.7135 FALSE FALSE
## gyros_belt_y 1.144 0.3516 FALSE FALSE
## gyros_belt_z 1.066 0.8613 FALSE FALSE
## accel_belt_x 1.055 0.8358 FALSE FALSE
## accel_belt_y 1.114 0.7288 FALSE FALSE
## accel_belt_z 1.079 1.5238 FALSE FALSE
## magnet_belt_x 1.090 1.6665 FALSE FALSE
## magnet_belt_y 1.100 1.5187 FALSE FALSE
## magnet_belt_z 1.006 2.3290 FALSE FALSE
## roll_arm 52.338 13.5256 FALSE FALSE
## pitch_arm 87.256 15.7323 FALSE FALSE
## yaw_arm 33.029 14.6570 FALSE FALSE
## total_accel_arm 1.025 0.3364 FALSE FALSE
## gyros_arm_x 1.016 3.2769 FALSE FALSE
## gyros_arm_y 1.454 1.9162 FALSE FALSE
## gyros_arm_z 1.111 1.2639 FALSE FALSE
## accel_arm_x 1.017 3.9598 FALSE FALSE
## accel_arm_y 1.140 2.7367 FALSE FALSE
## accel_arm_z 1.128 4.0363 FALSE FALSE
## magnet_arm_x 1.000 6.8240 FALSE FALSE
## magnet_arm_y 1.057 4.4440 FALSE FALSE
## magnet_arm_z 1.036 6.4468 FALSE FALSE
## roll_dumbbell 1.022 84.2065 FALSE FALSE
## pitch_dumbbell 2.277 81.7450 FALSE FALSE
## yaw_dumbbell 1.132 83.4828 FALSE FALSE
## total_accel_dumbbell 1.073 0.2191 FALSE FALSE
## gyros_dumbbell_x 1.003 1.2282 FALSE FALSE
## gyros_dumbbell_y 1.265 1.4168 FALSE FALSE
## gyros_dumbbell_z 1.060 1.0498 FALSE FALSE
## accel_dumbbell_x 1.018 2.1659 FALSE FALSE
## accel_dumbbell_y 1.053 2.3749 FALSE FALSE
## accel_dumbbell_z 1.133 2.0895 FALSE FALSE
## magnet_dumbbell_x 1.098 5.7486 FALSE FALSE
## magnet_dumbbell_y 1.198 4.3013 FALSE FALSE
## magnet_dumbbell_z 1.021 3.4451 FALSE FALSE
## roll_forearm 11.589 11.0896 FALSE FALSE
## pitch_forearm 65.983 14.8558 FALSE FALSE
## yaw_forearm 15.323 10.1468 FALSE FALSE
## total_accel_forearm 1.129 0.3567 FALSE FALSE
## gyros_forearm_x 1.059 1.5187 FALSE FALSE
## gyros_forearm_y 1.037 3.7764 FALSE FALSE
## gyros_forearm_z 1.123 1.5646 FALSE FALSE
## accel_forearm_x 1.126 4.0465 FALSE FALSE
## accel_forearm_y 1.059 5.1116 FALSE FALSE
## accel_forearm_z 1.006 2.9559 FALSE FALSE
## magnet_forearm_x 1.012 7.7668 FALSE FALSE
## magnet_forearm_y 1.247 9.5403 FALSE FALSE
## magnet_forearm_z 1.000 8.5771 FALSE FALSE
nzv[nzv$nzv,]
## [1] freqRatio percentUnique zeroVar nzv
## <0 rows> (or 0-length row.names)
corrM <- cor(train3)
library(corrplot)
corrplot(corrM, method="circle",tl.cex=0.5)
highCorr <- findCorrelation(corrM, cutoff = .75) # high correlation
train4<-cbind(classe=train2$classe,train3[,-highCorr])
test4 <- test3[, -highCorr] # dataframe of test predictors
set.seed(1234)
inTrain = createDataPartition(train4$classe, p = 3/4)[[1]]
trainPart = train4[ inTrain,]
testPart = train4[-inTrain,]
library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
rfModel <- randomForest(classe ~ .,data = trainPart,importance = TRUE,ntrees = 500)
print(rfModel)
##
## Call:
## randomForest(formula = classe ~ ., data = trainPart, importance = TRUE, ntrees = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 0.63%
## Confusion matrix:
## A B C D E class.error
## A 4181 3 0 0 1 0.0009558
## B 12 2828 5 1 2 0.0070225
## C 0 13 2535 19 0 0.0124659
## D 0 0 24 2383 5 0.0120232
## E 0 0 2 6 2698 0.0029564
par(mar=c(3,4,4,4))
plot(rfModel)
varImpPlot(rfModel,cex=.5)
out.test<-predict(rfModel,testPart)
table(testData$classe, out.test)
## Error: object 'testData' not found
out.test<-predict(rfModel,test4)
out.test[1:20]
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
preProc <- preProcess(trainPart[,-1], method="pca", thresh=0.8)
trainPC <- predict(preProc, trainPart[,-1])
#modPC <- train(trainPart$classe~., method="rf", data=trainPC)
modPC <- randomForest(trainPart$classe~., data=trainPC, importance=TRUE, ntree=10)
testPC <- predict(preProc, testPart[,-1])
out.testPC<-predict(modPC, newdata=testPC)
table(out.testPC, testPart$classe)
##
## out.testPC A B C D E
## A 1343 36 13 8 5
## B 26 877 20 5 13
## C 13 23 785 45 15
## D 6 7 30 737 16
## E 7 6 7 9 852
testPC <- predict(preProc, test4)
out.testPC<-predict(modPC, newdata=testPC)
out.testPC[1:20] # the 1st and 3rd mis-classified
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## A A C A A E D B A A B C B A E E A B B B
## Levels: A B C D E
out.test[1:20]==out.testPC[1:20]
## [1] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
answers<- as.vector(out.test[1:20])
#answers = rep("A", 20)
pml_write_files = function(x){
n = length(x)
for(i in 1:n){
filename = paste0("problem_id_",i,".txt")
write.table(x[i],file=filename,quote=FALSE,row.names=FALSE,col.names=FALSE)
}
}
pml_write_files(answers)