The data for this project come from this source: http://groupware.les.inf.puc-rio.br/har
Importing the data from the given URLs
library('caret')
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
library('rpart')
## Warning: package 'rpart' was built under R version 3.3.3
library('randomForest')
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
trngurl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
testurl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
trng <- read.csv(trngurl, na.strings =c("NA","#DIV/0!", ""))
test <- read.csv(testurl, na.strings =c("NA","#DIV/0!", ""))
Data cleaning and partitioning
# removing columns that contain NAs, since our dataset is very high-dimensional
trngnonzero <- trng[,colSums(is.na(trng)) == 0]
testnonzero <- test[,colSums(is.na(test)) == 0]
# removing irrelevant columns such as timestamps, dates, and serial numbers
trngrelcols <- trngnonzero[,-c(1:7)]
testrelcols <- testnonzero[,-c(1:7)]
# data partitioning - the test set created here from the training set serves as a validation set
sample <- createDataPartition(y= trngrelcols$classe, p = 0.70, list = F)
trngset <- trngrelcols[sample,]
testset <- trngrelcols[-sample,]
dim(trngset)
## [1] 13737 53
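Note that createDataPartition samples rows at random, so the hold-out results reported further down can vary slightly between runs. A minimal sketch of making the split reproducible (the seed value is arbitrary):
# sketch: calling set.seed() before createDataPartition makes the 70/30 split reproducible
set.seed(1234)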
Training and predictions
Recursive Partitioning
# recursive partition
model1 <- rpart(classe ~ ., data=trngset, method="class")
prediction1 <- predict(model1, testset, type = "class")
confusionMatrix(testset$classe, prediction1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1484 57 37 63 33
## B 199 644 153 86 57
## C 22 78 819 75 32
## D 39 92 165 598 70
## E 38 91 99 62 792
##
## Overall Statistics
##
## Accuracy : 0.737
## 95% CI : (0.7255, 0.7482)
## No Information Rate : 0.3028
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6667
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8328 0.6694 0.6434 0.6765 0.8049
## Specificity 0.9537 0.8995 0.9551 0.9268 0.9408
## Pos Pred Value 0.8865 0.5654 0.7982 0.6203 0.7320
## Neg Pred Value 0.9292 0.9330 0.9066 0.9419 0.9600
## Prevalence 0.3028 0.1635 0.2163 0.1502 0.1672
## Detection Rate 0.2522 0.1094 0.1392 0.1016 0.1346
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.8932 0.7844 0.7992 0.8016 0.8729
Here, we can see that our rpart model has achieved an accuracy of about 73.7%. Now let us try to improve on this using other models and techniques.
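One option (sketched below, not evaluated here) is to let caret tune the tree's complexity parameter with cross-validation before moving to an ensemble; a tuned single tree typically still lags well behind the random forest fitted next.
# sketch: cross-validated tuning of rpart's complexity parameter cp via caret::train
ctrl <- trainControl(method = "cv", number = 5)
rpartcv <- train(classe ~ ., data = trngset, method = "rpart",
                 trControl = ctrl, tuneLength = 10)
confusionMatrix(testset$classe, predict(rpartcv, testset))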
Random Forest
# random forest (note: randomForest() has no method argument, so method="class" is ignored; classification is inferred from the factor response)
model2 <- randomForest(classe ~ ., data=trngset, method="class")
prediction2 <- predict(model2, testset, type = "class")
confusionMatrix(testset$classe, prediction2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1672 2 0 0 0
## B 4 1131 4 0 0
## C 0 8 1012 6 0
## D 0 0 20 943 1
## E 0 0 3 1 1078
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.989, 0.9938)
## No Information Rate : 0.2848
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9895
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9976 0.9912 0.9740 0.9926 0.9991
## Specificity 0.9995 0.9983 0.9971 0.9957 0.9992
## Pos Pred Value 0.9988 0.9930 0.9864 0.9782 0.9963
## Neg Pred Value 0.9991 0.9979 0.9944 0.9986 0.9998
## Prevalence 0.2848 0.1939 0.1766 0.1614 0.1833
## Detection Rate 0.2841 0.1922 0.1720 0.1602 0.1832
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9986 0.9948 0.9856 0.9942 0.9991
Here, we can see that our random forest has done very well, with an accuracy of 99.17%. Since our dataset has a large number of columns, we can try to improve on this (or at least simplify the model) by keeping only the most relevant columns.
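Before dropping columns, one way to gauge which predictors matter most is the forest's own variable importance; a minimal sketch using the model2 fit from above:
# sketch: rank predictors by MeanDecreaseGini and plot the top ten
imp <- importance(model2)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)
varImpPlot(model2, n.var = 10)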
Random Forest with reduced dimensionality
# reducing the dimensionality of the dataset by removing columns that contribute little
#saving our target variables for later use
ycol <- trngset$classe
ycoltest <- testset$classe
nzv <- nearZeroVar(trngset, saveMetrics = TRUE)
#print(paste('Range:',range(nzv$percentUnique)))
head(nzv)
## freqRatio percentUnique zeroVar nzv
## roll_belt 1.080696 7.9347747 FALSE FALSE
## pitch_belt 1.089552 12.2297445 FALSE FALSE
## yaw_belt 1.027933 12.9941035 FALSE FALSE
## total_accel_belt 1.081834 0.1965495 FALSE FALSE
## gyros_belt_x 1.001013 0.9754677 FALSE FALSE
## gyros_belt_y 1.133935 0.4731746 FALSE FALSE
#sort by percentUnique
sort(nzv$percentUnique, decreasing = TRUE)
## [1] 86.51816263 85.92123462 84.31972046 20.41930553 19.34920288
## [6] 19.07985732 17.45650433 13.76574216 13.27072869 12.99410352
## [11] 12.93586664 12.22974449 11.76384946 10.56271384 9.60908495
## [16] 9.13591032 7.93477470 7.80374172 7.09033996 6.25318483
## [21] 5.96928005 5.67081604 5.63441800 5.53250346 5.22675985
## [26] 4.82638131 4.57887457 4.04018345 3.82907476 3.33406129
## [31] 3.22486715 2.99919924 2.89728471 2.67889641 2.26395865
## [36] 2.10380724 2.10380724 2.09652763 2.05284997 1.93637621
## [41] 1.71798792 1.68886948 1.43408313 1.15745796 1.14289874
## [46] 0.99730654 0.97546771 0.48045425 0.47317464 0.47317464
## [51] 0.30574361 0.19654946 0.03639805
Selecting columns with percentUnique greater than 5: since the number of columns is high, it is possible that not all of them are important and relevant. The "percent of unique values" (percentUnique) is the number of unique values divided by the total number of samples (times 100); columns that take very few distinct values therefore have a percentUnique close to zero.
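For a single column, percentUnique can be reproduced directly; a quick sketch using the roll_belt column of the training partition:
# sketch: percentUnique = number of distinct values / number of rows * 100
length(unique(trngset$roll_belt)) / nrow(trngset) * 100   # should match the roll_belt row of the nzv table (about 7.93)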
# selecting columns with percentUnique greater than 5
pcacols <- trngset[c(rownames(nzv[nzv$percentUnique > 5,]))]
pcacolstest <- testset[c(rownames(nzv[nzv$percentUnique > 5,]))]
#adding the target variable back to the dataset
pcacols$classe <- ycol
pcacolstest$classe <- ycoltest
dim(pcacols)
## [1] 13737 26
dim(pcacolstest)
## [1] 5885 26
Predicting after reducing the dimensionality of the dataset
pcamodel <- randomForest(classe ~ ., data=pcacols, method="class")
prediction3 <- predict(pcamodel, pcacolstest, type = "class")
confusionMatrix(pcacolstest$classe, prediction3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 0 0 0 0
## B 5 1129 5 0 0
## C 0 7 1010 9 0
## D 0 1 19 943 1
## E 0 2 2 2 1076
##
## Overall Statistics
##
## Accuracy : 0.991
## 95% CI : (0.9882, 0.9932)
## No Information Rate : 0.2853
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9886
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9970 0.9912 0.9749 0.9885 0.9991
## Specificity 1.0000 0.9979 0.9967 0.9957 0.9988
## Pos Pred Value 1.0000 0.9912 0.9844 0.9782 0.9945
## Neg Pred Value 0.9988 0.9979 0.9946 0.9978 0.9998
## Prevalence 0.2853 0.1935 0.1760 0.1621 0.1830
## Detection Rate 0.2845 0.1918 0.1716 0.1602 0.1828
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9985 0.9946 0.9858 0.9921 0.9989
We can see that the accuracy is essentially unchanged (0.9910 versus 0.9917) even though we have used far fewer columns (26 compared to the 53 used by the random forest in model2; both counts include the classe column). There is scope for reducing the columns further and checking how the accuracy varies.
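A rough sketch of how that could be checked, sweeping a few illustrative percentUnique thresholds (the threshold values below are arbitrary choices for this sketch):
# sketch: refit the forest for several thresholds and compare hold-out accuracy
for (th in c(2, 5, 10, 15)) {
  cols <- rownames(nzv[nzv$percentUnique > th, ])
  fit <- randomForest(x = trngset[, cols], y = ycol)
  acc <- mean(predict(fit, testset[, cols]) == ycoltest)
  print(paste("threshold:", th, "columns:", length(cols), "accuracy:", round(acc, 4)))
}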
Final prediction on the test data downloaded from the URL provided. I have kept only the columns selected by the percentUnique threshold above; this threshold could be tuned further.
testds <- testrelcols[c(rownames(nzv[nzv$percentUnique > 5,]))]
finalpred <- predict(pcamodel, testds, type = "class")
finalpred
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
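If these answers need to be written out as individual files (as in the original course submission), a small helper like the sketch below would do; the file-naming scheme is an assumption, not something specified in this report.
# sketch: write each predicted class to its own text file (file names are illustrative)
pml_write_files <- function(x) {
  for (i in seq_along(x)) {
    write.table(x[i], file = paste0("problem_id_", i, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(finalpred)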