# Restore the saved workspace image; the code that follows expects it to
# provide a data frame called `samsungData`.
load(file = "./samsungData.rda")
# names(samsungData)
The call below renames all columns with `make.names`, thereby removing duplicate column names
# Turn every column name into a syntactically valid, unique R name.
names(samsungData) <- make.names(colnames(samsungData), unique = TRUE)
Checking for completeness of data, below:
# samsungData[!complete.cases(samsungData),] # this will return all rows
# where incomplete information is present
Converting subject activity to factor
# Store the activity label as a factor so the classifiers treat it as a class.
samsungData[["activity"]] <- as.factor(samsungData[["activity"]])
Removing .s from column names and cleaning up column names
# Strip every literal "." from the column names (fixed = TRUE disables regex
# matching). Removing characters can make two previously distinct names
# collide, so uniqueness is re-applied afterwards; make.unique() leaves names
# that are already unique untouched, and the "_" separator keeps any
# disambiguated name free of dots.
names(samsungData) <- make.unique(
  gsub(".", "", names(samsungData), fixed = TRUE),
  sep = "_"
)
Creating test and train data sets
# Split by subject id: subjects below 23 go to the training set, all others to
# the test set. which() discards rows whose comparison is NA.
train <- samsungData[which(samsungData$subject < 23), ]
test <- samsungData[which(samsungData$subject >= 23), ]
Removing subject column from analysis
# Drop the subject identifier so it cannot be used as a predictor. The column
# is selected by name rather than by its position (562), so the right column is
# removed even if the column layout of the data changes.
train <- train[, names(train) != "subject", drop = FALSE]
test <- test[, names(test) != "subject", drop = FALSE]
Applying decision tree
library(tree)
# Fit a classification tree predicting activity from every other column of the
# training set.
dec1 <- tree(formula = activity ~ ., data = train)
# Draw the tree, label it, and add a heading.
plot(x = dec1)
text(x = dec1)
title(main = "Decision Tree")
# Report the variables used, the number of leaves and the training error.
summary(object = dec1)
##
## Classification tree:
## tree(formula = activity ~ ., data = train)
## Variables actually used in tree construction:
## [1] "fBodyAccJerkstdX" "tGravityAccminX"
## [3] "tGravityAccmaxY" "tGravityAccminY"
## [5] "fBodyAccMagenergy" "tGravityAccarCoeffY2"
## [7] "fBodyGyromaxIndsX" "tBodyGyrocorrelationYZ"
## Number of terminal nodes: 10
## Residual mean deviance: 0.527 = 2470 / 4680
## Misclassification error rate: 0.0942 = 442 / 4694
On training data, the decision tree gives a misclassification rate of 9.4%. Below, the variables at the main node and the second node are plotted against each other to see the trend in the data
# Scatter plot of two of the tree's variables; each point is coloured by the
# integer code of its activity level.
plot(
  x = train$fBodyAccJerkstdX,
  y = train$fBodyAccMagenergy,
  col = as.numeric(train$activity),
  pch = 19
)
# One legend entry per activity, in order of first appearance in the data.
legend(
  "topright",
  pch = 19,
  col = unique(train$activity),
  legend = unique(train$activity)
)
title(main = "Two Main Variables that Define Subject's activity in 2D plot")
#
# plot(train$tGravityAccminX,train$fBodyAccMagenergy,pch=19,col=as.numeric(train$activity))
# legend('topleft',legend=unique(train$activity),col=unique(train$activity),pch=19)
Testing the tree produced above on test data
# Predictions of the full tree on the test set: test1 uses predict()'s default
# output type, test2 asks for the predicted class label.
test1 <- predict(object = dec1, newdata = test)
test2 <- predict(object = dec1, newdata = test, type = "class")
summary(object = test1)
## laying sitting standing walk
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000 Median :0.000
## Mean :0.193 Mean :0.188 Mean :0.195 Mean :0.200
## 3rd Qu.:0.000 3rd Qu.:0.304 3rd Qu.:0.142 3rd Qu.:0.419
## Max. :1.000 Max. :0.858 Max. :0.997 Max. :0.916
## walkdown walkup
## Min. :0.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.000 Median :0.0000
## Mean :0.123 Mean :0.1015
## 3rd Qu.:0.146 3rd Qu.:0.0598
## Max. :0.934 Max. :0.9847
# Summarise the predicted class labels.
summary(object = test2)
## laying sitting standing walk walkdown walkup
## 514 522 496 390 267 469
Confusion matrix is below
# Rows: actual activity; columns: class predicted by the full tree.
table(test[["activity"]], test2)
## test2
## laying sitting standing walk walkdown walkup
## laying 514 0 0 0 0 0
## sitting 0 438 37 0 0 0
## standing 0 84 415 0 0 0
## walk 0 0 8 344 3 66
## walkdown 0 0 0 21 258 83
## walkup 0 0 36 25 6 320
Overall accuracy on test data is
# Share of test rows whose predicted class equals the actual activity.
sum(test2 == test$activity) / length(test2)
## [1] 0.8612
Crossvalidation tree:
# cv.tree() assigns observations to folds at random, so fix the RNG seed to
# make the cross-validation curves (and the tree size read off them)
# reproducible from run to run.
set.seed(1)
# Cross-validated deviance against tree size, using the default pruning
# function.
plot(cv.tree(dec1))
# The same curve scored by number of misclassifications instead of deviance.
plot(cv.tree(dec1, FUN = prune.tree, method = "misclass"))
From the above graphs it can be concluded that a tree of size 6 will give a good model for estimating.
So, pruning the tree to size 6
# Prune the full tree back to 6 terminal nodes.
pru6 <- prune.tree(tree = dec1, best = 6)
# Draw the pruned tree, label it, and add a heading.
plot(x = pru6)
text(x = pru6)
title(main = "Decision Tree with 6 Nodes")
summary(object = pru6)
##
## Classification tree:
## snip.tree(tree = dec1, nodes = c(10L, 7L, 12L, 13L))
## Variables actually used in tree construction:
## [1] "fBodyAccJerkstdX" "tGravityAccminX" "tGravityAccmaxY"
## [4] "fBodyAccMagenergy" "tGravityAccarCoeffY2"
## Number of terminal nodes: 6
## Residual mean deviance: 0.707 = 3310 / 4690
## Misclassification error rate: 0.108 = 508 / 4694
The above shows that reducing the number of terminal nodes from 10 to 6 increases the misclassification rate on training data by about 1.4 percentage points, from 9.42% to 10.82%. Using this pruned tree on test data to predict values..
# Predicted class labels for the test set from the 6-leaf tree.
test3 <- predict(object = pru6, newdata = test, type = "class")
Overall accuracy on test data is
# Share of test rows whose predicted class equals the actual activity.
sum(test3 == test$activity) / length(test3)
## [1] 0.8548
Confusion matrix with a tree of size 6
# test3
# Rows: actual activity; columns: class stored in test3.
table(test[["activity"]], test3)
## test3
## laying sitting standing walk walkdown walkup
## laying 514 0 0 0 0 0
## sitting 0 438 37 0 0 0
## standing 0 84 415 0 0 0
## walk 0 0 8 350 3 60
## walkdown 0 0 0 38 260 64
## walkup 0 0 36 31 25 295
Checking by pruning the tree to size 8
# Prune the full tree back to 8 terminal nodes.
pru8 <- prune.tree(tree = dec1, best = 8)
# Draw the pruned tree, label it, and add a heading.
plot(x = pru8)
text(x = pru8)
title(main = "Decision Tree with 8 Nodes")
summary(object = pru8)
##
## Classification tree:
## snip.tree(tree = dec1, nodes = c(10L, 7L))
## Variables actually used in tree construction:
## [1] "fBodyAccJerkstdX" "tGravityAccminX" "tGravityAccmaxY"
## [4] "fBodyAccMagenergy" "tGravityAccarCoeffY2" "fBodyGyromaxIndsX"
## [7] "tGravityAccminY"
## Number of terminal nodes: 8
## Residual mean deviance: 0.599 = 2810 / 4690
## Misclassification error rate: 0.101 = 475 / 4694
The above shows that reducing the number of terminal nodes from 10 to 8 increases the misclassification rate on training data by about 0.7 percentage points, from 9.42% to 10.12%. Using this pruned tree on test data to predict values..
# Predicted class labels for the test set from the 8-leaf tree; this assignment
# replaces whatever test3 held before.
test3 <- predict(object = pru8, newdata = test, type = "class")
Overall accuracy on test data is
# Share of test rows whose predicted class equals the actual activity.
sum(test3 == test$activity) / length(test3)
## [1] 0.8548
Confusion matrix with a tree of size 8
# test3
# Rows: actual activity; columns: class stored in test3.
table(test[["activity"]], test3)
## test3
## laying sitting standing walk walkdown walkup
## laying 514 0 0 0 0 0
## sitting 0 438 37 0 0 0
## standing 0 84 415 0 0 0
## walk 0 0 8 344 3 66
## walkdown 0 0 0 21 260 81
## walkup 0 0 36 25 25 301
No improvement on test data was found by increasing the tree size from 6 to 8 (the accuracy stays at 0.8548)
3D-Plot 3 important variables that will define subject's activity
library(scatterplot3d)
# 3D scatter of three tree variables; each point is coloured by the integer
# code of its activity level.
scatterplot3d(
  x = train$fBodyAccJerkstdX,
  y = train$fBodyAccMagenergy,
  z = train$tGravityAccminX,
  color = as.numeric(train$activity),
  pch = 16,
  mar = c(5, 3, 5, 7) + 0.1
)
# One legend entry per activity, in order of first appearance in the data.
legend(
  x = 4,
  y = 1,
  title = "Three Paramters That Define Subject's activity",
  legend = unique(train$activity),
  col = unique(train$activity),
  pch = 19,
  cex = 1,
  inset = 0.05,
  xjust = 1,
  x.intersp = 0.1
)
title(main = "Three Main Variables that Define Subject's activity in 3D plot")
Trying J48,
library(RWeka)
library(rJava)
library(party)
## Loading required package: grid
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following object(s) are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following object(s) are masked from 'package:rJava':
##
## clone
# Fit Weka's J48 classifier predicting activity from every other column of the
# training set, then plot it and print its training-set summary.
j1 <- J48(formula = activity ~ ., data = train)
plot(x = j1)
summary(object = j1)
##
## === Summary ===
##
## Correctly Classified Instances 4676 99.6165 %
## Incorrectly Classified Instances 18 0.3835 %
## Kappa statistic 0.9954
## Mean absolute error 0.0023
## Root mean squared error 0.0342
## Relative absolute error 0.8432 %
## Root relative squared error 9.1827 %
## Coverage of cases (0.95 level) 99.787 %
## Mean rel. region size (0.95 level) 16.9365 %
## Total Number of Instances 4694
##
## === Confusion Matrix ===
##
## a b c d e f <-- classified as
## 893 0 0 0 0 0 | a = laying
## 0 807 4 0 0 0 | b = sitting
## 0 3 872 0 0 0 | c = standing
## 0 0 0 801 2 2 | d = walk
## 0 0 0 2 622 0 | e = walkdown
## 0 0 0 2 3 681 | f = walkup
# Score the classifier fitted on `train` against the held-out `test` rows.
# Passing numFolds = 10 here makes Weka run a 10-fold cross-validation inside
# `test` instead (refitting J48 on folds of the test data), which does not
# measure how `j1` itself performs on unseen subjects.
# NOTE(review): the transcript that follows was recorded with numFolds = 10 and
# should be regenerated after this change.
evaluate_Weka_classifier(j1, newdata = test)
## === 10 Fold Cross Validation ===
##
## === Summary ===
##
## Correctly Classified Instances 2598 97.7427 %
## Incorrectly Classified Instances 60 2.2573 %
## Kappa statistic 0.9728
## Mean absolute error 0.0082
## Root mean squared error 0.0853
## Relative absolute error 2.9708 %
## Root relative squared error 22.9362 %
## Coverage of cases (0.95 level) 98.006 %
## Mean rel. region size (0.95 level) 16.9175 %
## Total Number of Instances 2658
##
## === Confusion Matrix ===
##
## a b c d e f <-- classified as
## 513 0 0 0 0 1 | a = laying
## 0 458 17 0 0 0 | b = sitting
## 0 7 492 0 0 0 | c = standing
## 0 0 0 408 7 6 | d = walk
## 0 0 0 7 349 6 | e = walkdown
## 0 0 0 2 7 378 | f = walkup
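Weka's J48 classifier correctly classifies 97.7% of the instances in the 10-fold cross-validation over the test subjects' data shown above, compared with roughly 86% test accuracy for the `tree` models. Note that this figure is a cross-validation estimate computed within the test data, not a strict hold-out evaluation of the tree fitted on `train`.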