Samsung Data Project for Coursera


# Restore the saved objects (expected to include `samsungData`).
load(file = "./samsungData.rda")
# colnames(samsungData)  # uncomment to inspect the raw column names

The call below rewrites every column name into a syntactically valid, unique name, thereby removing duplicate column names

# make.names(unique = TRUE) yields syntactically valid names and
# disambiguates any repeated ones.
names(samsungData) <- make.names(colnames(samsungData), unique = TRUE)

Checking for completeness of data, below:

# samsungData[!complete.cases(samsungData),] # this will return all rows
# where incomplete information is present

Converting subject activity to factor

# Treat the activity label as a categorical response.
samsungData[["activity"]] <- as.factor(samsungData[["activity"]])

Removing dots (".") from the column names to clean them up

# fixed = TRUE makes "." a literal dot rather than the regex wildcard.
colnames(samsungData) <- gsub(
  pattern = ".",
  replacement = "",
  x = colnames(samsungData),
  fixed = TRUE
)

Creating test and train data sets

# Split by subject id: ids below 23 go to training, the rest to testing.
# which() drops rows whose comparison is NA, as subset() does.
train <- samsungData[which(samsungData$subject < 23), ]
test <- samsungData[which(samsungData$subject >= 23), ]

Removing subject column from analysis

# Drop the subject identifier by name rather than by position (it was column
# 562), so the right column is still removed if the column layout changes.
train <- train[, names(train) != "subject", drop = FALSE]
test <- test[, names(test) != "subject", drop = FALSE]

Applying decision tree

# Fit a classification tree for `activity` with every other column of
# `train` as a candidate predictor, then draw the labelled tree.
library(tree)
dec1 <- tree(formula = activity ~ ., data = train)
plot(x = dec1)
text(x = dec1)
title(main = "Decision Tree")

plot of chunk Decision_Tree

# Report the variables used, the number of terminal nodes, and the
# training-set deviance and misclassification rate of the full tree.
summary(dec1)

## 
## Classification tree:
## tree(formula = activity ~ ., data = train)
## Variables actually used in tree construction:
## [1] "fBodyAccJerkstdX"       "tGravityAccminX"       
## [3] "tGravityAccmaxY"        "tGravityAccminY"       
## [5] "fBodyAccMagenergy"      "tGravityAccarCoeffY2"  
## [7] "fBodyGyromaxIndsX"      "tBodyGyrocorrelationYZ"
## Number of terminal nodes:  10 
## Residual mean deviance:  0.527 = 2470 / 4680 
## Misclassification error rate: 0.0942 = 442 / 4694

On training data, the decision tree gives a misclassification rate of 9.4%. Below, the variables used at the main node and the second node are plotted against each other to see the trend in the data

# Scatter plot of two of the tree's split variables, one point per training
# row; each point is coloured by the integer code of its activity level.
plot(train$fBodyAccJerkstdX, train$fBodyAccMagenergy, pch = 19, col = as.numeric(train$activity))
# The legend lists activities in order of first appearance (unique()).
# NOTE(review): `col` is given a factor here; presumably it is mapped through
# its integer codes and so matches the point colours above -- confirm.
legend("topright", legend = unique(train$activity), col = unique(train$activity), 
    pch = 19)
title("Two Main Variables that Define Subject's activity in 2D plot")

plot of chunk unnamed-chunk-1


#
# plot(train$tGravityAccminX,train$fBodyAccMagenergy,pch=19,col=as.numeric(train$activity))
# legend('topleft',legend=unique(train$activity),col=unique(train$activity),pch=19)

Testing the tree produced above on test data

# test1: per-class probabilities for each test row (the default type).
# test2: the single predicted class label for each test row.
test1 <- predict(dec1, newdata = test, type = "vector")
test2 <- predict(dec1, newdata = test, type = "class")
summary(test1)

##      laying         sitting         standing          walk      
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :0.000   Median :0.000  
##  Mean   :0.193   Mean   :0.188   Mean   :0.195   Mean   :0.200  
##  3rd Qu.:0.000   3rd Qu.:0.304   3rd Qu.:0.142   3rd Qu.:0.419  
##  Max.   :1.000   Max.   :0.858   Max.   :0.997   Max.   :0.916  
##     walkdown         walkup      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000  
##  Mean   :0.123   Mean   :0.1015  
##  3rd Qu.:0.146   3rd Qu.:0.0598  
##  Max.   :0.934   Max.   :0.9847

# Count how many test rows were assigned to each predicted activity.
summary(test2)

##   laying  sitting standing     walk walkdown   walkup 
##      514      522      496      390      267      469

The confusion matrix is below

# Rows are the actual activities, columns are the full tree's predictions.
table(test$activity, test2)

##           test2
##            laying sitting standing walk walkdown walkup
##   laying      514       0        0    0        0      0
##   sitting       0     438       37    0        0      0
##   standing      0      84      415    0        0      0
##   walk          0       0        8  344        3     66
##   walkdown      0       0        0   21      258     83
##   walkup        0       0       36   25        6    320

Overall accuracy on test data is

# Share of test rows whose predicted class equals the actual activity.
sum(test2 == test$activity) / nrow(test)

## [1] 0.8612

Crossvalidation tree:

# cv.tree() assigns observations to cross-validation folds at random; fix the
# seed so the cross-validated deviance curve is reproducible between runs.
set.seed(1)
plot(cv.tree(dec1))

plot of chunk unnamed-chunk-5

# Same cross-validation, but scored by number of misclassifications instead
# of deviance. The folds are random, so fix the seed for a reproducible curve.
set.seed(1)
plot(cv.tree(dec1, FUN = prune.tree, method = "misclass"))

plot of chunk unnamed-chunk-5

From the graphs above it can be concluded that a tree of size 6 will give a good model for estimation.

So, pruning the tree to size 6

# Cut the full tree back to (at most) six terminal nodes and draw it.
pru6 <- prune.tree(tree = dec1, best = 6)
plot(x = pru6)
text(x = pru6)
title(main = "Decision Tree with 6 Nodes")

plot of chunk unnamed-chunk-6

# Training-set fit of the six-node tree, for comparison with the full tree.
summary(pru6)

## 
## Classification tree:
## snip.tree(tree = dec1, nodes = c(10L, 7L, 12L, 13L))
## Variables actually used in tree construction:
## [1] "fBodyAccJerkstdX"     "tGravityAccminX"      "tGravityAccmaxY"     
## [4] "fBodyAccMagenergy"    "tGravityAccarCoeffY2"
## Number of terminal nodes:  6 
## Residual mean deviance:  0.707 = 3310 / 4690 
## Misclassification error rate: 0.108 = 508 / 4694

The summary above shows that reducing the number of terminal nodes from 10 to 6 increases the misclassification rate on training data by about 1.4 percentage points, from 9.42% to 10.82%. Using this pruned tree on test data to predict values.

# Predicted activity labels from the six-node tree for the test rows.
test3 <- predict(object = pru6, newdata = test, type = "class")

Overall accuracy on test data is

# Share of test rows the pruned tree classifies correctly.
sum(test3 == test$activity) / nrow(test)

## [1] 0.8548

Confusion matrix with a tree of size 6

# test3
# Rows are the actual activities, columns are the pruned tree's predictions.
table(test$activity, test3)

##           test3
##            laying sitting standing walk walkdown walkup
##   laying      514       0        0    0        0      0
##   sitting       0     438       37    0        0      0
##   standing      0      84      415    0        0      0
##   walk          0       0        8  350        3     60
##   walkdown      0       0        0   38      260     64
##   walkup        0       0       36   31       25    295

Checking by pruning the tree to size 8

# Cut the full tree back to (at most) eight terminal nodes and draw it.
pru8 <- prune.tree(tree = dec1, best = 8)
plot(x = pru8)
text(x = pru8)
title(main = "Decision Tree with 8 Nodes")

plot of chunk unnamed-chunk-10

# Training-set fit of the eight-node tree, for comparison with the full tree.
summary(pru8)

## 
## Classification tree:
## snip.tree(tree = dec1, nodes = c(10L, 7L))
## Variables actually used in tree construction:
## [1] "fBodyAccJerkstdX"     "tGravityAccminX"      "tGravityAccmaxY"     
## [4] "fBodyAccMagenergy"    "tGravityAccarCoeffY2" "fBodyGyromaxIndsX"   
## [7] "tGravityAccminY"     
## Number of terminal nodes:  8 
## Residual mean deviance:  0.599 = 2810 / 4690 
## Misclassification error rate: 0.101 = 475 / 4694

The summary above shows that reducing the number of terminal nodes from 10 to 8 increases the misclassification rate on training data by about 0.7 percentage points, from 9.42% to 10.12%. Using this pruned tree on test data to predict values.

# Predicted activity labels from the eight-node tree for the test rows.
# This reuses the name `test3`, replacing the six-node predictions.
test3 <- predict(object = pru8, newdata = test, type = "class")

Overall accuracy on test data is

# Share of test rows the pruned tree classifies correctly.
sum(test3 == test$activity) / nrow(test)

## [1] 0.8548

Confusion matrix with a tree of size 8

# test3
# Rows are the actual activities, columns are the pruned tree's predictions.
table(test$activity, test3)

##           test3
##            laying sitting standing walk walkdown walkup
##   laying      514       0        0    0        0      0
##   sitting       0     438       37    0        0      0
##   standing      0      84      415    0        0      0
##   walk          0       0        8  344        3     66
##   walkdown      0       0        0   21      260     81
##   walkup        0       0       36   25       25    301

No improvement in test accuracy was found by increasing the tree size from 6 to 8 (both give 0.8548)

3D plot of three important variables that define the subject's activity

# 3D scatter of three of the tree's split variables, one point per training
# row, coloured by the integer code of the activity level.
library(scatterplot3d)
scatterplot3d(train$fBodyAccJerkstdX, train$fBodyAccMagenergy, train$tGravityAccminX, 
    pch = 16, color = as.numeric(train$activity), mar = c(5, 3, 5, 7) + 0.1)
# Legend placed at explicit x/y coordinates.
# NOTE(review): `inset` presumably only applies when the legend position is
# given as a keyword, so it likely has no effect here -- confirm.
legend(x = 4, y = 1, legend = unique(train$activity), col = unique(train$activity), 
    pch = 19, inset = 0.05, title = "Three Paramters That Define Subject's activity", 
    xjust = 1, cex = 1, x.intersp = 0.1)
title("Three Main Variables that Define Subject's activity in 3D plot")

plot of chunk unnamed-chunk-14

Trying Weka's J48 classifier:

library(RWeka)
library(rJava)
library(party)

## Loading required package: grid
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following object(s) are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following object(s) are masked from 'package:rJava':
## 
##     clone

# Fit Weka's J48 decision tree on the training rows and draw it.
j1 <- J48(formula = activity ~ ., data = train)
plot(x = j1)

plot of chunk J48

# Evaluation statistics and confusion matrix of J48 on its own training data.
summary(j1)

## 
## === Summary ===
## 
## Correctly Classified Instances        4676               99.6165 %
## Incorrectly Classified Instances        18                0.3835 %
## Kappa statistic                          0.9954
## Mean absolute error                      0.0023
## Root mean squared error                  0.0342
## Relative absolute error                  0.8432 %
## Root relative squared error              9.1827 %
## Coverage of cases (0.95 level)          99.787  %
## Mean rel. region size (0.95 level)      16.9365 %
## Total Number of Instances             4694     
## 
## === Confusion Matrix ===
## 
##    a   b   c   d   e   f   <-- classified as
##  893   0   0   0   0   0 |   a = laying
##    0 807   4   0   0   0 |   b = sitting
##    0   3 872   0   0   0 |   c = standing
##    0   0   0 801   2   2 |   d = walk
##    0   0   0   2 622   0 |   e = walkdown
##    0   0   0   2   3 681 |   f = walkup

# NOTE(review): with numFolds = 10 this runs a 10-fold cross-validation over
# the rows of `test` (see the report header below) rather than scoring the
# already-fitted `j1` on held-out data. Presumably numFolds = 0 gives the
# held-out evaluation -- confirm against the RWeka documentation.
evaluate_Weka_classifier(j1, test, numFolds = 10)

## === 10 Fold Cross Validation ===
## 
## === Summary ===
## 
## Correctly Classified Instances        2598               97.7427 %
## Incorrectly Classified Instances        60                2.2573 %
## Kappa statistic                          0.9728
## Mean absolute error                      0.0082
## Root mean squared error                  0.0853
## Relative absolute error                  2.9708 %
## Root relative squared error             22.9362 %
## Coverage of cases (0.95 level)          98.006  %
## Mean rel. region size (0.95 level)      16.9175 %
## Total Number of Instances             2658     
## 
## === Confusion Matrix ===
## 
##    a   b   c   d   e   f   <-- classified as
##  513   0   0   0   0   1 |   a = laying
##    0 458  17   0   0   0 |   b = sitting
##    0   7 492   0   0   0 |   c = standing
##    0   0   0 408   7   6 |   d = walk
##    0   0   0   7 349   6 |   e = walkdown
##    0   0   0   2   7 378 |   f = walkup

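Weka's J48 classifier classifies the data correctly in most cases: 99.6% of the training instances, and 97.7% of the test instances under 10-fold cross-validation.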