UCI HAR dataset

One of the most exciting areas in all of data science right now is wearable computing - see for example this article.

Data set information was download from http://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones#. “The experiments have been carried out with a group of 30 volunteers within an age bracket of 19-48 years. Each person performed six activities (WALKING, WALKING UPSTAIRS, WALKING DOWNSTAIRS, SITTING, STANDING, LAYING) wearing a smartphone (Samsung Galaxy S II) on the waist. Using its embedded accelerometer and gyroscope, we captured 3-axial linear acceleration and 3-axial angular velocity at a constant rate of 50Hz. The experiments have been video-recorded to label the data manually. The obtained dataset has been randomly partitioned into two sets, where 70% of the volunteers was selected for generating the training data and 30% the test data. The sensor signals (accelerometer and gyroscope) were pre-processed by applying noise filters and then sampled in fixed-width sliding windows of 2.56 sec and 50% overlap (128 readings/window). The sensor acceleration signal, which has gravitational and body motion components, was separated using a Butterworth low-pass filter into body acceleration and gravity. The gravitational force is assumed to have only low frequency components, therefore a filter with 0.3 Hz cutoff frequency was used. From each window, a vector of features was obtained by calculating variables from the time and frequency domain.”

For each record in the raw dataset it is provided:

* Triaxial acceleration from the accelerometer (total acceleration) and the estimated body acceleration.
* Triaxial Angular velocity from the gyroscope.
* A 561-feature vector with time and frequency domain variables.
* Its activity label.
* An identifier of the subject who carried out the experiment.
load("../courses/04_ExploratoryAnalysis/clusteringExample/data/samsungData.rda")
names(samsungData)[1:12]
##  [1] "tBodyAcc-mean()-X" "tBodyAcc-mean()-Y" "tBodyAcc-mean()-Z"
##  [4] "tBodyAcc-std()-X"  "tBodyAcc-std()-Y"  "tBodyAcc-std()-Z" 
##  [7] "tBodyAcc-mad()-X"  "tBodyAcc-mad()-Y"  "tBodyAcc-mad()-Z" 
## [10] "tBodyAcc-max()-X"  "tBodyAcc-max()-Y"  "tBodyAcc-max()-Z"

table(samsungData$activity)
## 
##   laying  sitting standing     walk walkdown   walkup 
##     1407     1286     1374     1226      986     1073

##plotting average acceleration for first subject

samsungData <- transform(samsungData, activity = factor(activity), subject = factor(subject))
library(ggplot2)
qplot(data = samsungData, x = subject, fill = activity)

plot of chunk processData

library(plyr)
numPredictors = ncol(samsungData) - 2
dataSd = colwise(sd)(samsungData[, 1:numPredictors])
dataSd$stat = "Predictor Variable Standard Deviation"
dataMean = colwise(mean)(samsungData[, 1:numPredictors])
dataMean$stat = "Predictor Variable Mean"
library(reshape2)
temp = melt(rbind(dataMean, dataSd), c("stat"))
qplot(data = temp, x = value, binwidth = 0.025) + facet_wrap(~stat, ncol = 1)

plot of chunk unnamed-chunk-2

sub1 <- subset(samsungData, subject == 1)

par(mfrow = c(1, 2), mar = c(5, 4, 1, 1))
plot(sub1[, 1], col = sub1$activity, ylab = names(sub1)[1])
plot(sub1[, 2], col = sub1$activity, ylab = names(sub1)[2])
legend("bottomright", legend = unique(sub1$activi), col = unique(sub1$activity), 
    pch = 1)

plot of chunk unnamed-chunk-3

myplclust <- function(hclust, lab = hclust$labels, lab.col = rep(1, length(hclust$labels)), 
    hang = 0.1, ...) {
    ## 
    y <- rep(hclust$height, 2)
    x <- as.numeric(hclust$merge)
    y <- y[which(x < 0)]
    x <- x[which(x < 0)]
    x <- abs(x)
    y <- y[order(x)]
    x <- x[order(x)]
    plot(hclust, labels = FALSE, hang = hang, ...)
    text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order], 
        col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)

}

Clustering based just on average acceleration

# source('myplclust.R')
distanceMatrix <- dist(sub1[, 1:3])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activity))

plot of chunk unnamed-chunk-5

#plotting max acceleration for the first subject

par(mfrow = c(1, 2))
plot(sub1[, 10], pch = 19, col = sub1$activity, ylab = names(sub1)[10])
plot(sub1[, 11], pch = 19, col = sub1$activity, ylab = names(sub1)[11])

plot of chunk unnamed-chunk-6

par(mfrow = c(1, 1))

Clustering based on maximum acceleration

# source('myplclust.R')
distanceMatrix <- dist(sub1[, 10:12])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activity))

plot of chunk unnamed-chunk-7

#Singular Value decomposition

svd1 = svd(scale(sub1[, -c(562, 563)]))
par(mfrow = c(1, 2))
plot(svd1$u[, 1], col = sub1$activity, pch = 19)
plot(svd1$u[, 2], col = sub1$activity, pch = 19)

plot of chunk unnamed-chunk-8

Find maximum contributor

plot(svd1$v[, 2], pch = 19)

plot of chunk unnamed-chunk-9


plot(sub1[, 296], col = sub1$activity, ylab = names(sub1[296]))

plot of chunk unnamed-chunk-9


New clustering with maximum contributer

maxContrib <- which.max(svd1$v[, 2])
distanceMatrix <- dist(sub1[, c(10:12, maxContrib)])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering, lab.col = unclass(sub1$activity))

plot of chunk unnamed-chunk-10


New clustering with maximum contributer

names(samsungData)[maxContrib]
## [1] "fBodyAcc.meanFreq...Z"
(topN <- order(svd1$v[, 2], decreasing = T)[1:5])
## [1] 296 249 210 223  74
n <- names(samsungData)[topN]
r <- svd1$v[, 2][topN]
rbind(n, r)
##   [,1]                    [,2]                     
## n "fBodyAcc.meanFreq...Z" "tBodyGyroMag.arCoeff..1"
## r "0.12232138827709"      "0.121065159901176"      
##   [,3]                     [,4]                       
## n "tBodyAccMag.arCoeff..1" "tGravityAccMag.arCoeff..1"
## r "0.11408410152257"       "0.11408410152257"         
##   [,5]                       
## n "tGravityAcc.arCoeff...Z.1"
## r "0.112031658115089"

qplot(sub1[, 296], col = sub1$activity, xlab = names(sub1[296]), binwidth = 2/30, 
    fill = sub1$activity)
## Error: could not find function "qplot"

#kmean clustering

kClust <- kmeans(sub1[, -c(562, 563)], centers = 6)
table(kClust$cluster, sub1$activity)
##    
##     laying sitting standing walk walkdown walkup
##   1      0       0        0   95        0      0
##   2     10       2        0    0        0      0
##   3      0       0        0    0       49      0
##   4      0       0        0    0        0     53
##   5     24      33       46    0        0      0
##   6     16      12        7    0        0      0

kClust <- kmeans(sub1[, -c(562, 563)], centers = 6, nstart = 1)
table(kClust$cluster, sub1$activity)
##    
##     laying sitting standing walk walkdown walkup
##   1      0       0       47    0        0      0
##   2      0       0        0   95       49     53
##   3     27       0        0    0        0      0
##   4      9       2        0    0        0      0
##   5      0      34        3    0        0      0
##   6     14      11        3    0        0      0

kClust <- kmeans(sub1[, -c(562, 563)], centers = 6, nstart = 100)
table(kClust$cluster, sub1$activity)
##    
##     laying sitting standing walk walkdown walkup
##   1     29       0        0    0        0      0
##   2      0      37       51    0        0      0
##   3      0       0        0   95        0      0
##   4     18      10        2    0        0      0
##   5      0       0        0    0       49      0
##   6      3       0        0    0        0     53