Exploratory data analysis is a “rough cut” or filter which helps you to find the most beneficial areas of questioning so you can set your priorities accordingly.
The aim here is also to show you that “real-world” research isn’t always neat and well-defined like textbook questions with clear-cut answers.
# Load the data set from disk using read.table's default settings.
ssd <- read.table(file = "./data/ssd.csv")
# Number of rows and columns in the full data set.
c(nrow(ssd), ncol(ssd))
## [1] 7352 563
# Positions of the last column (lc1) and the second-to-last column (lc2).
lc1 <- ncol(ssd)
lc2 <- lc1 - 1
# Names of those two trailing columns.
names(ssd)[lc2:lc1]
## [1] "subject" "activity"
# Preview 10 randomly chosen observations, showing only the last two columns
# (subject and activity).
# Row indices are drawn from all nrow(ssd) rows. lc1 is the COLUMN count, so
# sampling from 1:lc1 could only ever pick among the first lc1 rows.
ssd[sample.int(nrow(ssd), 10), lc2:lc1]
## subject activity
## ... 10 rows follow; the selection differs on every run because no seed is set
# Which of the 30 subjects appear in this data set, and how many
# observations does each of them contribute?
table(ssd[["subject"]])
##
## 1 3 5 6 7 8 11 14 15 16 17 19 21 22 23 25 26 27 28 29
## 347 341 302 325 308 281 316 323 328 366 368 360 408 321 372 409 392 376 382 344
## 30
## 383
# Number of distinct subjects present (one table entry per subject).
nrow(table(ssd[["subject"]]))
## [1] 21
So we’re looking at training data from a machine learning repository.
We can infer that this data is meant to train machines to recognize activities from measurements collected by the accelerometers and gyroscopes built into the smartphones that the subjects had strapped to their waists.
# Number of observations recorded for each activity.
table(ssd[["activity"]])
##
## laying sitting standing walk walkdown walkup
## 1407 1286 1374 1226 986 1073
# Store the 'activity' column as a factor.
ssd[["activity"]] <- factor(ssd[["activity"]])
# Keep only the observations that belong to subject 1.
sub1 <- ssd[which(ssd[["subject"]] == 1), , drop = FALSE]
# Number of rows and columns in the subject-1 subset.
c(nrow(sub1), ncol(sub1))
## [1] 347 563
# Names of the first twelve columns of the subject-1 subset.
names(sub1)[1:12]
## [1] "tBodyAcc.mean...X" "tBodyAcc.mean...Y" "tBodyAcc.mean...Z"
## [4] "tBodyAcc.std...X" "tBodyAcc.std...Y" "tBodyAcc.std...Z"
## [7] "tBodyAcc.mad...X" "tBodyAcc.mad...Y" "tBodyAcc.mad...Z"
## [10] "tBodyAcc.max...X" "tBodyAcc.max...Y" "tBodyAcc.max...Z"
# One row of three panels: columns 1-3 of sub1 plotted against observation
# index, coloured by activity and labelled with the column name.
par(mfrow = c(1, 3))
invisible(lapply(1:3, function(k) {
  plot(sub1[, k], col = sub1$activity, ylab = names(sub1)[k])
}))
# Legend (drawn on the last panel) mapping each colour to its activity.
invisible(local({
  shown <- unique(sub1$activity)
  legend("bottomright", legend = shown, col = shown, pch = 1)
}))
# Hierarchical clustering on the same three columns; myplclust() comes from
# the sourced helper file and receives the activity codes as label colours.
source(file = "myplclust.R")
mdist1 <- dist(x = sub1[, 1:3])
mclust1 <- hclust(d = mdist1)
myplclust(mclust1, lab.col = unclass(sub1$activity))
# Names of the first twelve columns of the subject-1 subset.
names(sub1)[1:12]
## [1] "tBodyAcc.mean...X" "tBodyAcc.mean...Y" "tBodyAcc.mean...Z"
## [4] "tBodyAcc.std...X" "tBodyAcc.std...Y" "tBodyAcc.std...Z"
## [7] "tBodyAcc.mad...X" "tBodyAcc.mad...Y" "tBodyAcc.mad...Z"
## [10] "tBodyAcc.max...X" "tBodyAcc.max...Y" "tBodyAcc.max...Z"
# One row of three panels: columns 10-12 of sub1 plotted against observation
# index, coloured by activity and labelled with the column name.
par(mfrow = c(1, 3))
invisible(lapply(10:12, function(k) {
  plot(sub1[, k], col = sub1$activity, ylab = names(sub1)[k])
}))
# Legend (drawn on the last panel) mapping each colour to its activity.
invisible(local({
  shown <- unique(sub1$activity)
  legend("bottomright", legend = shown, col = shown, pch = 1)
}))
# Hierarchical clustering on the same three columns; myplclust() comes from
# the sourced helper file and receives the activity codes as label colours.
source(file = "myplclust.R")
mdist2 <- dist(x = sub1[, 10:12])
mclust2 <- hclust(d = mdist2)
myplclust(mclust2, lab.col = unclass(sub1$activity))
# Singular value decomposition of the scaled feature columns of sub1.
# The second-to-last (lc2) and last (lc1) columns are dropped first.
svd1 <- local({
  features <- sub1[, -c(lc2, lc1)]
  svd(scale(features))
})
# Dimensions of the matrix of LEFT singular vectors.
dim(svd1[["u"]])
## [1] 347 347
# First two LEFT singular vectors, one panel each, with points coloured by
# activity. No ylab is given, so each y-axis label is the deparsed plotted
# expression.
par(mfrow = c(1,2))
plot(svd1$u[,1], col = sub1$activity, pch = 19)
plot(svd1$u[,2], col = sub1$activity, pch = 19)
# First two RIGHT singular vectors, one panel each, drawn in semi-transparent
# black (alpha 0.5).
par(mfrow = c(1,2))
plot(svd1$v[,1], col = rgb(0,0,0,0.5), pch = 19)
plot(svd1$v[,2], col = rgb(0,0,0,0.5), pch = 19)
# Position of the largest (signed) entry of the second right singular vector.
maxCon <- which.max(svd1[["v"]][, 2])
# Name of the column found at that position.
names(sub1)[maxCon]
## [1] "fBodyAcc.meanFreq...Z"
# Hierarchical clustering on columns 10-12 plus the column found above;
# myplclust() comes from the sourced helper file.
source(file = "myplclust.R")
mdist3 <- dist(x = sub1[, c(10:12, maxCon)])
mclust3 <- hclust(d = mdist3)
myplclust(mclust3, lab.col = unclass(sub1$activity))
names(sub1)[maxCon]
## [1] "fBodyAcc.meanFreq...Z"
# k-means with 6 centres on every column except the last two, using a
# single random start.
kClust <- kmeans(x = sub1[, -c(lc2, lc1)], centers = 6)
# Cluster assignment (rows) against recorded activity (columns).
table(kClust[["cluster"]], sub1[["activity"]])
##
## laying sitting standing walk walkdown walkup
## 1 24 33 47 0 0 0
## 2 0 0 0 0 0 34
## 3 0 0 0 95 49 0
## 4 17 12 6 0 0 0
## 5 0 0 0 0 0 19
## 6 9 2 0 0 0 0
# Same clustering, this time with 100 random starts.
kClust <- kmeans(x = sub1[, -c(lc2, lc1)], centers = 6, nstart = 100)
table(kClust[["cluster"]], sub1[["activity"]])
##
## laying sitting standing walk walkdown walkup
## 1 0 37 51 0 0 0
## 2 29 0 0 0 0 0
## 3 18 10 2 0 0 0
## 4 0 0 0 95 0 0
## 5 0 0 0 0 49 0
## 6 3 0 0 0 0 53
# Dimensions of the matrix of cluster centres.
dim(kClust[["centers"]])
## [1] 6 561
# Which cluster captures "laying"? Cross-tabulate cluster assignment (rows)
# against recorded activity (columns).
table(kClust$cluster, sub1$activity)
##
## laying sitting standing walk walkdown walkup
## 1 0 37 51 0 0 0
## 2 29 0 0 0 0 0
## 3 18 10 2 0 0 0
## 4 0 0 0 95 0 0
## 5 0 0 0 0 49 0
## 6 3 0 0 0 0 53
# Select the cluster that holds the most "laying" observations (cluster 2
# with 29 of them in the run shown above). Looking it up from the table
# instead of matching a hard-coded cluster size (kClust$size == 29) always
# yields exactly one cluster, even when k-means numbers or sizes the clusters
# differently on another run.
laying <- unname(which.max(table(kClust$cluster, sub1$activity)[, "laying"]))
# Centre of that cluster, restricted to the first 12 features.
plot(kClust$centers[laying, 1:12], pch = 19, ylab = "Laying Cluster")
# Names of the first three features.
names(sub1[, 1:3])
## [1] "tBodyAcc.mean...X" "tBodyAcc.mean...Y" "tBodyAcc.mean...Z"
# Dimensions of the matrix of cluster centres.
dim(kClust$centers)
## [1] 6 561
# Which cluster captures "walkdown"? Cross-tabulate cluster assignment (rows)
# against recorded activity (columns).
table(kClust$cluster, sub1$activity)
##
## laying sitting standing walk walkdown walkup
## 1 0 37 51 0 0 0
## 2 29 0 0 0 0 0
## 3 18 10 2 0 0 0
## 4 0 0 0 95 0 0
## 5 0 0 0 0 49 0
## 6 3 0 0 0 0 53
# Select the cluster that holds the most "walkdown" observations (cluster 5
# with 49 of them in the run shown above). Looking it up from the table
# instead of matching a hard-coded cluster size (kClust$size == 49) always
# yields exactly one cluster, even when k-means numbers or sizes the clusters
# differently on another run.
walkdown <- unname(which.max(table(kClust$cluster, sub1$activity)[, "walkdown"]))
# Centre of that cluster, restricted to the first 12 features.
plot(kClust$centers[walkdown, 1:12], pch = 19, ylab = "Walkdown Cluster")