Load the necessary packages

library(plyr)    # mapping
## Warning: package 'plyr' was built under R version 4.0.5
library(MASS)    # lda and qda
library(class)   # kNN
library(ggplot2) # beautiful plots
## Warning: package 'ggplot2' was built under R version 4.0.5

Import training and test data

# train dataset and its corresponding labels
data=read.table("Train/X_train.txt",header = FALSE, sep = " ", dec = ".")
labels=read.table("Train/y_train.txt",header=FALSE)

# test dataset and its corresponding labels
data_test=read.table("Test/X_test.txt",header=FALSE, sep= " ",dec = ".")
labels_test=read.table("Test/y_test.txt",header = FALSE)

Insert header names for each attribute and response variables

# features' names can be found in features.txt
featurenames=read.table("features.txt",header = FALSE)
# some of the feature names are redundant
featurenames=make.unique(featurenames[,1],sep="_")
colnames(data)=featurenames
colnames(data_test)=featurenames
# activity types are numeric
colnames(labels)="activity_types"
colnames(labels_test)="activity_types"

Construct complete dataframe

# activity labels are characters
act_labels=read.table("activity_labels.txt",header = FALSE)[,2]
# map the numerical response variable to its real classes
labels$at=mapvalues(labels$activity_types,from=c(1:12),to = act_labels)
labels_test$at=mapvalues(labels_test$activity_types,from=c(1:12),to=act_labels)

# Construct complete dataframe
X_train=cbind(data,labels)
# Filter the data to have activity types from 1 to 6.
X_train=X_train[(X_train$activity_types>=1) & (X_train$activity_types<=6),]
X_test=cbind(data_test,labels_test)
X_test=X_test[(X_test$activity_types>=1) & (X_test$activity_types<=6),]

# remove unwanted variables
rm(data,data_test,labels,labels_test)

Linear Discriminant Analysis (LDA) classifier

start_time=Sys.time()
# LDA model training
model_lda=lda(at~.-activity_types,data=X_train)
## Warning in lda.default(x, grouping, ...): variables are collinear
end_time=Sys.time()
(t_train_lda=end_time-start_time)
## Time difference of 11.74278 secs
# Encounter the problems of collinearity
pred=predict(model_lda,newdata = X_train)$class
nclass=max(X_train$activity_types)

# performance metrics for multiclass problem
perform_metric=function(true_label,pred,nclass) {
  conf=table(truth=true_label,prediction=pred)    # confusion matrix
  nsample=sum(conf)
  acc=sum(diag(conf))/nsample   # accuracy
  Rowsum=rowSums(conf)
  Colsum=colSums(conf)
  # Kappa statistics
  expected_acc=sum(Rowsum*Colsum/nsample)/nsample
  Kappa=(acc-expected_acc)/(1-expected_acc)
  sensitivity=diag(conf)/Rowsum
  precision=diag(conf)/Colsum
  specificity=c()
  for (i in 1:nclass) {
    specificity[i]=sum(conf[-i,-i])/(sum(conf[-i,-i])+colSums(conf)[i]-diag(conf)[i])
  }
  names(specificity)=names(sensitivity)
  list(confusion_matrix=conf,
       accuracy=round(acc,4),Kappa_statistic=round(Kappa,4),
       sensitivity=round(sensitivity,4),specificity=round(specificity,4),
       precision=round(precision,4))
}

# Training performance
perform_metric(X_train$at,pred,nclass)
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING               1413       0        0       0                  0
##   SITTING                 6    1244       43       0                  0
##   STANDING                0      52     1371       0                  0
##   WALKING                 0       0        0    1225                  0
##   WALKING_DOWNSTAIRS      0       0        0       0                985
##   WALKING_UPSTAIRS        0       0        0       0                  0
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           0
##   STANDING                          0
##   WALKING                           1
##   WALKING_DOWNSTAIRS                2
##   WALKING_UPSTAIRS               1073
## 
## $accuracy
## [1] 0.986
## 
## $Kappa_statistic
## [1] 0.9831
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9621             0.9635             0.9992 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9980             1.0000 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             0.9990             0.9915             0.9928             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             1.0000             0.9995 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             0.9958             0.9599             0.9696             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             1.0000             0.9972
# test performance
pred=predict(model_lda,newdata = X_test)$class
perform_metric(X_test$at,pred,nclass)
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING                545       0        0       0                  0
##   SITTING                 0     445       62       0                  0
##   STANDING                0      23      533       0                  0
##   WALKING                 0       0        0     493                  0
##   WALKING_DOWNSTAIRS      0       0        0       1                404
##   WALKING_UPSTAIRS        0       0        0      10                  0
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           1
##   STANDING                          0
##   WALKING                           3
##   WALKING_DOWNSTAIRS               15
##   WALKING_UPSTAIRS                461
## 
## $accuracy
## [1] 0.9616
## 
## $Kappa_statistic
## [1] 0.9538
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.8760             0.9586             0.9940 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9619             0.9788 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9908             0.9746             0.9956 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             1.0000             0.9925 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9509             0.8958             0.9782 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             1.0000             0.9604

k-Nearest Neighbors (kNN) classifier

kfold=5    # k-fold cv, k=5
nn=30     # maximum number of nearest neighbors

y_train=factor(X_train$at)
nfeatures=561    # total number of features

# function for searching the best k for kNN
kfold_knn_acc=function(data,label,kfold,nn,nfeatures,seed=100) {
  set.seed(seed)
  idx_kfold=sample(1:kfold,size=nrow(data),replace=TRUE)
  # Matrix to store the cv accuracy
  acc=matrix(,nrow=5,ncol=(nn-1))
  for (i in 1:kfold) {
    data_train=data[,1:nfeatures][idx_kfold!=i,]
    data_val=data[,1:nfeatures][idx_kfold==i,]
    labels_train=label[idx_kfold!=i]
    labels_val=label[idx_kfold==i]
    for (j in 2:nn) {
      pred_knn=knn(data_train,data_val,labels_train,k=j)
      acc[i,j-1]=mean(pred_knn==labels_val)
    } 
  }
  acc
}
# acc=kfold_knn_acc(X_train,y_train,kfold,nn,nfeatures)
# The above code execution to get the optimal number of nearest neighbors, k would take
# around 40-45 minutes. It had been saved as "cv_acc_knn_1.RData", So, we load the data
load('cv_acc_knn_1.RData')

# Visualize the average accuracies for each choices of k
mean_acc=apply(acc,2,mean)
std_acc=apply(acc, 2, sd)
plot_acc_knn=data.frame(nn=c(2:30),mean_acc,std_acc)
k_opt= which.max(mean_acc)+1

ggplot(data=plot_acc_knn,aes(x=nn,y=mean_acc,ymin=mean_acc-std_acc,ymax=mean_acc+std_acc)) +
  geom_line() + geom_point() + geom_errorbar() +
  geom_vline(xintercept = k_opt,color="red",linetype=2) +
  xlab("number of nearest neighbors") +
  ylab("accuracy")

# training performance
pred_knn=knn(X_train[,1:nfeatures],X_train[,1:nfeatures],y_train,k=k_opt)
perform_metric(y_train,pred_knn,nclass)
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING               1413       0        0       0                  0
##   SITTING                 4    1211       76       0                  0
##   STANDING                0      33     1388       0                  0
##   WALKING                 0       0        0    1226                  0
##   WALKING_DOWNSTAIRS      0       0        0       0                985
##   WALKING_UPSTAIRS        0       0        0       0                  1
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           2
##   STANDING                          2
##   WALKING                           0
##   WALKING_DOWNSTAIRS                2
##   WALKING_UPSTAIRS               1072
## 
## $accuracy
## [1] 0.9838
## 
## $Kappa_statistic
## [1] 0.9805
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9366             0.9754             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9980             0.9991 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             0.9993             0.9946             0.9873             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9998             0.9991 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             0.9972             0.9735             0.9481             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9990             0.9944
# test performance
y_test=factor(X_test$at)
pred_knn=knn(X_train[,1:nfeatures],X_test[,1:nfeatures],y_train,k=k_opt)
perform_metric(y_test,pred_knn,nclass)
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING                541       2        1       0                  0
##   SITTING                 0     398      106       0                  0
##   STANDING                0      42      513       0                  0
##   WALKING                 0       0        0     486                  8
##   WALKING_DOWNSTAIRS      0       0        0      40                329
##   WALKING_UPSTAIRS        0       0        0      36                 10
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            1
##   SITTING                           4
##   STANDING                          1
##   WALKING                           2
##   WALKING_DOWNSTAIRS               51
##   WALKING_UPSTAIRS                425
## 
## $accuracy
## [1] 0.8985
## 
## $Kappa_statistic
## [1] 0.8779
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             0.9927             0.7835             0.9227             0.9798 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.7833             0.9023 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9823             0.9561             0.9696 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9930             0.9766 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9005             0.8274             0.8648 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9481             0.8781

Principal component analysis (PCA)

pca=prcomp(X_train[,1:nfeatures])

# variance explained
var_explained=pca$sdev^2/sum(pca$sdev^2)
plot_var_exp=data.frame(no_prin_comp=c(1:10),var=var_explained[1:10])

# Scree plot
ggplot(data=plot_var_exp,aes(x=no_prin_comp,y=var)) +
  geom_line() + geom_point() +
  xlab("Number of principal components") +
  ylab("Variance explained")

# Visualize in grouped scatterplot
pca_plot=data.frame(pca$x[,c(1,2)],class=X_train$at)
ggplot(data=pca_plot,aes(x=PC1,y=PC2)) +
  geom_point(aes(color=factor(class)))

# cumulative percentages of variance explained
cum_var=cumsum(var_explained)
nf=length(cum_var[cum_var<=0.95])
X_pca=data.frame(pca$x[,1:(nf+1)])
X_pca=cbind(X_pca,at=X_train$at)

# Prepare test data
X_pca_test=predict(pca,newdata=X_test[,1:nfeatures])[,1:(nf+1)]
xx=as.data.frame(X_pca_test)
X_pca_test=cbind(xx,at=X_test$at)

PCA + LDA

start_time=Sys.time()
model_lda=lda(at~.,data=X_pca)
end_time=Sys.time()
(t_train_pca_lda=end_time-start_time)
## Time difference of 0.2933819 secs
pred=predict(model_lda,newdata = X_pca)$class
perform_metric(X_pca$at,pred,nclass)  # Training
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING               1413       0        0       0                  0
##   SITTING                13    1072      208       0                  0
##   STANDING                0     112     1311       0                  0
##   WALKING                 0       0        0    1200                  0
##   WALKING_DOWNSTAIRS      0       0        0      10                939
##   WALKING_UPSTAIRS        0       0        0      16                 11
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           0
##   STANDING                          0
##   WALKING                          26
##   WALKING_DOWNSTAIRS               38
##   WALKING_UPSTAIRS               1046
## 
## $accuracy
## [1] 0.9415
## 
## $Kappa_statistic
## [1] 0.9295
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.8291             0.9213             0.9788 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9514             0.9748 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             0.9978             0.9817             0.9653             0.9958 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9983             0.9899 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             0.9909             0.9054             0.8631             0.9788 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9884             0.9423
pred=predict(model_lda,newdata = X_pca_test)$class
perform_metric(X_pca_test$at,pred,nclass)   # test
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING                545       0        0       0                  0
##   SITTING                 0     413       95       0                  0
##   STANDING                0      32      524       0                  0
##   WALKING                 0       0        0     488                  4
##   WALKING_DOWNSTAIRS      0       0        0      10                369
##   WALKING_UPSTAIRS        0       0        0      38                  0
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           0
##   STANDING                          0
##   WALKING                           4
##   WALKING_DOWNSTAIRS               41
##   WALKING_UPSTAIRS                433
## 
## $accuracy
## [1] 0.9252
## 
## $Kappa_statistic
## [1] 0.9101
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.8130             0.9424             0.9839 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.8786             0.9193 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9871             0.9611             0.9808 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9984             0.9822 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9281             0.8465             0.9104 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9893             0.9059

PCA + QDA

start_time=Sys.time()
model_qda=qda(at~.,data=X_pca)
end_time=Sys.time()
(t_train_pca_qda=end_time-start_time)
## Time difference of 0.1606681 secs
pred=predict(model_qda,newdata = X_pca)$class
perform_metric(X_pca$at,pred,nclass)   # Training
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING               1413       0        0       0                  0
##   SITTING                 1    1169      123       0                  0
##   STANDING                0      34     1388       0                  0
##   WALKING                 0       0        0    1221                  3
##   WALKING_DOWNSTAIRS      0       0        0       0                987
##   WALKING_UPSTAIRS        0       0        0       0                  1
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           0
##   STANDING                          1
##   WALKING                           2
##   WALKING_DOWNSTAIRS                0
##   WALKING_UPSTAIRS               1072
## 
## $accuracy
## [1] 0.9777
## 
## $Kappa_statistic
## [1] 0.9732
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9041             0.9754             0.9959 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             1.0000             0.9991 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             0.9998             0.9944             0.9795             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9994             0.9995 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             0.9993             0.9717             0.9186             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9960             0.9972
pred=predict(model_qda,newdata = X_pca_test)$class
perform_metric(X_pca_test$at,pred,nclass)   #test
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING                545       0        0       0                  0
##   SITTING                 1     433       74       0                  0
##   STANDING                0      25      529       0                  0
##   WALKING                 0       0        0     465                 23
##   WALKING_DOWNSTAIRS      0       0        0       0                399
##   WALKING_UPSTAIRS        0       0        0       6                  2
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           0
##   STANDING                          2
##   WALKING                           8
##   WALKING_DOWNSTAIRS               21
##   WALKING_UPSTAIRS                463
## 
## $accuracy
## [1] 0.9459
## 
## $Kappa_statistic
## [1] 0.935
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.8524             0.9514             0.9375 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9500             0.9830 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             0.9996             0.9900             0.9697             0.9976 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9903             0.9877 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             0.9982             0.9454             0.8773             0.9873 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9410             0.9372

PCA + kNN

kfold=5    # 5-fold cv
nn=30     # maximum number of nearest neighbors
y_train=factor(X_train$at)

# cv to get the best k
start_time=Sys.time()
acc_mat=kfold_knn_acc(X_pca,y_train,kfold,nn,nf+1)
end_time=Sys.time()
(t_train_pca_knn=end_time-start_time)
## Time difference of 3.081214 mins
mean_acc=apply(acc_mat,2,mean)
std_acc=apply(acc_mat, 2, sd)
plot_acc_knn=data.frame(nn=c(2:nn),mean_acc,std_acc)
k_opt=which.max(mean_acc)+1

ggplot(data=plot_acc_knn,aes(x=nn,y=mean_acc,ymin=mean_acc-std_acc,ymax=mean_acc+std_acc)) +
  geom_line() + geom_point() + geom_errorbar() +
  geom_vline(xintercept = k_opt,color="red",linetype=2) +
  xlab("number of nearest neighbors") +
  ylab("accuracy")

# k=3 is optimal

# Training performance
pred_knn=knn(X_pca[,1:nf+1],X_pca[,1:nf+1],y_train,k=k_opt)
perform_metric(y_train,pred_knn,nclass)
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING               1413       0        0       0                  0
##   SITTING                 0    1258       35       0                  0
##   STANDING                0      42     1381       0                  0
##   WALKING                 0       0        0    1226                  0
##   WALKING_DOWNSTAIRS      0       0        0       0                985
##   WALKING_UPSTAIRS        0       0        0       0                  0
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            0
##   SITTING                           0
##   STANDING                          0
##   WALKING                           0
##   WALKING_DOWNSTAIRS                2
##   WALKING_UPSTAIRS               1073
## 
## $accuracy
## [1] 0.9893
## 
## $Kappa_statistic
## [1] 0.9872
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9729             0.9705             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9980             1.0000 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9931             0.9942             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             1.0000             0.9997 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9677             0.9753             1.0000 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             1.0000             0.9981
# Test performance
y_test=factor(X_pca_test$at)
pred_knn=knn(X_pca[,1:nf+1],X_pca_test[,1:nf+1],y_train,k=k_opt)
perform_metric(y_test,pred_knn,nclass)
## $confusion_matrix
##                     prediction
## truth                LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
##   LAYING                539       4        0       1                  0
##   SITTING                 0     409       94       0                  0
##   STANDING                0      63      491       0                  0
##   WALKING                 0       8        0     478                  8
##   WALKING_DOWNSTAIRS      0       4        0      44                316
##   WALKING_UPSTAIRS        0       2        0      39                 24
##                     prediction
## truth                WALKING_UPSTAIRS
##   LAYING                            1
##   SITTING                           5
##   STANDING                          2
##   WALKING                           2
##   WALKING_DOWNSTAIRS               56
##   WALKING_UPSTAIRS                406
## 
## $accuracy
## [1] 0.8808
## 
## $Kappa_statistic
## [1] 0.8566
## 
## $sensitivity
##             LAYING            SITTING           STANDING            WALKING 
##             0.9890             0.8051             0.8831             0.9637 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.7524             0.8620 
## 
## $specificity
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.9674             0.9615             0.9664 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9876             0.9739 
## 
## $precision
##             LAYING            SITTING           STANDING            WALKING 
##             1.0000             0.8347             0.8393             0.8505 
## WALKING_DOWNSTAIRS   WALKING_UPSTAIRS 
##             0.9080             0.8602

Reference

Jorge-L. Reyes-Ortiz, Luca Oneto, Albert Samà, Xavier Parra, Davide Anguita. Transition-Aware Human Activity Recognition Using Smartphones. Neurocomputing. Springer 2015.