Load the necessary packages
library(plyr) # mapping
## Warning: package 'plyr' was built under R version 4.0.5
library(MASS) # lda and qda
library(class) # kNN
library(ggplot2) # beautiful plots
## Warning: package 'ggplot2' was built under R version 4.0.5
Import training and test data
# train dataset and its corresponding labels
data=read.table("Train/X_train.txt",header = FALSE, sep = " ", dec = ".")
labels=read.table("Train/y_train.txt",header=FALSE)
# test dataset and its corresponding labels
data_test=read.table("Test/X_test.txt",header=FALSE, sep= " ",dec = ".")
labels_test=read.table("Test/y_test.txt",header = FALSE)
Assign header names to the attributes and the response variable
# features' names can be found in features.txt
featurenames=read.table("features.txt",header = FALSE)
# some feature names are duplicated, so make them unique
featurenames=make.unique(featurenames[,1],sep="_")
colnames(data)=featurenames
colnames(data_test)=featurenames
# activity types are numeric
colnames(labels)="activity_types"
colnames(labels_test)="activity_types"
Construct the complete data frame
# activity labels are characters
act_labels=read.table("activity_labels.txt",header = FALSE)[,2]
# map the numerical response variable to its real classes
labels$at=mapvalues(labels$activity_types,from=c(1:12),to = act_labels)
labels_test$at=mapvalues(labels_test$activity_types,from=c(1:12),to=act_labels)
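# For reference, the same mapping can be done without plyr by indexing into
# act_labels, since activity_types holds the integer codes 1..12 (a sketch,
# not executed here):
# labels$at=act_labels[labels$activity_types]
# labels_test$at=act_labels[labels_test$activity_types]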
# Construct complete dataframe
X_train=cbind(data,labels)
# keep only the six basic activities (activity types 1 to 6)
X_train=X_train[(X_train$activity_types>=1) & (X_train$activity_types<=6),]
X_test=cbind(data_test,labels_test)
X_test=X_test[(X_test$activity_types>=1) & (X_test$activity_types<=6),]
# remove unwanted variables
rm(data,data_test,labels,labels_test)
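# A quick sanity check on the assembled data frames (commented out here; each
# should have 561 feature columns plus activity_types and at, and at should
# contain only the six basic activity classes):
# dim(X_train); dim(X_test)
# table(X_train$at)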
start_time=Sys.time()
# LDA model training
model_lda=lda(at~.-activity_types,data=X_train)
## Warning in lda.default(x, grouping, ...): variables are collinear
end_time=Sys.time()
(t_train_lda=end_time-start_time)
## Time difference of 11.74278 secs
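# The collinearity warning above means some features are (nearly) linear
# combinations of others. One rough remedy, sketched here but not used for the
# results below, is to drop one feature from each highly correlated pair before
# refitting (the 0.99 cutoff is an arbitrary assumption):
cor_mat=cor(X_train[,1:561])                      # correlations among the 561 features
high_cor=which(abs(cor_mat)>0.99 & upper.tri(cor_mat),arr.ind=TRUE)
drop_cols=unique(colnames(cor_mat)[high_cor[,2]]) # one feature from each correlated pair
# model_lda_red=lda(at~.-activity_types,data=X_train[,!(colnames(X_train) %in% drop_cols)])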
# despite the collinearity warning, LDA still fits and we can compute predictions
pred=predict(model_lda,newdata = X_train)$class
nclass=max(X_train$activity_types)
# performance metrics for a multiclass problem
perform_metric=function(true_label,pred,nclass) {
  conf=table(truth=true_label,prediction=pred) # confusion matrix
  nsample=sum(conf)
  acc=sum(diag(conf))/nsample # overall accuracy
  Rowsum=rowSums(conf)
  Colsum=colSums(conf)
  # Kappa statistic: agreement beyond what is expected by chance
  expected_acc=sum(Rowsum*Colsum/nsample)/nsample
  Kappa=(acc-expected_acc)/(1-expected_acc)
  sensitivity=diag(conf)/Rowsum # per-class recall
  precision=diag(conf)/Colsum   # per-class precision
  specificity=c()
  for (i in 1:nclass) {
    # true negatives / (true negatives + false positives) for class i
    specificity[i]=sum(conf[-i,-i])/(sum(conf[-i,-i])+colSums(conf)[i]-diag(conf)[i])
  }
  names(specificity)=names(sensitivity)
  list(confusion_matrix=conf,
       accuracy=round(acc,4),Kappa_statistic=round(Kappa,4),
       sensitivity=round(sensitivity,4),specificity=round(specificity,4),
       precision=round(precision,4))
}
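# Quick sanity check of the metric function on a tiny two-class example
# (worked by hand, not executed here): with truth a,a,b,b and prediction
# a,b,b,b the observed accuracy is 3/4, the chance accuracy is
# (2*1+2*3)/4^2 = 0.5, and Kappa = (0.75-0.5)/(1-0.5) = 0.5.
# perform_metric(factor(c("a","a","b","b")),factor(c("a","b","b","b")),2)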
# Training performance
perform_metric(X_train$at,pred,nclass)
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 1413 0 0 0 0
## SITTING 6 1244 43 0 0
## STANDING 0 52 1371 0 0
## WALKING 0 0 0 1225 0
## WALKING_DOWNSTAIRS 0 0 0 0 985
## WALKING_UPSTAIRS 0 0 0 0 0
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 0
## STANDING 0
## WALKING 1
## WALKING_DOWNSTAIRS 2
## WALKING_UPSTAIRS 1073
##
## $accuracy
## [1] 0.986
##
## $Kappa_statistic
## [1] 0.9831
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9621 0.9635 0.9992
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9980 1.0000
##
## $specificity
## LAYING SITTING STANDING WALKING
## 0.9990 0.9915 0.9928 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 1.0000 0.9995
##
## $precision
## LAYING SITTING STANDING WALKING
## 0.9958 0.9599 0.9696 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 1.0000 0.9972
# test performance
pred=predict(model_lda,newdata = X_test)$class
perform_metric(X_test$at,pred,nclass)
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 545 0 0 0 0
## SITTING 0 445 62 0 0
## STANDING 0 23 533 0 0
## WALKING 0 0 0 493 0
## WALKING_DOWNSTAIRS 0 0 0 1 404
## WALKING_UPSTAIRS 0 0 0 10 0
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 1
## STANDING 0
## WALKING 3
## WALKING_DOWNSTAIRS 15
## WALKING_UPSTAIRS 461
##
## $accuracy
## [1] 0.9616
##
## $Kappa_statistic
## [1] 0.9538
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.8760 0.9586 0.9940
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9619 0.9788
##
## $specificity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9908 0.9746 0.9956
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 1.0000 0.9925
##
## $precision
## LAYING SITTING STANDING WALKING
## 1.0000 0.9509 0.8958 0.9782
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 1.0000 0.9604
kfold=5 # 5-fold cross-validation
nn=30 # maximum number of nearest neighbors
y_train=factor(X_train$at)
nfeatures=561 # total number of features
# function for searching the best k for kNN
kfold_knn_acc=function(data,label,kfold,nn,nfeatures,seed=100) {
  set.seed(seed)
  # randomly assign each observation to one of the kfold folds
  idx_kfold=sample(1:kfold,size=nrow(data),replace=TRUE)
  # matrix to store the cv accuracy: one row per fold, one column per k=2..nn
  acc=matrix(,nrow=kfold,ncol=(nn-1))
  for (i in 1:kfold) {
    data_train=data[,1:nfeatures][idx_kfold!=i,]
    data_val=data[,1:nfeatures][idx_kfold==i,]
    labels_train=label[idx_kfold!=i]
    labels_val=label[idx_kfold==i]
    for (j in 2:nn) {
      pred_knn=knn(data_train,data_val,labels_train,k=j)
      acc[i,j-1]=mean(pred_knn==labels_val)
    }
  }
  acc
}
# acc=kfold_knn_acc(X_train,y_train,kfold,nn,nfeatures)
# The call above searches for the optimal number of nearest neighbors k and takes
# roughly 40-45 minutes to run, so its accuracy matrix was saved as
# "cv_acc_knn_1.RData" and is simply loaded here.
load('cv_acc_knn_1.RData')
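# If the cross-validation is ever re-run, the accuracy matrix can be cached the
# same way (the loaded object is named acc, as used below):
# acc=kfold_knn_acc(X_train,y_train,kfold,nn,nfeatures)
# save(acc,file="cv_acc_knn_1.RData")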
# Visualize the average cv accuracy for each choice of k
mean_acc=apply(acc,2,mean)
std_acc=apply(acc, 2, sd)
plot_acc_knn=data.frame(nn=c(2:30),mean_acc,std_acc)
k_opt= which.max(mean_acc)+1
ggplot(data=plot_acc_knn,aes(x=nn,y=mean_acc,ymin=mean_acc-std_acc,ymax=mean_acc+std_acc)) +
geom_line() + geom_point() + geom_errorbar() +
geom_vline(xintercept = k_opt,color="red",linetype=2) +
xlab("number of nearest neighbors") +
ylab("accuracy")
# training performance
pred_knn=knn(X_train[,1:nfeatures],X_train[,1:nfeatures],y_train,k=k_opt)
perform_metric(y_train,pred_knn,nclass)
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 1413 0 0 0 0
## SITTING 4 1211 76 0 0
## STANDING 0 33 1388 0 0
## WALKING 0 0 0 1226 0
## WALKING_DOWNSTAIRS 0 0 0 0 985
## WALKING_UPSTAIRS 0 0 0 0 1
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 2
## STANDING 2
## WALKING 0
## WALKING_DOWNSTAIRS 2
## WALKING_UPSTAIRS 1072
##
## $accuracy
## [1] 0.9838
##
## $Kappa_statistic
## [1] 0.9805
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9366 0.9754 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9980 0.9991
##
## $specificity
## LAYING SITTING STANDING WALKING
## 0.9993 0.9946 0.9873 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9998 0.9991
##
## $precision
## LAYING SITTING STANDING WALKING
## 0.9972 0.9735 0.9481 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9990 0.9944
# test performance
y_test=factor(X_test$at)
pred_knn=knn(X_train[,1:nfeatures],X_test[,1:nfeatures],y_train,k=k_opt)
perform_metric(y_test,pred_knn,nclass)
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 541 2 1 0 0
## SITTING 0 398 106 0 0
## STANDING 0 42 513 0 0
## WALKING 0 0 0 486 8
## WALKING_DOWNSTAIRS 0 0 0 40 329
## WALKING_UPSTAIRS 0 0 0 36 10
## prediction
## truth WALKING_UPSTAIRS
## LAYING 1
## SITTING 4
## STANDING 1
## WALKING 2
## WALKING_DOWNSTAIRS 51
## WALKING_UPSTAIRS 425
##
## $accuracy
## [1] 0.8985
##
## $Kappa_statistic
## [1] 0.8779
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 0.9927 0.7835 0.9227 0.9798
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.7833 0.9023
##
## $specificity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9823 0.9561 0.9696
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9930 0.9766
##
## $precision
## LAYING SITTING STANDING WALKING
## 1.0000 0.9005 0.8274 0.8648
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9481 0.8781
pca=prcomp(X_train[,1:nfeatures])
# proportion of variance explained by each principal component
var_explained=pca$sdev^2/sum(pca$sdev^2)
plot_var_exp=data.frame(no_prin_comp=c(1:10),var=var_explained[1:10])
# Scree plot
ggplot(data=plot_var_exp,aes(x=no_prin_comp,y=var)) +
geom_line() + geom_point() +
xlab("Number of principal components") +
ylab("Variance explained")
# Visualize the first two principal components in a grouped scatterplot
pca_plot=data.frame(pca$x[,c(1,2)],class=X_train$at)
ggplot(data=pca_plot,aes(x=PC1,y=PC2)) +
geom_point(aes(color=factor(class)))
# cumulative proportion of variance explained
cum_var=cumsum(var_explained)
# keep enough principal components (nf+1) to explain just over 95% of the variance
nf=length(cum_var[cum_var<=0.95])
X_pca=data.frame(pca$x[,1:(nf+1)])
X_pca=cbind(X_pca,at=X_train$at)
# project the test data onto the same principal components
X_pca_test=predict(pca,newdata=X_test[,1:nfeatures])[,1:(nf+1)]
xx=as.data.frame(X_pca_test)
X_pca_test=cbind(xx,at=X_test$at)
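# As a cross-check on the retained dimensionality: the first index at which the
# cumulative variance reaches 95% should coincide with nf+1 (up to a tie at
# exactly 0.95); shown only as a sketch, the result is not printed.
nf_check=which(cum_var>=0.95)[1]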
start_time=Sys.time()
model_lda=lda(at~.,data=X_pca)
end_time=Sys.time()
(t_train_pca_lda=end_time-start_time)
## Time difference of 0.2933819 secs
pred=predict(model_lda,newdata = X_pca)$class
perform_metric(X_pca$at,pred,nclass) # Training
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 1413 0 0 0 0
## SITTING 13 1072 208 0 0
## STANDING 0 112 1311 0 0
## WALKING 0 0 0 1200 0
## WALKING_DOWNSTAIRS 0 0 0 10 939
## WALKING_UPSTAIRS 0 0 0 16 11
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 0
## STANDING 0
## WALKING 26
## WALKING_DOWNSTAIRS 38
## WALKING_UPSTAIRS 1046
##
## $accuracy
## [1] 0.9415
##
## $Kappa_statistic
## [1] 0.9295
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.8291 0.9213 0.9788
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9514 0.9748
##
## $specificity
## LAYING SITTING STANDING WALKING
## 0.9978 0.9817 0.9653 0.9958
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9983 0.9899
##
## $precision
## LAYING SITTING STANDING WALKING
## 0.9909 0.9054 0.8631 0.9788
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9884 0.9423
pred=predict(model_lda,newdata = X_pca_test)$class
perform_metric(X_pca_test$at,pred,nclass) # test
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 545 0 0 0 0
## SITTING 0 413 95 0 0
## STANDING 0 32 524 0 0
## WALKING 0 0 0 488 4
## WALKING_DOWNSTAIRS 0 0 0 10 369
## WALKING_UPSTAIRS 0 0 0 38 0
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 0
## STANDING 0
## WALKING 4
## WALKING_DOWNSTAIRS 41
## WALKING_UPSTAIRS 433
##
## $accuracy
## [1] 0.9252
##
## $Kappa_statistic
## [1] 0.9101
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.8130 0.9424 0.9839
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.8786 0.9193
##
## $specificity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9871 0.9611 0.9808
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9984 0.9822
##
## $precision
## LAYING SITTING STANDING WALKING
## 1.0000 0.9281 0.8465 0.9104
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9893 0.9059
start_time=Sys.time()
model_qda=qda(at~.,data=X_pca)
end_time=Sys.time()
(t_train_pca_qda=end_time-start_time)
## Time difference of 0.1606681 secs
pred=predict(model_qda,newdata = X_pca)$class
perform_metric(X_pca$at,pred,nclass) # Training
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 1413 0 0 0 0
## SITTING 1 1169 123 0 0
## STANDING 0 34 1388 0 0
## WALKING 0 0 0 1221 3
## WALKING_DOWNSTAIRS 0 0 0 0 987
## WALKING_UPSTAIRS 0 0 0 0 1
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 0
## STANDING 1
## WALKING 2
## WALKING_DOWNSTAIRS 0
## WALKING_UPSTAIRS 1072
##
## $accuracy
## [1] 0.9777
##
## $Kappa_statistic
## [1] 0.9732
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9041 0.9754 0.9959
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 1.0000 0.9991
##
## $specificity
## LAYING SITTING STANDING WALKING
## 0.9998 0.9944 0.9795 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9994 0.9995
##
## $precision
## LAYING SITTING STANDING WALKING
## 0.9993 0.9717 0.9186 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9960 0.9972
pred=predict(model_qda,newdata = X_pca_test)$class
perform_metric(X_pca_test$at,pred,nclass) #test
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 545 0 0 0 0
## SITTING 1 433 74 0 0
## STANDING 0 25 529 0 0
## WALKING 0 0 0 465 23
## WALKING_DOWNSTAIRS 0 0 0 0 399
## WALKING_UPSTAIRS 0 0 0 6 2
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 0
## STANDING 2
## WALKING 8
## WALKING_DOWNSTAIRS 21
## WALKING_UPSTAIRS 463
##
## $accuracy
## [1] 0.9459
##
## $Kappa_statistic
## [1] 0.935
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.8524 0.9514 0.9375
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9500 0.9830
##
## $specificity
## LAYING SITTING STANDING WALKING
## 0.9996 0.9900 0.9697 0.9976
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9903 0.9877
##
## $precision
## LAYING SITTING STANDING WALKING
## 0.9982 0.9454 0.8773 0.9873
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9410 0.9372
kfold=5 # 5-fold cv
nn=30 # maximum number of nearest neighbors
y_train=factor(X_train$at)
# cv to get the best k
start_time=Sys.time()
acc_mat=kfold_knn_acc(X_pca,y_train,kfold,nn,nf+1)
end_time=Sys.time()
(t_train_pca_knn=end_time-start_time)
## Time difference of 3.081214 mins
mean_acc=apply(acc_mat,2,mean)
std_acc=apply(acc_mat, 2, sd)
plot_acc_knn=data.frame(nn=c(2:nn),mean_acc,std_acc)
k_opt=which.max(mean_acc)+1
ggplot(data=plot_acc_knn,aes(x=nn,y=mean_acc,ymin=mean_acc-std_acc,ymax=mean_acc+std_acc)) +
geom_line() + geom_point() + geom_errorbar() +
geom_vline(xintercept = k_opt,color="red",linetype=2) +
xlab("number of nearest neighbors") +
ylab("accuracy")
# k=3 is optimal
# Training performance
pred_knn=knn(X_pca[,1:(nf+1)],X_pca[,1:(nf+1)],y_train,k=k_opt)
perform_metric(y_train,pred_knn,nclass)
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 1413 0 0 0 0
## SITTING 0 1258 35 0 0
## STANDING 0 42 1381 0 0
## WALKING 0 0 0 1226 0
## WALKING_DOWNSTAIRS 0 0 0 0 985
## WALKING_UPSTAIRS 0 0 0 0 0
## prediction
## truth WALKING_UPSTAIRS
## LAYING 0
## SITTING 0
## STANDING 0
## WALKING 0
## WALKING_DOWNSTAIRS 2
## WALKING_UPSTAIRS 1073
##
## $accuracy
## [1] 0.9893
##
## $Kappa_statistic
## [1] 0.9872
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9729 0.9705 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9980 1.0000
##
## $specificity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9931 0.9942 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 1.0000 0.9997
##
## $precision
## LAYING SITTING STANDING WALKING
## 1.0000 0.9677 0.9753 1.0000
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 1.0000 0.9981
# Test performance
y_test=factor(X_pca_test$at)
pred_knn=knn(X_pca[,1:(nf+1)],X_pca_test[,1:(nf+1)],y_train,k=k_opt)
perform_metric(y_test,pred_knn,nclass)
## $confusion_matrix
## prediction
## truth LAYING SITTING STANDING WALKING WALKING_DOWNSTAIRS
## LAYING 539 4 0 1 0
## SITTING 0 409 94 0 0
## STANDING 0 63 491 0 0
## WALKING 0 8 0 478 8
## WALKING_DOWNSTAIRS 0 4 0 44 316
## WALKING_UPSTAIRS 0 2 0 39 24
## prediction
## truth WALKING_UPSTAIRS
## LAYING 1
## SITTING 5
## STANDING 2
## WALKING 2
## WALKING_DOWNSTAIRS 56
## WALKING_UPSTAIRS 406
##
## $accuracy
## [1] 0.8808
##
## $Kappa_statistic
## [1] 0.8566
##
## $sensitivity
## LAYING SITTING STANDING WALKING
## 0.9890 0.8051 0.8831 0.9637
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.7524 0.8620
##
## $specificity
## LAYING SITTING STANDING WALKING
## 1.0000 0.9674 0.9615 0.9664
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9876 0.9739
##
## $precision
## LAYING SITTING STANDING WALKING
## 1.0000 0.8347 0.8393 0.8505
## WALKING_DOWNSTAIRS WALKING_UPSTAIRS
## 0.9080 0.8602
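# The recorded training times can be gathered for a quick side-by-side
# comparison; a minimal sketch using the timing objects created above
# (converted to seconds so the units match):
times=c(lda=as.numeric(t_train_lda,units="secs"),
        pca_lda=as.numeric(t_train_pca_lda,units="secs"),
        pca_qda=as.numeric(t_train_pca_qda,units="secs"),
        pca_knn_cv=as.numeric(t_train_pca_knn,units="secs"))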
Jorge-L. Reyes-Ortiz, Luca Oneto, Albert Samà, Xavier Parra, Davide Anguita. Transition-Aware Human Activity Recognition Using Smartphones. Neurocomputing, 2015.