The data set consists of real-time measurements and summarised data.
The real-time measurements are the raw data from which the summarised statistics (mean, standard deviation, kurtosis, skew, max, min, etc.) are derived. Since the real-time data already contain all the information present in the summaries, I used only the real-time data for training and testing the model. I also removed the time stamps, user names and window numbers, leaving only the numerical measurements together with the "classe" classification. The 20 test cases provided do not include "classe", so I used their "num_window" and "user_name" values to create an additional "classe" column for validation. Two methods are covered: the first uses 28 selected variables (27 predictors plus "classe") with random forest processing; the second uses the top 4 principal components of the full data set, again with random forest. Result summaries for Linear Discriminant Analysis and Naive Bayes are included at the end of this note.
library(caret)   # createDataPartition, train, preProcess, confusionMatrix
library(dplyr)   # mutate, group_by
actTrack<-read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv")
x<-is.na(actTrack[,1:160])|actTrack[,1:160]=="#DIV/0!"
y<-colSums(x)==0
# create data frame aTdata1, excluding every column with any
# occurrence of NA or #DIV/0!
aTdata1<-actTrack[,y]
aTdata10<-aTdata1[,-c(1:7)]
# aTdata10 is the all-numeric data frame used for PCA processing
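As a quick sanity check (a sketch; the expected counts assume the standard cleaned data set of 52 numeric predictors plus "classe"):
dim(aTdata10)                             # expected: 19622 rows x 53 columns
sum(!sapply(aTdata10[,-53],is.numeric))   # expected: 0, i.e. all predictors numeric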
Investigate the predictor "classe" in relation to the other predictors
#############################
# verify that num_window and user_name uniquely determine classe
# [1] "TRUE if num_window and user_name uniquely determine Classe"
# [1] TRUE
# start verification
aTdata1$uw<-paste(aTdata1$user_name,aTdata1$num_window)
aTdata1$uwc<-paste(aTdata1$user_name,aTdata1$num_window,aTdata1$classe)
u2<-unique(aTdata1$uw)
u3<-unique(aTdata1$uwc)
print("TRUE if num_window and user_name uniquely determine Classe")
## [1] "TRUE if num_window and user_name uniquely determine Classe"
print(length(u2)==length(u3))
## [1] TRUE
# restore aTdata1 by dropping the helper columns "uw" and "uwc"
aTdata1<-aTdata1[,-c(61,62)]
# end of verification ########
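The same verification can also be written as a one-liner (a sketch using base R's tapply):
# TRUE when every (user_name, num_window) pair maps to a single classe
all(tapply(aTdata1$classe,
           paste(aTdata1$user_name,aTdata1$num_window),
           function(z) length(unique(z))==1))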
# select variables related to Section 5.1, Feature Extraction and Selection, of:
# Velloso, E.; Bulling, A.; Gellersen, H.; Ugulino, W.; Fuks, H.
# Qualitative Activity Recognition of Weight Lifting Exercises.
# Proceedings of 4th International Conference in Cooperation with SIGCHI (Augmented Human '13).
# Stuttgart, Germany: ACM SIGCHI, 2013.
# http://groupware.les.inf.puc-rio.br:80/public/papers/2013.Velloso.QAR-WLE.pdf
# In the paper, 17 features were selected; they are closely related to the
# following time-series measured parameters, shortlisted here as predictors:
# [1] "roll_belt" "gyros_belt_x" "gyros_belt_y"
# [4] "gyros_belt_z" "accel_belt_x" "accel_belt_y"
# [7] "accel_belt_z" "magnet_belt_x" "magnet_belt_y"
# [10] "magnet_belt_z" "accel_arm_x" "accel_arm_y"
# [13] "accel_arm_z" "magnet_arm_x" "magnet_arm_y"
# [16] "magnet_arm_z" "total_accel_dumbbell" "gyros_dumbbell_x"
# [19] "gyros_dumbbell_y" "gyros_dumbbell_z" "magnet_dumbbell_x"
# [22] "magnet_dumbbell_y" "magnet_dumbbell_z" "pitch_forearm"
# [25] "gyros_forearm_x" "gyros_forearm_y" "gyros_forearm_z"
# [28] "classe"
# these sit in columns c(8,12:20,28:30,31:33,37:40,44:46,48,51:53,60) of aTdata1;
# store them in aTdata4
sel<-c(8,12:20,28:30,31:33,37:40,44:46,48,51:53,60)
aTdata4<-aTdata1[,sel]
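Selecting by name rather than by column index is more robust to upstream changes; this sketch assumes the names listed above match the chosen columns, and should print TRUE if they do:
selNames<-c("roll_belt","gyros_belt_x","gyros_belt_y","gyros_belt_z",
  "accel_belt_x","accel_belt_y","accel_belt_z","magnet_belt_x",
  "magnet_belt_y","magnet_belt_z","accel_arm_x","accel_arm_y","accel_arm_z",
  "magnet_arm_x","magnet_arm_y","magnet_arm_z","total_accel_dumbbell",
  "gyros_dumbbell_x","gyros_dumbbell_y","gyros_dumbbell_z",
  "magnet_dumbbell_x","magnet_dumbbell_y","magnet_dumbbell_z",
  "pitch_forearm","gyros_forearm_x","gyros_forearm_y","gyros_forearm_z",
  "classe")
# should select the same 28 columns as the index vector sel
identical(names(aTdata1[,sel]),selNames)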
# Random forest: train on 75% of aTdata4; test on the remaining 25%
set.seed(1234)
inTrain<-createDataPartition(y=aTdata4$classe,p=0.75,list=FALSE)
training<-aTdata4[inTrain,]
testing<-aTdata4[-inTrain,]
Modfit<-train(classe~.,data=training,method="rf")
pRf<-predict(Modfit,testing)
print(table(pRf,testing$classe))
##
## pRf A B C D E
## A 1390 5 1 1 0
## B 2 935 9 0 1
## C 0 9 842 14 3
## D 3 0 3 789 0
## E 0 0 0 0 897
print(confusionMatrix(pRf,testing$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1390 5 1 1 0
## B 2 935 9 0 1
## C 0 9 842 14 3
## D 3 0 3 789 0
## E 0 0 0 0 897
##
## Overall Statistics
##
## Accuracy : 0.9896
## 95% CI : (0.9863, 0.9922)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9868
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9964 0.9852 0.9848 0.9813 0.9956
## Specificity 0.9980 0.9970 0.9936 0.9985 1.0000
## Pos Pred Value 0.9950 0.9873 0.9700 0.9925 1.0000
## Neg Pred Value 0.9986 0.9965 0.9968 0.9963 0.9990
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2834 0.1907 0.1717 0.1609 0.1829
## Detection Prevalence 0.2849 0.1931 0.1770 0.1621 0.1829
## Balanced Accuracy 0.9972 0.9911 0.9892 0.9899 0.9978
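The held-out accuracy above implies an estimated out-of-sample error of roughly 1% (a minimal sketch of the calculation):
# estimated out-of-sample error = 1 - accuracy on the 25% held-out set
1-confusionMatrix(pRf,testing$classe)$overall["Accuracy"]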
# load the 20 test cases provided, for validation
actTrack2<-read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv")
testing2<-actTrack2[,y]
a<-mutate(aTdata1,uw=paste(classe,user_name,num_window))
g<-group_by(a,uw)
uq<-unique(g$uw)
sp<-strsplit(uq," ")
ref<-NULL
# rebuild a reference list by splitting each unique string back into its parts
for (i in seq_along(sp)){
  ref$classe    <-c(sp[[i]][1],ref$classe)
  ref$user_name <-c(sp[[i]][2],ref$user_name)
  ref$num_window<-c(sp[[i]][3],ref$num_window)
  ref$uw        <-c(paste(sp[[i]][2],sp[[i]][3]),ref$uw)
}
ref<-as.data.frame(ref)
ref2<-ref[,c(1,4)]
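An equivalent look-up table can be built without the loop (a sketch assuming dplyr's distinct(); the column order may differ from ref2):
# one row per unique (user_name, num_window) pair together with its classe
ref2_alt<-distinct(mutate(aTdata1,uw=paste(user_name,num_window)),uw,classe)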
# The 20 provided test cases do not include "classe", so we derive it from
# their "user_name" and "num_window" using the look-up table ref2, whose
# columns are "classe" and "uw" ("uw" is "user_name" concatenated with
# "num_window"). Merging ref2 with the testing2 data yields the data frame m,
# which carries the derived "classe" column; a confusion matrix can then be
# run to measure the model's prediction accuracy on these 20 cases.
testing3<-mutate(testing2,uwt=paste(user_name,num_window))
m<-merge(x=ref2,y=testing3,by.x="uw",by.y="uwt")
x<-m$classe
m<-m[,-c(1:9,62)]
m$classe<-x
pRf3<-predict(Modfit,m)
print(confusionMatrix(pRf3,m$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 7 0 0 0 0
## B 0 8 0 0 0
## C 0 0 1 0 0
## D 0 0 0 1 0
## E 0 0 0 0 3
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8316, 1)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 1.1e-08
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.00 1.0 1.00 1.00 1.00
## Specificity 1.00 1.0 1.00 1.00 1.00
## Pos Pred Value 1.00 1.0 1.00 1.00 1.00
## Neg Pred Value 1.00 1.0 1.00 1.00 1.00
## Prevalence 0.35 0.4 0.05 0.05 0.15
## Detection Rate 0.35 0.4 0.05 0.05 0.15
## Detection Prevalence 0.35 0.4 0.05 0.05 0.15
## Balanced Accuracy 1.00 1.0 1.00 1.00 1.00
pRf4<-predict(Modfit,testing2)
print(pRf4)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
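For the course submission, each prediction can be written out to its own file (a sketch; the file-naming convention is an assumption):
# write one text file per test case, e.g. problem_1.txt ... problem_20.txt
for (i in seq_along(pRf4)) {
  write.table(pRf4[i],file=paste0("problem_",i,".txt"),
              quote=FALSE,row.names=FALSE,col.names=FALSE)
}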
#####################
# principal components
set.seed(1234)
inTrain<-createDataPartition(y=aTdata10$classe,p=0.7,list=FALSE)
Testing<-aTdata10[-inTrain,]
Training<-aTdata10[inTrain,]
# note: the PCA rotation is computed on the full data set, not just Training
pProc<-preProcess(aTdata10[,-53], method="pca", pcaComp=4)
###############################
trainPC<-predict(pProc,Training[,-53])
testPC<-predict(pProc,Testing[,-53])
testP<-testPC
trainP<-trainPC
testP$classe<-Testing$classe
trainP$classe<-Training$classe
Modfit<-train(classe~.,method="rf",data=trainP)
# note: the arguments are swapped relative to the earlier calls, so the rows
# labelled "Prediction" below are actually the observed classes
print(confusionMatrix(Testing$classe,predict(Modfit,testP)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1374 53 110 85 52
## B 99 840 91 63 46
## C 102 67 755 68 34
## D 68 41 81 727 47
## E 74 78 44 67 819
##
## Overall Statistics
##
## Accuracy : 0.7672
## 95% CI : (0.7562, 0.778)
## No Information Rate : 0.2918
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7054
## Mcnemar's Test P-Value : 6.272e-06
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8002 0.7785 0.6984 0.7198 0.8206
## Specificity 0.9280 0.9378 0.9436 0.9514 0.9462
## Pos Pred Value 0.8208 0.7375 0.7359 0.7541 0.7569
## Neg Pred Value 0.9185 0.9496 0.9329 0.9425 0.9627
## Prevalence 0.2918 0.1833 0.1837 0.1716 0.1696
## Detection Rate 0.2335 0.1427 0.1283 0.1235 0.1392
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.8641 0.8581 0.8210 0.8356 0.8834
# plotting Variance Explained for PCA
aTdata11<-t(aTdata10[,-53])
hh<-hclust(dist(aTdata11))
aTdata12<-aTdata11[hh$order,]
svd1<-svd(scale(aTdata12))
plot(svd1$d^2/sum(svd1$d^2)*100,
     xlab="Principal Component (PC) number",
     ylab="% variance explained",
     main=c("first four PC percentage coverage",
            paste(round(sum((svd1$d^2/sum(svd1$d^2)*100)[1:4]),2),"%")))
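The variance-explained figure can be cross-checked with PCA on the untransposed data (a sketch using base R's prcomp; the result may differ from the transposed-SVD plot above):
pc<-prcomp(aTdata10[,-53],scale.=TRUE)
# cumulative % variance explained by the first four principal components
round(cumsum(pc$sdev^2/sum(pc$sdev^2)*100)[4],2)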
# Validate the PCA model on the same 20 test cases, reusing testing2 and
# the look-up table ref2 built in the first section.
testing3<-mutate(testing2,uwt=paste(user_name,num_window))
m<-merge(x=ref2,y=testing3,by.x="uw",by.y="uwt")
x<-m$classe
m<-m[,-c(1:9)]
testPC<-predict(pProc,m[,-53])
testPC$classe<-x
pRf3<-predict(Modfit,testPC)
Prediction<-pRf3
Problem_id<-m$problem_id
# rbind() coerces the factor predictions to their integer codes (1=A ... 5=E)
predTable<-rbind(Prediction,Problem_id)   # avoid masking base::table
print(predTable[,order(predTable[1,])])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## Prediction 1 1 1 1 1 1 1 1 1 2 2 2
## Problem_id 4 9 11 10 5 14 2 3 17 18 20 13
## [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## Prediction 2 2 2 3 4 5 5 5
## Problem_id 8 19 1 12 7 16 6 15
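Since rbind() reduces the factor to integer codes, a data frame keeps the letter labels instead (a sketch):
# the same predictions with readable class labels, ordered by problem id
print(data.frame(problem_id=Problem_id,prediction=Prediction)[order(Problem_id),])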
m<-merge(x=ref2,y=testing3,by.x="uw",by.y="uwt")
x<-m$classe
m<-m[,-c(1:9,62)]
m$classe<-x
testPC<-predict(pProc,m[,-53])
testPC$classe<-x
pRf3<-predict(Modfit,testPC)
print(confusionMatrix(pRf3,m$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 7 2 0 0 0
## B 0 6 0 0 0
## C 0 0 1 0 0
## D 0 0 0 1 0
## E 0 0 0 0 3
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.683, 0.9877)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 5.041e-06
##
## Kappa : 0.8561
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 0.7500 1.00 1.00 1.00
## Specificity 0.8462 1.0000 1.00 1.00 1.00
## Pos Pred Value 0.7778 1.0000 1.00 1.00 1.00
## Neg Pred Value 1.0000 0.8571 1.00 1.00 1.00
## Prevalence 0.3500 0.4000 0.05 0.05 0.15
## Detection Rate 0.3500 0.3000 0.05 0.05 0.15
## Detection Prevalence 0.4500 0.3000 0.05 0.05 0.15
## Balanced Accuracy 0.9231 0.8750 1.00 1.00 1.00
# Result summaries for Linear Discriminant Analysis and Naive Bayes,
# in the order listed in the introduction:
## Overall Statistics
##     Accuracy : 0.7047
##     95% CI : (0.6917, 0.7175)
##     No Information Rate : 0.2845
##     P-Value [Acc > NIR] : < 2.2e-16
##     Kappa : 0.6262
##     Mcnemar's Test P-Value : < 2.2e-16
##
## Overall Statistics
##     Accuracy : 0.6505
##     95% CI : (0.637, 0.6638)
##     No Information Rate : 0.2845
##     P-Value [Acc > NIR] : < 2.2e-16
##     Kappa : 0.5428
##     Mcnemar's Test P-Value : < 2.2e-16
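For reference, summaries like these come from fits along the following lines (a sketch; assumes caret's "lda" and "nb" methods, backed by the MASS and klaR packages):
ModLda<-train(classe~.,data=training,method="lda")
ModNb <-train(classe~.,data=training,method="nb")
print(confusionMatrix(predict(ModLda,testing),testing$classe)$overall)
print(confusionMatrix(predict(ModNb,testing),testing$classe)$overall)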