Download the training and testing data, read them into `data` and `test`, and load the required packages.
# Fetch the two course CSV files into the working directory and read them.
# `data` is the set that carries the `classe` outcome; `test` is the set that
# is only predicted on later.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# strings to factors by default, while the rest of this analysis (and the
# recorded str() output) treats `classe` as a factor.
download.file(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
  destfile = "training.csv"
)
data <- read.csv("training.csv", stringsAsFactors = TRUE)
download.file(
  "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
  destfile = "testing.csv"
)
test <- read.csv("testing.csv", stringsAsFactors = TRUE)
library(caret);library(ggplot2);
## Loading required package: lattice
## Loading required package: ggplot2
library(AppliedPredictiveModeling);library(pgmm)
library(ElemStatLearn);library(rpart)
First, have a look at the data and omit the variables that are not relevant to the question that needs to be answered. Keep only the data from the accelerometers on the belt, forearm, arm, and dumbbell for the model.
# Open the data viewer only when someone is at the console; View() needs an
# interactive session and has no use when the script is run in batch.
if (interactive()) {
  View(test)
}
# Number of observations for each level of the outcome `classe`.
table(data$classe)
##
## A B C D E
## 5580 3797 3422 3216 3607
# Keep the columns whose name contains "accel" or "classe".
dat <- data[, grepl("accel|classe", names(data))]
# Drop every variable whose share of missing values is 90% or more.
dat <- dat[, colMeans(is.na(dat)) < 0.9]
This gives the data set `dat`,
with 16 features for building our machine learning algorithm. Split `dat` into a training and a testing data set. The training data set is used for building the model, and the testing data set is used for validation.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Fix the RNG seed so the random partition can be reproduced.
set.seed(1234)
# Row indices for the training share (p = 0.7) chosen by createDataPartition().
inTrain <- createDataPartition(y = data$classe, p = 0.7, list = FALSE)
training <- dat[inTrain, ]
testing <- dat[-inTrain, ]
# Print the dimensions of both partitions.
dim(training)
dim(testing)
## [1] 13737 17
## [1] 5885 17
# Show the column names and types of the training partition.
str(training)
## 'data.frame': 13737 obs. of 17 variables:
## $ total_accel_belt : int 3 3 3 3 3 3 3 3 3 3 ...
## $ accel_belt_x : int -22 -20 -22 -21 -21 -22 -22 -20 -21 -21 ...
## $ accel_belt_y : int 4 5 3 2 4 3 4 2 4 2 ...
## $ accel_belt_z : int 22 23 21 24 21 21 21 24 22 23 ...
## $ total_accel_arm : int 34 34 34 34 34 34 34 34 34 34 ...
## $ accel_arm_x : int -290 -289 -289 -289 -289 -289 -289 -288 -288 -290 ...
## $ accel_arm_y : int 110 110 111 111 111 111 111 109 110 110 ...
## $ accel_arm_z : int -125 -126 -123 -123 -122 -125 -124 -122 -124 -123 ...
## $ total_accel_dumbbell: int 37 37 37 37 37 37 37 37 37 37 ...
## $ accel_dumbbell_x : int -233 -232 -232 -233 -234 -232 -234 -232 -235 -233 ...
## $ accel_dumbbell_y : int 47 46 48 48 48 47 46 47 48 47 ...
## $ accel_dumbbell_z : int -269 -270 -269 -270 -269 -270 -272 -269 -270 -269 ...
## $ total_accel_forearm : int 36 36 36 36 36 36 36 36 36 36 ...
## $ accel_forearm_x : int 192 196 189 189 193 195 193 193 190 193 ...
## $ accel_forearm_y : int 203 204 206 206 203 205 205 204 205 205 ...
## $ accel_forearm_z : int -216 -213 -214 -214 -215 -215 -213 -214 -215 -214 ...
## $ classe : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
Next we try to fit the data with a linear classification model (linear discriminant analysis). Because the first goal is statistical and the second goal is data compression, we first inspect the training predictors to see whether any features are highly correlated.
# Absolute pairwise correlations between the predictors. The outcome column
# is excluded by name instead of by the hard-coded position 17.
M <- abs(cor(training[, names(training) != "classe"]))
# Zero the diagonal so a variable's correlation with itself is not reported.
diag(M) <- 0
# Row/column indices of pairs whose absolute correlation exceeds 0.8.
# TRUE is written out because `T` is an ordinary variable that can be reassigned.
which(M > 0.8, arr.ind = TRUE)
## row col
## accel_belt_y 3 1
## accel_belt_z 4 1
## total_accel_belt 1 3
## accel_belt_z 4 3
## total_accel_belt 1 4
## accel_belt_y 3 4
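Linear discriminant analysis on standardized predictors, with two of the highly correlated belt features (accel_belt_y and accel_belt_z) removed: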
# Column positions of accel_belt_y and accel_belt_z, the two belt features
# left out of this fit because of their high correlation.
v <- c(3, 4)
# LDA on the remaining columns, with predictors centred and scaled.
modFit <- train(
  classe ~ .,
  method = "lda",
  data = training[, -v],
  preProcess = c("center", "scale")
)
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`; the call previously passed them the other way round.
# NOTE: the output recorded below was produced by that earlier call, so its
# table is transposed and its per-class statistics differ from what this call
# prints. Overall accuracy and kappa are the same either way.
confusionMatrix(data = predict(modFit, testing), reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1144 34 80 289 127
## B 296 402 136 105 200
## C 519 17 268 99 123
## D 172 66 88 581 57
## E 269 257 126 158 272
##
## Overall Statistics
##
## Accuracy : 0.4532
## 95% CI : (0.4404, 0.466)
## No Information Rate : 0.4078
## P-Value [Acc > NIR] : 1e-12
##
## Kappa : 0.2982
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4767 0.51804 0.38395 0.47159 0.34917
## Specificity 0.8479 0.85574 0.85387 0.91769 0.84136
## Pos Pred Value 0.6834 0.35294 0.26121 0.60270 0.25139
## Neg Pred Value 0.7017 0.92120 0.91150 0.86771 0.89444
## Prevalence 0.4078 0.13186 0.11861 0.20935 0.13237
## Detection Rate 0.1944 0.06831 0.04554 0.09873 0.04622
## Detection Prevalence 0.2845 0.19354 0.17434 0.16381 0.18386
## Balanced Accuracy 0.6623 0.68689 0.61891 0.69464 0.59526
Three features (accel_belt_y, accel_belt_z and total_accel_belt) are pairwise correlated with an absolute correlation greater than 0.8, so we use PCA to preprocess the data.
library(MASS);library(dplyr); library(klaR)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# First method: LDA with PCA preprocessing.
# The formula uses the bare column name `classe`, matching the other train()
# calls in this analysis, so it does not depend on the global `training`.
modelFit <- train(
  classe ~ .,
  method = "lda",
  preProcess = "pca",
  data = training
)
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`; the call previously passed them the other way round.
# NOTE: the output recorded below was produced by that earlier call, so its
# table is transposed and its per-class statistics differ from what this call
# prints. Overall accuracy and kappa are the same either way.
confusionMatrix(data = predict(modelFit, testing), reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1137 71 90 252 124
## B 333 430 94 145 137
## C 590 20 173 135 108
## D 210 138 57 464 95
## E 315 280 106 209 172
##
## Overall Statistics
##
## Accuracy : 0.4037
## 95% CI : (0.3912, 0.4164)
## No Information Rate : 0.4393
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.231
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4398 0.45793 0.33269 0.38506 0.27044
## Specificity 0.8373 0.85665 0.84101 0.89316 0.82663
## Pos Pred Value 0.6792 0.37752 0.16862 0.48133 0.15896
## Neg Pred Value 0.6561 0.89275 0.92859 0.84942 0.90339
## Prevalence 0.4393 0.15956 0.08836 0.20476 0.10807
## Detection Rate 0.1932 0.07307 0.02940 0.07884 0.02923
## Detection Prevalence 0.2845 0.19354 0.17434 0.16381 0.18386
## Balanced Accuracy 0.6386 0.65729 0.58685 0.63911 0.54854
# Second method: LDA after centring and scaling every predictor.
modFit <- train(
  classe ~ .,
  data = training,
  method = "lda",
  preProcess = c("center", "scale")
)
# Display the fitted model stored inside the caret object.
print(modFit$finalModel)
## Call:
## lda(x, grouping = y)
##
## Prior probabilities of groups:
## A B C D E
## 0.2843416 0.1934920 0.1744195 0.1639368 0.1838101
##
## Group means:
## total_accel_belt accel_belt_x accel_belt_y accel_belt_z total_accel_arm
## A -0.06965492 -0.006122185 -0.03183476 0.085509988 0.17437310
## B -0.03809889 0.013037774 0.04706354 0.008619362 0.11172235
## C -0.03271509 0.065898768 0.01176845 0.032788501 -0.12324089
## D -0.01786326 -0.105889317 0.01350338 0.036204247 -0.20896175
## E 0.19483263 0.027654710 -0.02350699 -0.204754649 -0.08403655
## accel_arm_x accel_arm_y accel_arm_z total_accel_dumbbell
## A -0.38510254 0.11868773 -0.02964544 0.08021788
## B 0.09963998 -0.06204155 -0.20281309 0.06630115
## C -0.10009904 0.09582424 0.13522140 -0.05890471
## D 0.40062166 -0.06155929 0.17899655 -0.23829724
## E 0.22851675 -0.15431731 -0.02860136 0.07454321
## accel_dumbbell_x accel_dumbbell_y accel_dumbbell_z total_accel_forearm
## A -0.30988770 -0.006703601 -0.16500854 -0.25690920
## B 0.40798687 0.211458802 0.20715678 0.06002746
## C -0.18602269 -0.250614402 -0.14652540 0.03245429
## D 0.09082372 -0.003706652 0.05165772 0.14913911
## E 0.14541290 0.028889607 0.13015537 0.17042082
## accel_forearm_x accel_forearm_y accel_forearm_z
## A 0.32163398 0.03325904 -0.01038370
## B -0.07252678 -0.13909724 0.07018928
## C 0.08801118 0.24659870 -0.05666123
## D -0.51897877 -0.06382153 0.05069642
## E -0.04184584 -0.08210446 -0.04927224
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4
## total_accel_belt 0.33643907 -0.19452394 -0.81556485 0.1094395
## accel_belt_x -0.60718736 0.01437296 0.55992645 0.5726786
## accel_belt_y -2.58683312 -0.81915157 2.84419380 0.7198351
## accel_belt_z -2.52316183 -2.20011072 1.52705422 0.3953662
## total_accel_arm -0.10941127 -0.30662862 -0.06781856 -0.5387922
## accel_arm_x 0.45764042 -0.66733642 -0.18665373 0.1126292
## accel_arm_y 0.32446086 0.42231819 0.50715994 1.0083296
## accel_arm_z -0.25204304 -0.97391962 -0.62833121 -0.8171840
## total_accel_dumbbell 0.39011444 0.67761660 0.32506709 1.1102486
## accel_dumbbell_x 0.54850551 0.16324268 0.78317992 1.0078562
## accel_dumbbell_y 0.40337209 -0.26744001 0.42977886 -1.6373006
## accel_dumbbell_z 0.27383730 0.46129095 0.18638483 -0.6176547
## total_accel_forearm 0.23299623 0.04948437 -0.06503092 0.1613019
## accel_forearm_x -0.26378437 0.82822922 -0.18123416 -0.2770226
## accel_forearm_y -0.26522711 -0.38370696 -0.01540071 0.3205066
## accel_forearm_z -0.06484333 -0.16726101 -0.06372807 -0.4019544
##
## Proportion of trace:
## LD1 LD2 LD3 LD4
## 0.4534 0.2972 0.1858 0.0637
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`; the call previously passed them the other way round.
# NOTE: the output recorded below was produced by that earlier call, so its
# table is transposed and its per-class statistics differ from what this call
# prints. Overall accuracy and kappa are the same either way.
confusionMatrix(data = predict(modFit, testing), reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1177 69 100 289 39
## B 329 491 175 104 40
## C 537 86 289 102 12
## D 168 57 62 602 75
## E 156 220 86 153 467
##
## Overall Statistics
##
## Accuracy : 0.5142
## 95% CI : (0.5013, 0.527)
## No Information Rate : 0.4022
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3768
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4973 0.53196 0.40590 0.4816 0.73776
## Specificity 0.8587 0.86941 0.85753 0.9219 0.88290
## Pos Pred Value 0.7031 0.43108 0.28168 0.6245 0.43161
## Neg Pred Value 0.7174 0.90898 0.91295 0.8683 0.96544
## Prevalence 0.4022 0.15684 0.12099 0.2124 0.10756
## Detection Rate 0.2000 0.08343 0.04911 0.1023 0.07935
## Detection Prevalence 0.2845 0.19354 0.17434 0.1638 0.18386
## Balanced Accuracy 0.6780 0.70068 0.63171 0.7017 0.81033
# # using naive Bayes model
# modnb = train(classe ~ ., data=training,method="nb")
# pnb = predict(modnb,testing)
# confusionMatrix(testing$classe,pnb)
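It seems that the naive Bayes model performs best on prediction, so we use this model to predict and write the results to text files. (Note: the naive Bayes code above is commented out, and the predictions written out below come from `modFit`, the LDA model fitted on standardized predictors.)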
# Predictions for `test` from the standardised LDA fit (modFit).
answers2 <- predict(modFit, test)
## Loading required package: MASS
# Predictions for `test` from the PCA + LDA fit (modelFit).
answers3 <- predict(modelFit, test)
# Cross-tabulate the two prediction vectors to see where they agree.
table(answers3, answers2)
## answers2
## answers3 A B C D E
## A 8 0 1 0 1
## B 1 3 0 0 0
## C 1 0 0 0 1
## D 0 2 0 1 0
## E 0 0 0 0 1
# Write each element of `x` to its own text file in the working directory.
#
# x: a vector of answers; element i is written to "problem_id_<i>.txt"
#    without quotes, row names or column names.
# Returns NULL invisibly; the function is called for its side effect.
pml_write_files <- function(x) {
  # seq_along() gives an empty sequence for zero-length input, whereas
  # 1:length(x) would iterate over c(1, 0) and write two bogus files.
  for (i in seq_along(x)) {
    filename <- paste0("problem_id_", i, ".txt")
    write.table(
      x[i],
      file = filename,
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }
  invisible(NULL)
}
# Write the predictions held in answers2 to problem_id_<i>.txt, one per file.
pml_write_files(answers2)
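The accuracy of the models listed above is not high enough, so we combine different predictors to improve the accuracy. Here the PCA + LDA model is refitted on further random partitions of the data and the resulting predictions are compared.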
library(caret)
# Draw a new random partition (different seed) and refit PCA + LDA on it.
set.seed(123)
inTrain <- createDataPartition(y = data$classe, p = 0.7, list = FALSE)
training <- dat[inTrain, ]
testing <- dat[-inTrain, ]
# The formula uses the bare column name `classe`, matching the other train()
# calls in this analysis, so it does not depend on the global `training`.
modelFit2 <- train(
  classe ~ .,
  method = "lda",
  preProcess = "pca",
  data = training
)
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`; the call previously passed them the other way round.
# NOTE: the output recorded below was produced by that earlier call, so its
# table is transposed and its per-class statistics differ from what this call
# prints. Overall accuracy and kappa are the same either way.
confusionMatrix(data = predict(modelFit2, testing), reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1107 82 104 265 116
## B 336 435 107 131 130
## C 565 26 178 147 110
## D 215 156 55 432 106
## E 328 272 103 199 180
##
## Overall Statistics
##
## Accuracy : 0.3963
## 95% CI : (0.3837, 0.4089)
## No Information Rate : 0.4335
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2218
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4339 0.44799 0.32541 0.36797 0.28037
## Specificity 0.8299 0.85674 0.84114 0.88707 0.82796
## Pos Pred Value 0.6613 0.38191 0.17349 0.44813 0.16636
## Neg Pred Value 0.6571 0.88706 0.92406 0.84922 0.90381
## Prevalence 0.4335 0.16500 0.09295 0.19949 0.10909
## Detection Rate 0.1881 0.07392 0.03025 0.07341 0.03059
## Detection Prevalence 0.2845 0.19354 0.17434 0.16381 0.18386
## Balanced Accuracy 0.6319 0.65236 0.58328 0.62752 0.55417
# Predictions for `test` from the second PCA + LDA fit (modelFit2).
answers4 <- predict(modelFit2, test)
library(caret)
# Third random partition, drawn with yet another seed.
set.seed(13)
inTrain <- createDataPartition(y = data$classe, p = 0.7, list = FALSE)
training <- dat[inTrain, ]
testing <- dat[-inTrain, ]
# Print the dimensions of both partitions.
dim(training)
dim(testing)
## [1] 13737 17
## [1] 5885 17
# PCA + LDA refitted on the current training partition.
# The formula uses the bare column name `classe`, matching the other train()
# calls in this analysis, so it does not depend on the global `training`.
modelFit3 <- train(
  classe ~ .,
  method = "lda",
  preProcess = "pca",
  data = training
)
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`; the call previously passed them the other way round.
# NOTE: the output recorded below was produced by that earlier call, so its
# table is transposed and its per-class statistics differ from what this call
# prints. Overall accuracy and kappa are the same either way.
confusionMatrix(data = predict(modelFit3, testing), reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1110 89 102 269 104
## B 334 453 95 120 137
## C 568 31 173 147 107
## D 190 155 59 478 82
## E 317 264 120 189 192
##
## Overall Statistics
##
## Accuracy : 0.4088
## 95% CI : (0.3962, 0.4215)
## No Information Rate : 0.428
## P-Value [Acc > NIR] : 0.9986
##
## Kappa : 0.2386
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4407 0.45665 0.31512 0.39734 0.30868
## Specificity 0.8324 0.85980 0.84014 0.89620 0.83089
## Pos Pred Value 0.6631 0.39772 0.16862 0.49585 0.17745
## Neg Pred Value 0.6654 0.88643 0.92262 0.85267 0.91047
## Prevalence 0.4280 0.16856 0.09329 0.20442 0.10569
## Detection Rate 0.1886 0.07698 0.02940 0.08122 0.03263
## Detection Prevalence 0.2845 0.19354 0.17434 0.16381 0.18386
## Balanced Accuracy 0.6365 0.65823 0.57763 0.64677 0.56979
# Predictions for `test` from the third PCA + LDA fit (modelFit3).
answers5 <- predict(modelFit3, test)
# Put the four prediction vectors side by side, one row per test case.
ans <- data.frame(answers2, answers3, answers4, answers5)
print(ans)
## answers2 answers3 answers4 answers5
## 1 B D D D
## 2 A A A A
## 3 A A A A
## 4 A A A A
## 5 A A A A
## 6 E C C A
## 7 D D D D
## 8 E E E E
## 9 A A A A
## 10 A A A A
## 11 A C C C
## 12 A A A A
## 13 B B B A
## 14 A A A A
## 15 C A A A
## 16 B B B B
## 17 E A A A
## 18 A B B D
## 19 B D D D
## 20 B B B B