For the data downloaded from the web, several cleaning tasks are performed: removing near-zero-variance variables, variables with (almost) no values, and descriptive columns (e.g. user_name) that provide no information for the model. The training data is split into two subsets (75% for training and the remaining 25% for testing), and the testing data has been kept aside in order to make the final prediction. Several algorithms are tested: KNN, Random Forest and RPART. The best is Random Forest. ***
Downloading files and importing to R
# Source locations of the labelled training data and of the quiz cases that
# are scored at the end of the report.
url_train <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url_quiz <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# Download each file only when no local copy exists, then import it.
if (!file.exists("pml-training.csv")) {
  download.file(url_train, destfile = "pml-training.csv")
}
training <- read_delim(file = "pml-training.csv", delim = ",", guess_max = 20000)
# The quiz file is fetched from url_quiz; the previous code referenced an
# undefined `url_test`, which made this branch fail on a clean machine.
if (!file.exists("pml-testing.csv")) {
  download.file(url_quiz, destfile = "pml-testing.csv")
}
quiz <- read_delim(file = "pml-testing.csv", delim = ",", guess_max = 20000)
# Split the labelled data into a training subset (75%) and a hold-out testing
# subset (25%), stratified on the outcome `classe`.
# NOTE(review): no set.seed() call is visible before this point, so the
# partition (and every figure derived from it) may differ between runs.
inx <- createDataPartition(training$classe, p = 0.75, list = FALSE)
testing <- training[-inx, ]
training2 <- training[inx, ]
# Removing variables with near-zero variance.
# The filter is derived from the training subset only; guarding the empty case
# matters because `x[, -integer(0)]` would drop every column.
NZV <- nearZeroVar(training2)
if (length(NZV) > 0) {
  training2 <- training2[, -NZV]
}
# Removing variables that are mainly NA (more than 95% missing in training).
na_var <- vapply(training2, function(x) mean(is.na(x)) > 0.95, logical(1))
training2 <- training2[, !na_var]
# Keep exactly the training-derived columns in the hold-out data. Filtering the
# hold-out set on its own statistics could leave the two tables with different
# columns and break the positional indexing and predict() calls further down.
testing <- testing[, names(training2)]
10 random rows of training table
# Show ten randomly chosen rows of the cleaned training subset
sample_n(training2, size = 10)
10 random rows of testing table
# Show ten randomly chosen rows of the cleaned hold-out subset
sample_n(testing, size = 10)
6 usernames that made the exercises and samples available per user
# Number of training samples recorded per participant
table(training2[["user_name"]])
##
## adelmo carlitos charles eurico jeremy pedro
## 2914 2311 2658 2329 2558 1948
# Number of hold-out samples recorded per participant
table(testing[["user_name"]])
##
## adelmo carlitos charles eurico jeremy pedro
## 978 801 878 741 844 662
# Drop the five leading columns, which hold bookkeeping data rather than
# measurements (per the original note: X1, user_name, raw_timestamp_part_1,
# raw_timestamp_part_2, cvtd_timestamp).
# NOTE(review): the original note also listed num_window, but only five
# columns are removed here and the rpart tree printed later in this report
# still splits on num_window, so it remains a predictor -- confirm whether
# that is intended.
id_cols <- seq_len(5)
training2 <- training2[, -id_cols]
testing <- testing[, -id_cols]
Correlation among the variables
## FUNCTION: test significance of cor results
#' Pairwise correlation-test p-values for the columns of a matrix
#'
#' Runs `cor.test()` on every pair of columns of `mat` and collects the
#' p-values in a symmetric matrix whose diagonal is set to 0.
#'
#' @param mat A numeric matrix, or an object `as.matrix()` can convert, with
#'   one variable per column and one observation per row.
#' @param ... Further arguments forwarded to `cor.test()`.
#' @return An `ncol(mat)` x `ncol(mat)` matrix of p-values whose row and
#'   column names are the column names of `mat`.
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA, n, n)
  diag(p.mat) <- 0
  # With fewer than two columns there is no pair to test. The former
  # `1:(n - 1)` counted backwards for n == 1 and indexed a missing column.
  if (n > 1) {
    for (i in seq_len(n - 1)) {
      for (j in seq(i + 1, n)) {
        tmp <- cor.test(mat[, i], mat[, j], ...)
        # The test is symmetric in its two inputs, so fill both triangles.
        p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
      }
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# Correlation among the predictors: every column except the outcome `classe`
# (selected by name rather than by the hard-coded position 54).
predictors <- training2[, names(training2) != "classe"]
var_corr <- cor(predictors)
# The significance tests must run on the observations themselves. Passing the
# correlation matrix to cor.mtest() would test correlations between the rows
# of that matrix and blank the wrong cells in the plot.
p.mat <- cor.mtest(predictors)
# Lower triangle only, variables ordered by hierarchical clustering, and
# cells blanked where the p-value marks them as insignificant.
g <- ggcorrplot(var_corr, p.mat = p.mat, hc.order = TRUE, type = "lower",
                insig = "blank", tl.cex = 6)
g
<img src='PredictionAssignment_files/figure-html/fig.align=="CENTER"-1.png' width="672" />
# k-nearest neighbours fitted through caret with bootstrap resampling; no
# explicit tuning grid is supplied. The seed is fixed just before fitting.
set.seed(7)
ctrlKNN <- trainControl(method = "boot")
modelKNN <- train(
  classe ~ .,
  data = training2,
  method = "knn",
  trControl = ctrlKNN
)
# Print the model selected by the resampling procedure
modelKNN$finalModel
## 5-nearest neighbor model
## Training set outcome distribution:
##
## A B C D E
## 4185 2848 2567 2412 2706
# Score the hold-out subset with the KNN model and compare the predictions
# with the recorded classes.
predictKNN <- predict(modelKNN, newdata = testing)
confKNN <- confusionMatrix(
  data = predictKNN,
  reference = as.factor(testing$classe)
)
confKNN
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1350 40 13 8 7
## B 13 848 29 4 28
## C 13 28 787 46 18
## D 14 13 21 725 36
## E 5 20 5 21 812
##
## Overall Statistics
##
## Accuracy : 0.9221
## 95% CI : (0.9142, 0.9295)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9014
##
## Mcnemar's Test P-Value : 6.2e-06
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9677 0.8936 0.9205 0.9017 0.9012
## Specificity 0.9806 0.9813 0.9741 0.9795 0.9873
## Pos Pred Value 0.9520 0.9197 0.8823 0.8962 0.9409
## Neg Pred Value 0.9871 0.9746 0.9831 0.9807 0.9780
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2753 0.1729 0.1605 0.1478 0.1656
## Detection Prevalence 0.2892 0.1880 0.1819 0.1650 0.1760
## Balanced Accuracy 0.9742 0.9374 0.9473 0.9406 0.9442
# Random forest fitted through caret, resampled with 5-fold cross-validation
# repeated twice. The seed is fixed just before fitting.
set.seed(7)
ctrlRF <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2
)
# `verbose = FALSE` is not a train() argument here; it is forwarded to the
# underlying fit, as the printed Call of the final model shows.
modelRF <- train(classe ~ ., data = training2, method = "rf",
                 trControl = ctrlRF, verbose = FALSE)
# Print the forest selected by the resampling procedure
modelRF$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry, verbose = FALSE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 27
##
## OOB estimate of error rate: 0.2%
## Confusion matrix:
## A B C D E class.error
## A 4184 1 0 0 0 0.0002389486
## B 4 2840 4 0 0 0.0028089888
## C 0 2 2563 2 0 0.0015582392
## D 0 0 10 2401 1 0.0045605307
## E 0 1 0 4 2701 0.0018477458
# Score the hold-out subset with the random forest and compare the
# predictions with the recorded classes.
predictRF <- predict(modelRF, newdata = testing)
confMRF <- confusionMatrix(
  data = predictRF,
  reference = as.factor(testing$classe)
)
confMRF
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1394 1 0 0 0
## B 0 948 2 0 0
## C 0 0 853 2 0
## D 0 0 0 802 2
## E 1 0 0 0 899
##
## Overall Statistics
##
## Accuracy : 0.9984
## 95% CI : (0.9968, 0.9993)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9979
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9993 0.9989 0.9977 0.9975 0.9978
## Specificity 0.9997 0.9995 0.9995 0.9995 0.9998
## Pos Pred Value 0.9993 0.9979 0.9977 0.9975 0.9989
## Neg Pred Value 0.9997 0.9997 0.9995 0.9995 0.9995
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2843 0.1933 0.1739 0.1635 0.1833
## Detection Prevalence 0.2845 0.1937 0.1743 0.1639 0.1835
## Balanced Accuracy 0.9995 0.9992 0.9986 0.9985 0.9988
# Single classification tree (rpart) fitted through caret. No trControl is
# given, and nine candidate tuning values are requested via tuneLength.
set.seed(7)
modelRPART <- train(
  classe ~ .,
  data = training2,
  method = "rpart",
  tuneLength = 9
)
# Print the tree selected by the resampling procedure
modelRPART$finalModel
## n= 14718
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14718 10533 A (0.28 0.19 0.17 0.16 0.18)
## 2) roll_belt< 130.5 13493 9319 A (0.31 0.21 0.19 0.18 0.11)
## 4) pitch_forearm< -33.95 1186 8 A (0.99 0.0067 0 0 0) *
## 5) pitch_forearm>=-33.95 12307 9311 A (0.24 0.23 0.21 0.2 0.12)
## 10) num_window>=45.5 11776 8780 A (0.25 0.24 0.22 0.2 0.091)
## 20) magnet_dumbbell_y< 436.5 10022 7087 A (0.29 0.19 0.25 0.19 0.085)
## 40) roll_forearm< 123.5 6268 3638 A (0.42 0.19 0.18 0.16 0.044)
## 80) num_window< 241.5 1577 353 A (0.78 0.12 0.0013 0.069 0.029) *
## 81) num_window>=241.5 4691 3285 A (0.3 0.21 0.25 0.2 0.049)
## 162) magnet_dumbbell_z< -28.5 1392 339 A (0.76 0.18 0.018 0.039 0.0036) *
## 163) magnet_dumbbell_z>=-28.5 3299 2168 C (0.11 0.22 0.34 0.26 0.069)
## 326) accel_dumbbell_y>=-40.5 2821 1970 D (0.13 0.25 0.24 0.3 0.08)
## 652) roll_belt>=125.5 668 279 C (0 0.38 0.58 0.036 0.0045)
## 1304) pitch_belt< -42.65 260 19 B (0 0.93 0 0.062 0.012) *
## 1305) pitch_belt>=-42.65 408 19 C (0 0.027 0.95 0.02 0) *
## 653) roll_belt< 125.5 2153 1326 D (0.16 0.21 0.14 0.38 0.1) *
## 327) accel_dumbbell_y< -40.5 478 38 C (0 0.052 0.92 0.027 0) *
## 41) roll_forearm>=123.5 3754 2431 C (0.081 0.19 0.35 0.22 0.15)
## 82) magnet_dumbbell_y< 290.5 2235 1104 C (0.096 0.14 0.51 0.15 0.1) *
## 83) magnet_dumbbell_y>=290.5 1519 1020 D (0.059 0.26 0.13 0.33 0.23)
## 166) roll_dumbbell< 43.32512 290 63 B (0.066 0.78 0.017 0.083 0.052) *
## 167) roll_dumbbell>=43.32512 1229 754 D (0.058 0.14 0.15 0.39 0.27) *
## 21) magnet_dumbbell_y>=436.5 1754 806 B (0.035 0.54 0.049 0.25 0.12)
## 42) total_accel_dumbbell>=5.5 1198 335 B (0.051 0.72 0.071 0.026 0.13) *
## 43) total_accel_dumbbell< 5.5 556 146 D (0 0.15 0.0018 0.74 0.11) *
## 11) num_window< 45.5 531 105 E (0 0 0 0.2 0.8) *
## 3) roll_belt>=130.5 1225 11 E (0.009 0 0 0 0.99) *
# Draw the selected tree, then score the hold-out subset with it and compare
# the predictions with the recorded classes.
fancyRpartPlot(modelRPART$finalModel)
predictRPART <- predict(modelRPART, newdata = testing)
confRPART <- confusionMatrix(
  data = predictRPART,
  reference = as.factor(testing$classe)
)
confRPART
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1147 150 8 55 22
## B 40 467 26 25 53
## C 67 90 647 119 79
## D 138 242 174 564 183
## E 3 0 0 41 564
##
## Overall Statistics
##
## Accuracy : 0.6911
## 95% CI : (0.6779, 0.704)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6108
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8222 0.49210 0.7567 0.7015 0.6260
## Specificity 0.9330 0.96359 0.9123 0.8202 0.9890
## Pos Pred Value 0.8300 0.76432 0.6457 0.4335 0.9276
## Neg Pred Value 0.9296 0.88772 0.9467 0.9334 0.9216
## Prevalence 0.2845 0.19352 0.1743 0.1639 0.1837
## Detection Rate 0.2339 0.09523 0.1319 0.1150 0.1150
## Detection Prevalence 0.2818 0.12459 0.2043 0.2653 0.1240
## Balanced Accuracy 0.8776 0.72784 0.8345 0.7609 0.8075
Finally, I’m applying the Random Forest algorithm because it reaches an accuracy of 99.84%, compared with 69.11% for RPart and 92.21% for KNN.
So the predicted values are:
# Apply the random forest to the quiz cases that were kept aside
predictFINAL <- predict(object = modelRF, newdata = quiz)
predictFINAL
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E