library(dplyr)
library(caret)
training <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv")
test <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv")
# drop identifier and raw-timestamp columns, which carry no predictive signal
drop.cols <- c("X", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2")
training <- training[, !(names(training) %in% drop.cols)]
test <- test[, !(names(test) %in% drop.cols)]
The first step of the process is data cleaning. A helper function is built to compute the proportion of missing values in each column of the data set.
na.percent <- function(x) {
  # proportion of missing values in a vector
  sum(is.na(x)) / length(x)
}
x <- sapply(training, na.percent)  # NA proportion for each column
x <- names(x[x > 0])               # keep names of columns with any NAs
x
## [1] "max_roll_belt" "max_picth_belt"
## [3] "min_roll_belt" "min_pitch_belt"
## [5] "amplitude_roll_belt" "amplitude_pitch_belt"
## [7] "var_total_accel_belt" "avg_roll_belt"
## [9] "stddev_roll_belt" "var_roll_belt"
## [11] "avg_pitch_belt" "stddev_pitch_belt"
## [13] "var_pitch_belt" "avg_yaw_belt"
## [15] "stddev_yaw_belt" "var_yaw_belt"
## [17] "var_accel_arm" "avg_roll_arm"
## [19] "stddev_roll_arm" "var_roll_arm"
## [21] "avg_pitch_arm" "stddev_pitch_arm"
## [23] "var_pitch_arm" "avg_yaw_arm"
## [25] "stddev_yaw_arm" "var_yaw_arm"
## [27] "max_roll_arm" "max_picth_arm"
## [29] "max_yaw_arm" "min_roll_arm"
## [31] "min_pitch_arm" "min_yaw_arm"
## [33] "amplitude_roll_arm" "amplitude_pitch_arm"
## [35] "amplitude_yaw_arm" "max_roll_dumbbell"
## [37] "max_picth_dumbbell" "min_roll_dumbbell"
## [39] "min_pitch_dumbbell" "amplitude_roll_dumbbell"
## [41] "amplitude_pitch_dumbbell" "var_accel_dumbbell"
## [43] "avg_roll_dumbbell" "stddev_roll_dumbbell"
## [45] "var_roll_dumbbell" "avg_pitch_dumbbell"
## [47] "stddev_pitch_dumbbell" "var_pitch_dumbbell"
## [49] "avg_yaw_dumbbell" "stddev_yaw_dumbbell"
## [51] "var_yaw_dumbbell" "max_roll_forearm"
## [53] "max_picth_forearm" "min_roll_forearm"
## [55] "min_pitch_forearm" "amplitude_roll_forearm"
## [57] "amplitude_pitch_forearm" "var_accel_forearm"
## [59] "avg_roll_forearm" "stddev_roll_forearm"
## [61] "var_roll_forearm" "avg_pitch_forearm"
## [63] "stddev_pitch_forearm" "var_pitch_forearm"
## [65] "avg_yaw_forearm" "stddev_yaw_forearm"
## [67] "var_yaw_forearm"
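How severe is the missingness? A quick check with the na.percent helper defined above (a sketch; output omitted) shows the NA proportion of every flagged column:
round(range(sapply(training[, x], na.percent)), 3)  # NA proportion across flagged columns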
All of these variables are almost entirely missing (roughly 97% NA), which is why they are omitted. Additionally, variables with near-zero variance contribute little information and are filtered out as well.
training <- training[, !(names(training) %in% x)]
testing <- test[, !(names(test) %in% x)]
# flag predictors whose variance is near zero
vars <- names(training)[nearZeroVar(training)]
vars
## [1] "new_window" "kurtosis_roll_belt"
## [3] "kurtosis_picth_belt" "kurtosis_yaw_belt"
## [5] "skewness_roll_belt" "skewness_roll_belt.1"
## [7] "skewness_yaw_belt" "max_yaw_belt"
## [9] "min_yaw_belt" "amplitude_yaw_belt"
## [11] "kurtosis_roll_arm" "kurtosis_picth_arm"
## [13] "kurtosis_yaw_arm" "skewness_roll_arm"
## [15] "skewness_pitch_arm" "skewness_yaw_arm"
## [17] "kurtosis_roll_dumbbell" "kurtosis_picth_dumbbell"
## [19] "kurtosis_yaw_dumbbell" "skewness_roll_dumbbell"
## [21] "skewness_pitch_dumbbell" "skewness_yaw_dumbbell"
## [23] "max_yaw_dumbbell" "min_yaw_dumbbell"
## [25] "amplitude_yaw_dumbbell" "kurtosis_roll_forearm"
## [27] "kurtosis_picth_forearm" "kurtosis_yaw_forearm"
## [29] "skewness_roll_forearm" "skewness_pitch_forearm"
## [31] "skewness_yaw_forearm" "max_yaw_forearm"
## [33] "min_yaw_forearm" "amplitude_yaw_forearm"
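For insight into why these columns are flagged, nearZeroVar can also return the metrics it computes, such as the frequency ratio and the percentage of unique values (a sketch; output omitted):
nzv <- nearZeroVar(training, saveMetrics = TRUE)
head(nzv[nzv$nzv, ])  # metrics for the flagged predictors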
After the cleaning process, the data are divided into a training set and a validation set, since the supplied test set contains only 20 cases and is too small to validate on.
training <- training[, !(names(training) %in% vars)]
testing <- testing[, !(names(testing) %in% vars)]
set.seed(123)
index <- createDataPartition(training$classe, p = 0.5, list = FALSE)
sub.train <- training[index,]
sub.test <- training[-index,]
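createDataPartition samples within each class, so the class proportions should be nearly identical in both halves. A quick check (a sketch; output omitted):
round(prop.table(table(sub.train$classe)), 3)  # class shares in the training half
round(prop.table(table(sub.test$classe)), 3)   # class shares in the validation half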
The first model to be estimated is a decision tree. The confusion matrix below compares its predictions on the validation set with the true classes.
model1 <- rpart::rpart(classe~., data = sub.train, method = "class")
pred1 <- predict(model1, sub.test, type = "class")
table(pred1,sub.test$classe)
##
## pred1 A B C D E
## A 2680 353 0 0 0
## B 62 1329 217 90 0
## C 48 205 1461 181 65
## D 0 11 18 1030 95
## E 0 0 15 307 1643
As can be seen, the tree classifies most observations correctly, reaching an accuracy of about 83% on the validation set.
sum(diag(table(pred1,sub.test$classe)))/length(sub.test$classe)
## [1] 0.8300714
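For per-class detail beyond overall accuracy, caret's confusionMatrix reports sensitivity and specificity for each class (a sketch; classe is converted to a factor for the comparison, output omitted):
confusionMatrix(pred1, factor(sub.test$classe))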
The second model to estimate is a random forest, trained with 5-fold cross-validation.
controlRF <- trainControl(method = "cv", number = 5)  # 5-fold cross-validation
model2 <- train(classe ~ ., data = sub.train, method = "rf", trControl = controlRF)
It can be seen that the random forest improves substantially on the tree, reaching an accuracy of about 99.8% on the validation set.
rfPrediction <- predict(model2, sub.test)
table(sub.test$classe, rfPrediction)
## rfPrediction
## A B C D E
## A 2790 0 0 0 0
## B 7 1888 3 0 0
## C 0 5 1704 2 0
## D 0 0 2 1605 1
## E 0 0 0 0 1803
sum(diag(table(rfPrediction,sub.test$classe)))/length(sub.test$classe)
## [1] 0.9979613
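The cross-validation results stored in the train object give an independent estimate of out-of-sample accuracy, which should be close to the validation figure above (a sketch; output omitted):
model2$results   # accuracy and kappa for each value of mtry tried
model2$bestTune  # the mtry value that was selected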
# predict the 20 test cases with both models and compare the results
dt.test <- predict(model1, testing, type = "class")
rfPrediction.test <- predict(model2, testing)
data.frame(DECISION.TREE = as.character(dt.test),
           RANDOM.FOREST = as.character(rfPrediction.test),
           COINCIDENCE = dt.test == rfPrediction.test)
## DECISION.TREE RANDOM.FOREST COINCIDENCE
## 1 C B FALSE
## 2 A A TRUE
## 3 C B FALSE
## 4 A A TRUE
## 5 A A TRUE
## 6 E E TRUE
## 7 D D TRUE
## 8 C B FALSE
## 9 A A TRUE
## 10 A A TRUE
## 11 B B TRUE
## 12 C C TRUE
## 13 B B TRUE
## 14 A A TRUE
## 15 E E TRUE
## 16 E E TRUE
## 17 A A TRUE
## 18 A B FALSE
## 19 A B FALSE
## 20 B B TRUE
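The agreement rate quoted in the conclusion can be computed directly; the two models coincide on 15 of the 20 test cases:
mean(dt.test == rfPrediction.test)  # proportion of matching predictions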
# Conclusion
The model estimated by random forest is selected, since it achieves the higher accuracy on the validation set. The two models coincide on 75% of the test cases, so the decision tree's predictions also appear reasonable, but they are not as accurate as those of the random forest.