sapply(
list(
"tidyverse",
"caret",
"lubridate",
"kableExtra"
),
require,character.only = T
)
## [1] TRUE TRUE TRUE TRUE
training <- read.csv2("pml-training.csv",
header = TRUE,
sep = ",",
stringsAsFactors = FALSE)
testing <- read.csv2("pml-testing.csv",
header = TRUE,
sep = ",",
stringsAsFactors = FALSE)
Showing the data:
training %>% dim() # Lines / Columns
## [1] 19622 160
testing %>% dim() # Lines / Columns
## [1] 20 160
As all of you could see, the training dataset has a lot columns (more than a hundred), but the most part of them doesn’t could be used without some previous analysis.
First of all, I will verify the quantity of NA present in each column.
check_NA <- training %>%
summarise_all(
.funs = list(
~sum(is.na(.))
)) %>% mutate_all(
.funs = ~ifelse(. >19200,"BAD","GOOD") # Looking for NA
)
check_NA %>%
pivot_longer(
cols = names(.)
) %>% head()
## # A tibble: 6 x 2
## name value
## <chr> <chr>
## 1 X GOOD
## 2 user_name GOOD
## 3 raw_timestamp_part_1 GOOD
## 4 raw_timestamp_part_2 GOOD
## 5 cvtd_timestamp GOOD
## 6 new_window GOOD
If the number of NA is bigger than 19200 (almost 98%), it means that we have few values available. In other words, would be little difficult to imput this data by some statistical method (knnImpute/bagImpute, for example).
check_NA %>%
pivot_longer(
cols = names(.)
) %>%
group_by(value) %>%
summarise(n = n()
)
## # A tibble: 2 x 2
## value n
## <chr> <int>
## 1 BAD 67
## 2 GOOD 93
Using the “GOOD” attributes:
training <-
training[,which(check_NA=="GOOD")] # 93 covariates
After the previous analysis, only 93 covariates are good to be used
Now, let’s sum the empty values ("") using the same method.
check_NULL <- training %>%
summarise_all(
.funs = list(
~sum(.=="")
)
) %>%
mutate_all(
.funs = list(
~ifelse(. > 19000,"BAD","GOOD")
)
)
check_NULL %>%
pivot_longer(
cols = names(.)
) %>% head()
## # A tibble: 6 x 2
## name value
## <chr> <chr>
## 1 X GOOD
## 2 user_name GOOD
## 3 raw_timestamp_part_1 GOOD
## 4 raw_timestamp_part_2 GOOD
## 5 cvtd_timestamp GOOD
## 6 new_window GOOD
Using the “GOOD” attributes we’ve got:
check_NULL %>%
pivot_longer(
cols = names(.)
) %>%
group_by(value) %>%
summarise(
n = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## value n
## <chr> <int>
## 1 BAD 33
## 2 GOOD 60
Now, we only have 60 useful variables
training <-
training[,which(check_NULL=="GOOD")] # 60 useful covariates
Let’s change some classes of variables.
training <- training[,-1] # The first column is not necessary!
training$classe <-
as.factor(training$classe) # Categorical variable (pos. 59)
training$user_name <-
as.factor(training$user_name) # Categorical Variable (pos. 1)
training$cvtd_timestamp <-
as.POSIXct(training$cvtd_timestamp,
format = "%d/%m/%Y %H:%M") # DATE-TIME Variable (pos. 4)
training <- training %>%
mutate_at(
c(6:58),.funs = as.numeric
) # Changing the character variables into numeric
After it:
str(training)
## 'data.frame': 19622 obs. of 59 variables:
## $ user_name : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ raw_timestamp_part_1: int 1323084231 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
## $ raw_timestamp_part_2: int 788290 808298 820366 120339 196328 304277 368296 440390 484323 484434 ...
## $ cvtd_timestamp : POSIXct, format: "2011-12-05 11:23:00" "2011-12-05 11:23:00" ...
## $ new_window : chr "no" "no" "no" "no" ...
## $ num_window : num 11 11 11 12 12 12 12 12 12 12 ...
## $ roll_belt : num 1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
## $ pitch_belt : num 8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
## $ total_accel_belt : num 3 3 3 3 3 3 3 3 3 3 ...
## $ gyros_belt_x : num 0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
## $ gyros_belt_y : num 0 0 0 0 0.02 0 0 0 0 0 ...
## $ gyros_belt_z : num -0.02 -0.02 -0.02 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0 ...
## $ accel_belt_x : num -21 -22 -20 -22 -21 -21 -22 -22 -20 -21 ...
## $ accel_belt_y : num 4 4 5 3 2 4 3 4 2 4 ...
## $ accel_belt_z : num 22 22 23 21 24 21 21 21 24 22 ...
## $ magnet_belt_x : num -3 -7 -2 -6 -6 0 -4 -2 1 -3 ...
## $ magnet_belt_y : num 599 608 600 604 600 603 599 603 602 609 ...
## $ magnet_belt_z : num -313 -311 -305 -310 -302 -312 -311 -313 -312 -308 ...
## $ roll_arm : num -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 ...
## $ pitch_arm : num 22.5 22.5 22.5 22.1 22.1 22 21.9 21.8 21.7 21.6 ...
## $ yaw_arm : num -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
## $ total_accel_arm : num 34 34 34 34 34 34 34 34 34 34 ...
## $ gyros_arm_x : num 0 0.02 0.02 0.02 0 0.02 0 0.02 0.02 0.02 ...
## $ gyros_arm_y : num 0 -0.02 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 -0.03 -0.03 ...
## $ gyros_arm_z : num -0.02 -0.02 -0.02 0.02 0 0 0 0 -0.02 -0.02 ...
## $ accel_arm_x : num -288 -290 -289 -289 -289 -289 -289 -289 -288 -288 ...
## $ accel_arm_y : num 109 110 110 111 111 111 111 111 109 110 ...
## $ accel_arm_z : num -123 -125 -126 -123 -123 -122 -125 -124 -122 -124 ...
## $ magnet_arm_x : num -368 -369 -368 -372 -374 -369 -373 -372 -369 -376 ...
## $ magnet_arm_y : num 337 337 344 344 337 342 336 338 341 334 ...
## $ magnet_arm_z : num 516 513 513 512 506 513 509 510 518 516 ...
## $ roll_dumbbell : num 13.1 13.1 12.9 13.4 13.4 ...
## $ pitch_dumbbell : num -70.5 -70.6 -70.3 -70.4 -70.4 ...
## $ yaw_dumbbell : num -84.9 -84.7 -85.1 -84.9 -84.9 ...
## $ total_accel_dumbbell: num 37 37 37 37 37 37 37 37 37 37 ...
## $ gyros_dumbbell_x : num 0 0 0 0 0 0 0 0 0 0 ...
## $ gyros_dumbbell_y : num -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 ...
## $ gyros_dumbbell_z : num 0 0 0 -0.02 0 0 0 0 0 0 ...
## $ accel_dumbbell_x : num -234 -233 -232 -232 -233 -234 -232 -234 -232 -235 ...
## $ accel_dumbbell_y : num 47 47 46 48 48 48 47 46 47 48 ...
## $ accel_dumbbell_z : num -271 -269 -270 -269 -270 -269 -270 -272 -269 -270 ...
## $ magnet_dumbbell_x : num -559 -555 -561 -552 -554 -558 -551 -555 -549 -558 ...
## $ magnet_dumbbell_y : num 293 296 298 303 292 294 295 300 292 291 ...
## $ magnet_dumbbell_z : num -65 -64 -63 -60 -68 -66 -70 -74 -65 -69 ...
## $ roll_forearm : num 28.4 28.3 28.3 28.1 28 27.9 27.9 27.8 27.7 27.7 ...
## $ pitch_forearm : num -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.8 -63.8 -63.8 ...
## $ yaw_forearm : num -153 -153 -152 -152 -152 -152 -152 -152 -152 -152 ...
## $ total_accel_forearm : num 36 36 36 36 36 36 36 36 36 36 ...
## $ gyros_forearm_x : num 0.03 0.02 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.02 ...
## $ gyros_forearm_y : num 0 0 -0.02 -0.02 0 -0.02 0 -0.02 0 0 ...
## $ gyros_forearm_z : num -0.02 -0.02 0 0 -0.02 -0.03 -0.02 0 -0.02 -0.02 ...
## $ accel_forearm_x : num 192 192 196 189 189 193 195 193 193 190 ...
## $ accel_forearm_y : num 203 203 204 206 206 203 205 205 204 205 ...
## $ accel_forearm_z : num -215 -216 -213 -214 -214 -215 -215 -213 -214 -215 ...
## $ magnet_forearm_x : num -17 -18 -18 -16 -17 -9 -18 -9 -16 -22 ...
## $ magnet_forearm_y : num 654 661 658 658 655 660 659 660 653 656 ...
## $ magnet_forearm_z : num 476 473 469 469 473 478 470 474 476 473 ...
## $ classe : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
Now, we need to look for correlations between the numeric columns (columns with higher correlation should not be used)
M <- corr_matrix <-
cor(training[,-c(1:5,59)]) # Matriz de correlação
diag(M) <- 0 # Giving zero where the correlation is equal to 1
Showing the matrix:
M[1:10,1:5]
## num_window roll_belt pitch_belt yaw_belt
## num_window 0.00000000 0.07198319 -0.10233913 0.08589479
## roll_belt 0.07198319 0.00000000 -0.21592515 0.81522971
## pitch_belt -0.10233913 -0.21592515 0.00000000 -0.69975194
## yaw_belt 0.08589479 0.81522971 -0.69975194 0.00000000
## total_accel_belt 0.06603491 0.98092414 -0.13898122 0.76209628
## gyros_belt_x 0.21023489 -0.11746960 -0.43596398 0.14504784
## gyros_belt_y 0.22399752 0.46371838 -0.39711179 0.53004475
## gyros_belt_z 0.06711874 -0.45903753 -0.10698831 -0.27498319
## accel_belt_x 0.13330057 0.25683537 -0.96573340 0.70802772
## accel_belt_y 0.04838485 0.92489827 0.08151866 0.60054662
## total_accel_belt
## num_window 0.06603491
## roll_belt 0.98092414
## pitch_belt -0.13898122
## yaw_belt 0.76209628
## total_accel_belt 0.00000000
## gyros_belt_x -0.16531119
## gyros_belt_y 0.40931499
## gyros_belt_z -0.47537283
## accel_belt_x 0.17226448
## accel_belt_y 0.92780692
The attributes with more than 85% of correlation are (they should not be used):
vars_highCorr <-
colnames(M)[which(abs(M) > 0.85,arr.ind = T)[,2] %>%
unique()] # 12 covariates with more than 85% of correlation
AFTER ALL OF THESE ANALYSIS, THE MOST USEFUL VARIABLES SELECTED WERE ONLY 46
training <-
training[,setdiff(names(training),
vars_highCorr)]
Now, we need to change the type of the columns of the testing dataset.
testing <-
testing[,intersect(names(testing), names(training))]
testing$user_name <-
as.factor(testing$user_name)
testing$cvtd_timestamp <-
as.POSIXct(testing$cvtd_timestamp,
format = "%d/%m/%Y %H:%M")
testing <- testing %>%
mutate_at(
c(6:46),.funs = as.numeric
)
We obtained:
testing %>%
dim()
## [1] 20 46
In this case, we used the Random Forest Algorithm.
modFit_sprf <-
train(classe ~.,
data = training,
method = "rf",
preProcess = c("center","scale"),
trControl = trainControl(method = "cv", number = 10)
)
Informations about the model I fitted:
modFit_sprf
## Random Forest
##
## 19622 samples
## 46 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## Pre-processing: centered (50), scaled (50)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 17660, 17660, 17661, 17659, 17660, 17661, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9976558 0.9970348
## 26 0.9994905 0.9993556
## 50 0.9975028 0.9968413
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
The attibutes importance:
varImp(modFit_sprf) # The importance of the first 20 variables
## rf variable importance
##
## only 20 most important variables shown (out of 50)
##
## Overall
## raw_timestamp_part_1 100.000
## num_window 48.759
## pitch_forearm 27.775
## yaw_belt 25.563
## cvtd_timestamp 23.701
## magnet_dumbbell_z 22.084
## magnet_belt_y 18.970
## magnet_dumbbell_y 15.514
## roll_forearm 11.713
## magnet_belt_z 8.117
## roll_dumbbell 8.077
## magnet_dumbbell_x 7.962
## accel_dumbbell_y 7.765
## gyros_belt_z 7.403
## total_accel_dumbbell 5.904
## accel_dumbbell_z 4.929
## accel_forearm_x 4.778
## yaw_dumbbell 3.487
## accel_forearm_z 3.271
## roll_arm 3.220
Let’s predict
prediction <-
predict(modFit_sprf,
testing)
prediction
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E