Coursera - Practical Machine Learning

LOADING THE PACKAGES

sapply(
  list(
    "tidyverse",
    "caret",
    "lubridate",
    "kableExtra"
    ), 
  require,character.only = T
  )

## [1] TRUE TRUE TRUE TRUE

LOADING THE DATA

training <- read.csv2("pml-training.csv",
                      header = TRUE,
                      sep = ",",
                      stringsAsFactors = FALSE)


testing <- read.csv2("pml-testing.csv",
                      header = TRUE,
                      sep = ",",
                      stringsAsFactors = FALSE)

Showing the data:

training %>% dim() # Lines / Columns

## [1] 19622   160

testing %>% dim() # Lines / Columns

## [1]  20 160

As all of you could see, the training dataset has a lot columns (more than a hundred), but the most part of them doesn’t could be used without some previous analysis.

ANALYZING THE DATA - P.1 (TRAINING DATASET)

First of all, I will verify the quantity of NA present in each column.

check_NA <- training %>%
  summarise_all(
    .funs = list(
      ~sum(is.na(.))
    )) %>% mutate_all(
      .funs = ~ifelse(. >19200,"BAD","GOOD") # Looking for NA
    ) 

check_NA %>% 
  pivot_longer(
    cols = names(.)
    ) %>% head()

## # A tibble: 6 x 2
##   name                 value
##   <chr>                <chr>
## 1 X                    GOOD 
## 2 user_name            GOOD 
## 3 raw_timestamp_part_1 GOOD 
## 4 raw_timestamp_part_2 GOOD 
## 5 cvtd_timestamp       GOOD 
## 6 new_window           GOOD

If the number of NA is bigger than 19200 (almost 98%), it means that we have few values available. In other words, would be little difficult to imput this data by some statistical method (knnImpute/bagImpute, for example).

check_NA %>% 
  pivot_longer(
      cols = names(.)
  ) %>% 
  group_by(value) %>%
  summarise(n = n()
            )

## # A tibble: 2 x 2
##   value     n
##   <chr> <int>
## 1 BAD      67
## 2 GOOD     93

Using the “GOOD” attributes:

training <-
  training[,which(check_NA=="GOOD")] # 93  covariates

After the previous analysis, only 93 covariates are good to be used

Now, let’s sum the empty values ("") using the same method.

check_NULL <- training %>%
  summarise_all(
    .funs = list(
      ~sum(.=="")
    )
  ) %>%
  mutate_all(
    .funs = list(
      ~ifelse(. > 19000,"BAD","GOOD")
    )
  )

check_NULL %>%
  pivot_longer(
    cols = names(.)
  ) %>% head()

## # A tibble: 6 x 2
##   name                 value
##   <chr>                <chr>
## 1 X                    GOOD 
## 2 user_name            GOOD 
## 3 raw_timestamp_part_1 GOOD 
## 4 raw_timestamp_part_2 GOOD 
## 5 cvtd_timestamp       GOOD 
## 6 new_window           GOOD

Using the “GOOD” attributes we’ve got:

check_NULL %>%
  pivot_longer(
    cols = names(.)
  ) %>%
  group_by(value) %>%
  summarise(
    n = n()
  )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##   value     n
##   <chr> <int>
## 1 BAD      33
## 2 GOOD     60

Now, we only have 60 useful variables

training <-
  training[,which(check_NULL=="GOOD")] # 60 useful covariates

ANALYZING THE DATA P.2 (TRAINING DATASET)

Let’s change some classes of variables.

training <- training[,-1] # The first column is not necessary!

training$classe <-
  as.factor(training$classe) # Categorical variable (pos. 59)

training$user_name <-
  as.factor(training$user_name) # Categorical Variable (pos. 1)

training$cvtd_timestamp <-
  as.POSIXct(training$cvtd_timestamp,
             format = "%d/%m/%Y %H:%M") # DATE-TIME Variable (pos. 4)

training <- training %>%
  mutate_at(
    c(6:58),.funs = as.numeric
  ) # Changing the character variables into numeric

After it:

str(training)

## 'data.frame':    19622 obs. of  59 variables:
##  $ user_name           : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ raw_timestamp_part_1: int  1323084231 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
##  $ raw_timestamp_part_2: int  788290 808298 820366 120339 196328 304277 368296 440390 484323 484434 ...
##  $ cvtd_timestamp      : POSIXct, format: "2011-12-05 11:23:00" "2011-12-05 11:23:00" ...
##  $ new_window          : chr  "no" "no" "no" "no" ...
##  $ num_window          : num  11 11 11 12 12 12 12 12 12 12 ...
##  $ roll_belt           : num  1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
##  $ pitch_belt          : num  8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
##  $ yaw_belt            : num  -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
##  $ total_accel_belt    : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ gyros_belt_x        : num  0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
##  $ gyros_belt_y        : num  0 0 0 0 0.02 0 0 0 0 0 ...
##  $ gyros_belt_z        : num  -0.02 -0.02 -0.02 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0 ...
##  $ accel_belt_x        : num  -21 -22 -20 -22 -21 -21 -22 -22 -20 -21 ...
##  $ accel_belt_y        : num  4 4 5 3 2 4 3 4 2 4 ...
##  $ accel_belt_z        : num  22 22 23 21 24 21 21 21 24 22 ...
##  $ magnet_belt_x       : num  -3 -7 -2 -6 -6 0 -4 -2 1 -3 ...
##  $ magnet_belt_y       : num  599 608 600 604 600 603 599 603 602 609 ...
##  $ magnet_belt_z       : num  -313 -311 -305 -310 -302 -312 -311 -313 -312 -308 ...
##  $ roll_arm            : num  -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 ...
##  $ pitch_arm           : num  22.5 22.5 22.5 22.1 22.1 22 21.9 21.8 21.7 21.6 ...
##  $ yaw_arm             : num  -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
##  $ total_accel_arm     : num  34 34 34 34 34 34 34 34 34 34 ...
##  $ gyros_arm_x         : num  0 0.02 0.02 0.02 0 0.02 0 0.02 0.02 0.02 ...
##  $ gyros_arm_y         : num  0 -0.02 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 -0.03 -0.03 ...
##  $ gyros_arm_z         : num  -0.02 -0.02 -0.02 0.02 0 0 0 0 -0.02 -0.02 ...
##  $ accel_arm_x         : num  -288 -290 -289 -289 -289 -289 -289 -289 -288 -288 ...
##  $ accel_arm_y         : num  109 110 110 111 111 111 111 111 109 110 ...
##  $ accel_arm_z         : num  -123 -125 -126 -123 -123 -122 -125 -124 -122 -124 ...
##  $ magnet_arm_x        : num  -368 -369 -368 -372 -374 -369 -373 -372 -369 -376 ...
##  $ magnet_arm_y        : num  337 337 344 344 337 342 336 338 341 334 ...
##  $ magnet_arm_z        : num  516 513 513 512 506 513 509 510 518 516 ...
##  $ roll_dumbbell       : num  13.1 13.1 12.9 13.4 13.4 ...
##  $ pitch_dumbbell      : num  -70.5 -70.6 -70.3 -70.4 -70.4 ...
##  $ yaw_dumbbell        : num  -84.9 -84.7 -85.1 -84.9 -84.9 ...
##  $ total_accel_dumbbell: num  37 37 37 37 37 37 37 37 37 37 ...
##  $ gyros_dumbbell_x    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gyros_dumbbell_y    : num  -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 ...
##  $ gyros_dumbbell_z    : num  0 0 0 -0.02 0 0 0 0 0 0 ...
##  $ accel_dumbbell_x    : num  -234 -233 -232 -232 -233 -234 -232 -234 -232 -235 ...
##  $ accel_dumbbell_y    : num  47 47 46 48 48 48 47 46 47 48 ...
##  $ accel_dumbbell_z    : num  -271 -269 -270 -269 -270 -269 -270 -272 -269 -270 ...
##  $ magnet_dumbbell_x   : num  -559 -555 -561 -552 -554 -558 -551 -555 -549 -558 ...
##  $ magnet_dumbbell_y   : num  293 296 298 303 292 294 295 300 292 291 ...
##  $ magnet_dumbbell_z   : num  -65 -64 -63 -60 -68 -66 -70 -74 -65 -69 ...
##  $ roll_forearm        : num  28.4 28.3 28.3 28.1 28 27.9 27.9 27.8 27.7 27.7 ...
##  $ pitch_forearm       : num  -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.8 -63.8 -63.8 ...
##  $ yaw_forearm         : num  -153 -153 -152 -152 -152 -152 -152 -152 -152 -152 ...
##  $ total_accel_forearm : num  36 36 36 36 36 36 36 36 36 36 ...
##  $ gyros_forearm_x     : num  0.03 0.02 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.02 ...
##  $ gyros_forearm_y     : num  0 0 -0.02 -0.02 0 -0.02 0 -0.02 0 0 ...
##  $ gyros_forearm_z     : num  -0.02 -0.02 0 0 -0.02 -0.03 -0.02 0 -0.02 -0.02 ...
##  $ accel_forearm_x     : num  192 192 196 189 189 193 195 193 193 190 ...
##  $ accel_forearm_y     : num  203 203 204 206 206 203 205 205 204 205 ...
##  $ accel_forearm_z     : num  -215 -216 -213 -214 -214 -215 -215 -213 -214 -215 ...
##  $ magnet_forearm_x    : num  -17 -18 -18 -16 -17 -9 -18 -9 -16 -22 ...
##  $ magnet_forearm_y    : num  654 661 658 658 655 660 659 660 653 656 ...
##  $ magnet_forearm_z    : num  476 473 469 469 473 478 470 474 476 473 ...
##  $ classe              : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...

Now, we need to look for correlations between the numeric columns (columns with higher correlation should not be used)

M <- corr_matrix <-
  cor(training[,-c(1:5,59)]) # Matriz de correlação

diag(M) <- 0 # Giving zero where the correlation is equal to 1

Showing the matrix:

M[1:10,1:5]

##                   num_window   roll_belt  pitch_belt    yaw_belt
## num_window        0.00000000  0.07198319 -0.10233913  0.08589479
## roll_belt         0.07198319  0.00000000 -0.21592515  0.81522971
## pitch_belt       -0.10233913 -0.21592515  0.00000000 -0.69975194
## yaw_belt          0.08589479  0.81522971 -0.69975194  0.00000000
## total_accel_belt  0.06603491  0.98092414 -0.13898122  0.76209628
## gyros_belt_x      0.21023489 -0.11746960 -0.43596398  0.14504784
## gyros_belt_y      0.22399752  0.46371838 -0.39711179  0.53004475
## gyros_belt_z      0.06711874 -0.45903753 -0.10698831 -0.27498319
## accel_belt_x      0.13330057  0.25683537 -0.96573340  0.70802772
## accel_belt_y      0.04838485  0.92489827  0.08151866  0.60054662
##                  total_accel_belt
## num_window             0.06603491
## roll_belt              0.98092414
## pitch_belt            -0.13898122
## yaw_belt               0.76209628
## total_accel_belt       0.00000000
## gyros_belt_x          -0.16531119
## gyros_belt_y           0.40931499
## gyros_belt_z          -0.47537283
## accel_belt_x           0.17226448
## accel_belt_y           0.92780692

The attributes with more than 85% of correlation are (they should not be used):

vars_highCorr <-
  colnames(M)[which(abs(M) > 0.85,arr.ind = T)[,2] %>% 
                unique()] # 12 covariates with more than 85% of correlation

AFTER ALL OF THESE ANALYSIS, THE MOST USEFUL VARIABLES SELECTED WERE ONLY 46

training <- 
  training[,setdiff(names(training),
                    vars_highCorr)]

Now, we need to change the type of the columns of the testing dataset.

testing <- 
  testing[,intersect(names(testing), names(training))]

testing$user_name <-
  as.factor(testing$user_name) 

testing$cvtd_timestamp <-
  as.POSIXct(testing$cvtd_timestamp,
             format = "%d/%m/%Y %H:%M") 

testing <- testing %>%
  mutate_at(
    c(6:46),.funs = as.numeric
  )

We obtained:

testing %>%
  dim()

## [1] 20 46

FITTING THE MODEL

In this case, we used the Random Forest Algorithm.

modFit_sprf <-
  train(classe ~.,
        data = training,
        method = "rf",
        preProcess = c("center","scale"),
        trControl = trainControl(method = "cv", number = 10)
        )

Informations about the model I fitted:

modFit_sprf

## Random Forest 
## 
## 19622 samples
##    46 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## Pre-processing: centered (50), scaled (50) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 17660, 17660, 17661, 17659, 17660, 17661, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9976558  0.9970348
##   26    0.9994905  0.9993556
##   50    0.9975028  0.9968413
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.

The attibutes importance:

varImp(modFit_sprf) # The importance of the first 20 variables

## rf variable importance
## 
##   only 20 most important variables shown (out of 50)
## 
##                      Overall
## raw_timestamp_part_1 100.000
## num_window            48.759
## pitch_forearm         27.775
## yaw_belt              25.563
## cvtd_timestamp        23.701
## magnet_dumbbell_z     22.084
## magnet_belt_y         18.970
## magnet_dumbbell_y     15.514
## roll_forearm          11.713
## magnet_belt_z          8.117
## roll_dumbbell          8.077
## magnet_dumbbell_x      7.962
## accel_dumbbell_y       7.765
## gyros_belt_z           7.403
## total_accel_dumbbell   5.904
## accel_dumbbell_z       4.929
## accel_forearm_x        4.778
## yaw_dumbbell           3.487
## accel_forearm_z        3.271
## roll_arm               3.220

PREDICTING

Let’s predict

prediction <-
  predict(modFit_sprf,
          testing)

prediction

##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E