GitHub repository

https://github.com/AndresSotoA/PROYECTOMODELO/blob/master/Prueba.Rmd

Loading the library

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

Reading the data

testing <- read.csv("pml-testing.csv")
training <- read.csv("pml-training.csv")
dim(testing)
## [1]  20 160
dim(training)
## [1] 19622   160
# We have a total of 160 features and 19622 rows in training, and a total of 160 features and 20 rows in testing.
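
As a quick sanity check, it also helps to look at how the outcome classe is distributed in the training data; a minimal sketch using only the objects loaded above:

table(training$classe)
prop.table(table(training$classe))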

Splitting the data

inTrain <- createDataPartition(y = training$classe, p = 0.75, list = FALSE)
training1 <- training[inTrain, ]
testing1 <- training[-inTrain, ]
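
createDataPartition() samples at random, so for a fully reproducible split a seed should be set beforehand. A minimal sketch; the seed value is arbitrary and not part of the original analysis:

set.seed(1234)
inTrain <- createDataPartition(y = training$classe, p = 0.75, list = FALSE)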

Viewing the structure of the data

str(training1[1:10])
## 'data.frame':   14718 obs. of  10 variables:
##  $ X                   : int  2 3 5 6 8 9 10 11 15 17 ...
##  $ user_name           : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ raw_timestamp_part_1: int  1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
##  $ raw_timestamp_part_2: int  808298 820366 196328 304277 440390 484323 484434 500302 604281 692324 ...
##  $ cvtd_timestamp      : Factor w/ 20 levels "02/12/2011 13:32",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ new_window          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ num_window          : int  11 11 12 12 12 12 12 12 12 12 ...
##  $ roll_belt           : num  1.41 1.42 1.48 1.45 1.42 1.43 1.45 1.45 1.45 1.51 ...
##  $ pitch_belt          : num  8.07 8.07 8.07 8.06 8.13 8.16 8.17 8.18 8.2 8.12 ...
##  $ yaw_belt            : num  -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...

Removing fields that will not be used as predictors

# In training1
training1$X <- NULL
training1$user_name <- NULL
training1$cvtd_timestamp <- NULL
# In testing1
testing1$X <- NULL
testing1$user_name <- NULL
testing1$cvtd_timestamp <- NULL
# In testing
testing$X <- NULL
testing$user_name <- NULL
testing$cvtd_timestamp <- NULL
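
An equivalent, more compact way to drop the same three columns from all three data sets (a sketch; the result is identical to the assignments above):

drop_cols <- c("X", "user_name", "cvtd_timestamp")
training1 <- training1[, setdiff(names(training1), drop_cols)]
testing1 <- testing1[, setdiff(names(testing1), drop_cols)]
testing <- testing[, setdiff(names(testing), drop_cols)]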

Removing variables with near-zero variance

cero <- nearZeroVar(training1)
training1 <- training1[, -cero]
testing1 <- testing1[, -cero]
# Keep in testing only the predictor columns retained in training1 (everything except classe)
testing <- testing[, names(training1[-which(names(training1) == "classe")])]
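
nearZeroVar() can also return a diagnostics table (frequency ratio, percentage of unique values) that shows why each variable was flagged; a minimal sketch, which should be run before the flagged columns are dropped from training1:

nzv_info <- nearZeroVar(training1, saveMetrics = TRUE)
head(nzv_info[nzv_info$nzv, ])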

Missing values

sum(is.na(training1))
## [1] 605346
# Count the missing values in each column
tna <- apply(training1, 2, is.na)   # logical matrix marking the NA positions
sna <- apply(tna, 2, sum)           # number of NAs per column

# Removing variables with at least 20% missing values.
perdidos <- which(sna / dim(training1)[1] > 0.20)
training1clean <- training1[, -perdidos]
testing1clean <- testing1[, -perdidos]
testclean <- testing[, -perdidos]
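
To confirm the cleaning step, we can check how many missing values remain in the cleaned data sets; a sketch using the objects defined above:

sum(is.na(training1clean))
sum(is.na(testing1clean))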

Correlation among the predictor variables

y <- which(names(training1clean) == "classe")
# Taking the absolute value of the correlation matrix
m <- abs(cor(training1clean[, -y]))
# Setting the diagonal of the matrix to zero
diag(m) <- 0
# Selecting the correlations greater than 0.6
head(which(m > 0.6, arr.ind = TRUE))
##                   row col
## magnet_dumbbell_z  42   1
## accel_forearm_z    52   1
## yaw_belt            6   4
## total_accel_belt    7   4
## accel_belt_y       12   4
## accel_belt_z       13   4
# Many variables are correlated with each other, so we will use PCA.
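
An alternative to PCA (not used here) is caret's findCorrelation(), which suggests columns to remove so that no pairwise correlation exceeds a chosen cutoff; a sketch with an arbitrary cutoff of 0.8:

altas <- findCorrelation(cor(training1clean[, -y]), cutoff = 0.8)
length(altas)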

PCA

# Removing classe from the data set in order to apply PCA
y <- which(names(training1clean) == "classe")
preProc <- preProcess(training1clean[, -y], method = "pca", thresh = 0.95)
# Applying the PCA transformation to the rest of the data sets
train1PC <- predict(preProc, training1clean)
testing1PC <- predict(preProc, testing1clean)
testingPC <- predict(preProc, testclean)
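
To see how many principal components were kept in order to reach the 95% variance threshold, the preProcess object can be printed and the dimensions of the transformed data inspected; a sketch using the objects above:

preProc
dim(train1PC)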

Building the model with random forest

modrf <- train(classe ~ ., data = train1PC, method = "rf")
summary(modrf)
##                 Length Class      Mode     
## call                4  -none-     call     
## type                1  -none-     character
## predicted       14718  factor     numeric  
## err.rate         3000  -none-     numeric  
## confusion          30  -none-     numeric  
## votes           73590  matrix     numeric  
## oob.times       14718  -none-     numeric  
## classes             5  -none-     character
## importance         26  -none-     numeric  
## importanceSD        0  -none-     NULL     
## localImportance     0  -none-     NULL     
## proximity           0  -none-     NULL     
## ntree               1  -none-     numeric  
## mtry                1  -none-     numeric  
## forest             14  -none-     list     
## y               14718  factor     numeric  
## test                0  -none-     NULL     
## inbag               0  -none-     NULL     
## xNames             26  -none-     character
## problemType         1  -none-     character
## tuneValue           1  data.frame list     
## obsLevels           5  -none-     character
## param               0  -none-     list
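
By default train() tunes the random forest with bootstrap resampling, which is slow on roughly 15,000 rows. A sketch of an alternative using explicit 5-fold cross-validation, which is usually faster and also gives a resampling-based estimate of out-of-sample accuracy; the CV settings are an assumption, not part of the original analysis:

ctrl <- trainControl(method = "cv", number = 5)
modrf_cv <- train(classe ~ ., data = train1PC, method = "rf", trControl = ctrl)
modrf_cv$results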

Evaluating the Random Forest model

On the training data (0% error)

confusionMatrix(train1PC$classe, predict(modrf, train1PC))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 4185    0    0    0    0
##          B    0 2848    0    0    0
##          C    0    0 2567    0    0
##          D    0    0    0 2412    0
##          E    0    0    0    0 2706
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9997, 1)
##     No Information Rate : 0.2843     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2843   0.1935   0.1744   0.1639   0.1839
## Detection Rate         0.2843   0.1935   0.1744   0.1639   0.1839
## Detection Prevalence   0.2843   0.1935   0.1744   0.1639   0.1839
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000

The error is 0% on the training data, but this reflects overfitting: accuracy on the data used to build the model is not a reliable estimate of out-of-sample performance. We now evaluate the model on the held-out partition, where we expect an out-of-sample error of roughly 5%.

On the held-out testing data (testing1)

confusionMatrix(testing1PC$classe, predict(modrf, testing1PC))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1384    1    1    5    4
##          B    7  932   10    0    0
##          C    1    6  846    1    1
##          D    0    3   32  765    4
##          E    0    2    3    1  895
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9833          
##                  95% CI : (0.9793, 0.9867)
##     No Information Rate : 0.2838          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9789          
##                                           
##  Mcnemar's Test P-Value : 1.46e-07        
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9943   0.9873   0.9484   0.9909   0.9900
## Specificity            0.9969   0.9957   0.9978   0.9906   0.9985
## Pos Pred Value         0.9921   0.9821   0.9895   0.9515   0.9933
## Neg Pred Value         0.9977   0.9970   0.9886   0.9983   0.9978
## Prevalence             0.2838   0.1925   0.1819   0.1574   0.1843
## Detection Rate         0.2822   0.1900   0.1725   0.1560   0.1825
## Detection Prevalence   0.2845   0.1935   0.1743   0.1639   0.1837
## Balanced Accuracy      0.9956   0.9915   0.9731   0.9907   0.9943
# The sensitivity (the proportion of the true cases of each class that are correctly identified) exceeds 94% for every class, so we conclude that this is a good model.
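
The estimated out-of-sample error can also be read off directly as 1 minus the accuracy on the held-out set; a minimal sketch using the objects above:

cm <- confusionMatrix(testing1PC$classe, predict(modrf, testing1PC))
1 - as.numeric(cm$overall["Accuracy"])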

Predicting the new cases

data.frame(1:20, Clase = predict(modrf, testingPC))
##    X1.20 Clase
## 1      1     B
## 2      2     A
## 3      3     A
## 4      4     A
## 5      5     A
## 6      6     E
## 7      7     D
## 8      8     B
## 9      9     A
## 10    10     A
## 11    11     B
## 12    12     C
## 13    13     B
## 14    14     A
## 15    15     E
## 16    16     E
## 17    17     A
## 18    18     B
## 19    19     B
## 20    20     B
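
If the predictions need to be exported (for example one text file per case for submission), they can be converted to a character vector and written out; a sketch in which the loop and the file name pattern are assumptions, not part of the original analysis:

pred_final <- as.character(predict(modrf, testingPC))
for (i in seq_along(pred_final)) {
  # one small text file per case; the file name pattern is hypothetical
  write.table(pred_final[i], file = paste0("problem_id_", i, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)
}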