https://github.com/AndresSotoA/PROYECTOMODELO/blob/master/Prueba.Rmd

## Loading libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
testing <- read.csv("pml-testing.csv")
training <- read.csv("pml-training.csv")
dim(testing)
## [1] 20 160
dim(training)
## [1] 19622 160
# The training set has 19622 rows and 160 features; the testing set has 20 rows and the same 160 features.
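Before partitioning, it is worth checking the class balance of the outcome (a quick check; output omitted):
# Absolute and relative frequencies of classe in the full training set
table(training$classe)
prop.table(table(training$classe))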
inTrain <- createDataPartition(y = training$classe, p = 0.75, list = FALSE)
training1 <- training[inTrain, ]
testing1 <- training[-inTrain, ]
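A quick sanity check that the split is roughly 75/25:
# Partition sizes and the share of rows that went to training1
dim(training1)
nrow(training1) / nrow(training)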
str(training1[1:10])
## 'data.frame': 14718 obs. of 10 variables:
## $ X : int 2 3 5 6 8 9 10 11 15 17 ...
## $ user_name : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ raw_timestamp_part_1: int 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
## $ raw_timestamp_part_2: int 808298 820366 196328 304277 440390 484323 484434 500302 604281 692324 ...
## $ cvtd_timestamp : Factor w/ 20 levels "02/12/2011 13:32",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ new_window : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ num_window : int 11 11 12 12 12 12 12 12 12 12 ...
## $ roll_belt : num 1.41 1.42 1.48 1.45 1.42 1.43 1.45 1.45 1.45 1.51 ...
## $ pitch_belt : num 8.07 8.07 8.07 8.06 8.13 8.16 8.17 8.18 8.2 8.12 ...
## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
# In training1
training1$X <- NULL
training1$user_name <- NULL
training1$cvtd_timestamp <- NULL
# In testing1
testing1$X <- NULL
testing1$user_name <- NULL
testing1$cvtd_timestamp <- NULL
# In testing
testing$X <- NULL
testing$user_name <- NULL
testing$cvtd_timestamp <- NULL
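A more compact alternative to the per-column NULL assignments above (a sketch with identical effect):
# Drop the three identifier columns from each data set in one step
drop_cols <- c("X", "user_name", "cvtd_timestamp")
training1 <- training1[, !(names(training1) %in% drop_cols)]
testing1 <- testing1[, !(names(testing1) %in% drop_cols)]
testing <- testing[, !(names(testing) %in% drop_cols)]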
# Remove near-zero-variance predictors; the indices are computed on training1
# and applied to both partitions
cero <- nearZeroVar(training1)
training1 <- training1[, -cero]
testing1 <- testing1[, -cero]
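For reference, nearZeroVar can also return the diagnostics behind each flag; run before the removal above, this shows why each column was dropped (output omitted):
# saveMetrics = TRUE returns freqRatio, percentUnique and the zeroVar/nzv
# flags per column, instead of just the column indices
nzv_metrics <- nearZeroVar(training1, saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ])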
# Align testing with the cleaned training1 columns, excluding the outcome classe
testing <- testing[, setdiff(names(training1), "classe")]
sum(is.na(training1))
## [1] 605346
# Count missing values per column
tna <- apply(training1, 2, is.na)
sna <- apply(tna, 2, sum)
# Removing variables with at least 20% missing values.
perdidos <- which(sna / nrow(training1) > 0.20)
training1clean <- training1[, -perdidos]
testing1clean <- testing1[, -perdidos]
testclean <- testing[, -perdidos]
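The per-column NA fractions used above can also be computed more directly (equivalent to the two apply calls):
# colMeans(is.na(.)) gives the fraction of missing values per column
na_frac <- colMeans(is.na(training1))
head(sort(na_frac, decreasing = TRUE))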
y <- which(names(training1clean) == "classe")
# Taking the absolute value of the correlation matrix
m <- abs(cor(training1clean[, -y]))
# Setting the diagonal of the matrix to zero so each variable's correlation with itself is ignored
diag(m) <- 0
# Selecting the pairs with absolute correlation greater than 0.6
head(which(m > 0.6, arr.ind = TRUE))
## row col
## magnet_dumbbell_z 42 1
## accel_forearm_z 52 1
## yaw_belt 6 4
## total_accel_belt 7 4
## accel_belt_y 12 4
## accel_belt_z 13 4
# Many variables are strongly correlated, so we will use principal component analysis (PCA).
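As a complementary diagnostic, caret's findCorrelation suggests which columns would have to be dropped so that no remaining pair exceeds the cutoff; here we only count them and let PCA handle the redundancy instead:
# Columns findCorrelation would remove at |r| > 0.6
high_corr <- findCorrelation(cor(training1clean[, -y]), cutoff = 0.6)
length(high_corr)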
# Removing classe from the data set before applying PCA
y <- which(names(training1clean) == "classe")
preProc <- preProcess(training1clean[, -y], method = "pca", thresh = 0.95)
# Applying PCA to the remaining data sets
train1PC <- predict(preProc, training1clean)
testing1PC <- predict(preProc, testing1clean)
testingPC <- predict(preProc, testclean)
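Printing the preProcess object reports how many principal components were needed to capture 95% of the variance; the transformed data sets should contain those components plus classe (output omitted):
preProc
dim(train1PC)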
modrf <- train(classe ~ ., data = train1PC, method = "rf")
summary(modrf)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 14718 factor numeric
## err.rate 3000 -none- numeric
## confusion 30 -none- numeric
## votes 73590 matrix numeric
## oob.times 14718 -none- numeric
## classes 5 -none- character
## importance 26 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 14718 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 26 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 5 -none- character
## param 0 -none- list
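The summary above only shows the structure of the fitted object; printing the final model directly reports the out-of-bag (OOB) error, random forest's built-in estimate of out-of-sample error:
# The OOB error rate and per-class confusion are printed with the model
modrf$finalModel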
With the training data (0% error):
confusionMatrix(train1PC$classe, predict(modrf, train1PC))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 4185 0 0 0 0
## B 0 2848 0 0 0
## C 0 0 2567 0 0
## D 0 0 0 2412 0
## E 0 0 0 0 2706
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9997, 1)
## No Information Rate : 0.2843
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 1.0000 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000
## Prevalence 0.2843 0.1935 0.1744 0.1639 0.1839
## Detection Rate 0.2843 0.1935 0.1744 0.1639 0.1839
## Detection Prevalence 0.2843 0.1935 0.1744 0.1639 0.1839
## Balanced Accuracy 1.0000 1.0000 1.0000 1.0000 1.0000
The error on the training data is 0%, but this is misleading: evaluating a model on the same data it was fit on overstates its performance (overfitting). We now evaluate on the held-out testing1 partition, where the out-of-sample error will likely be higher, possibly around 5%.
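An alternative way to estimate the out-of-sample error without touching the held-out partition is to cross-validate during training (a sketch; the model above used caret's default bootstrap resampling):
# 5-fold cross-validation; the resampled accuracy approximates
# out-of-sample performance
ctrl <- trainControl(method = "cv", number = 5)
modrf_cv <- train(classe ~ ., data = train1PC, method = "rf", trControl = ctrl)
modrf_cv$results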
With the held-out testing data:
confusionMatrix(testing1PC$classe, predict(modrf, testing1PC))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1384 1 1 5 4
## B 7 932 10 0 0
## C 1 6 846 1 1
## D 0 3 32 765 4
## E 0 2 3 1 895
##
## Overall Statistics
##
## Accuracy : 0.9833
## 95% CI : (0.9793, 0.9867)
## No Information Rate : 0.2838
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9789
##
## Mcnemar's Test P-Value : 1.46e-07
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9943 0.9873 0.9484 0.9909 0.9900
## Specificity 0.9969 0.9957 0.9978 0.9906 0.9985
## Pos Pred Value 0.9921 0.9821 0.9895 0.9515 0.9933
## Neg Pred Value 0.9977 0.9970 0.9886 0.9983 0.9978
## Prevalence 0.2838 0.1925 0.1819 0.1574 0.1843
## Detection Rate 0.2822 0.1900 0.1725 0.1560 0.1825
## Detection Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Balanced Accuracy 0.9956 0.9915 0.9731 0.9907 0.9943
# The share of each class's true cases that the model identifies correctly (sensitivity) exceeds 94% for every class, so we conclude this is a good model.
data.frame(1:20, Clase = predict(modrf, testingPC))
## X1.20 Clase
## 1 1 B
## 2 2 A
## 3 3 A
## 4 4 A
## 5 5 A
## 6 6 E
## 7 7 D
## 8 8 B
## 9 9 A
## 10 10 A
## 11 11 B
## 12 12 C
## 13 13 B
## 14 14 A
## 15 15 E
## 16 16 E
## 17 17 A
## 18 18 B
## 19 19 B
## 20 20 B
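If these 20 predictions need to be saved, e.g. for submission, a minimal sketch (the data frame layout and file name here are arbitrary choices):
# Write the predicted class for each of the 20 test cases to a CSV file
predicciones <- data.frame(problem_id = 1:20, Clase = predict(modrf, testingPC))
write.csv(predicciones, "predicciones.csv", row.names = FALSE)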