library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
source("bnlearnRegression.R")
source("bnlearn-utils.R")
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 5.8
## Loading required package: graph
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:graph':
## 
##     degree, edges
source("dataset-processing.R")


# Load dataset dacc
dd <- get.dataset(dataset[1])
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_date(format = ""),
##   date = col_date(format = "")
## )
## See spec(...) for full column specifications.
sensores <- dd$data[-1] # drop the date column (the first column)

pred_sensores <- dd$pred

set.seed(3456)
porc_train = 0.68

Training set and test dataset

until <- round(nrow(sensores)*porc_train)
training.set = as.data.frame(sensores[1:(until - 1), ]) # training set used to learn the parameters
test.set = as.data.frame(sensores[until:nrow(sensores), ])
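
A quick sanity check that the split is chronological and covers the whole series (training uses rows 1 to until - 1, testing uses rows until to the end):

# chronological split check: the two pieces together cover every row of sensores
nrow(training.set) + nrow(test.set) == nrow(sensores) # expected TRUE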

X <- training.set 
Y <- as.numeric(training.set[,pred_sensores[1]])
varpred <- pred_sensores[1]
print(varpred)
## [1] "junin.temp_min"
#real <- test.set[,pred_sensores[1]]

Model trained using caret + timeSlices

load("../dacc--junin.temp_min--normal--all--1--rf.RData")

print(model$bestTune)
##   mtry
## 1    2
# caret selects mtry = 2 (ntree = 500 is the randomForest default); these are the parameters we then use to train the models below.
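
The model itself is only load()ed here. As a hedged sketch (not the actual settings behind the .RData file; the window sizes, predictor set and tuning grid below are assumptions), a caret model with time-slice resampling could be trained like this:

# Hedged sketch only: window sizes, predictors and tuning grid are assumptions,
# not the configuration used to create the loaded .RData file.
ctrl <- trainControl(method = "timeslice",
                     initialWindow = 30, horizon = 1,
                     fixedWindow = TRUE, skip = 30)
rf.sketch <- train(x = X[, setdiff(colnames(X), varpred)], y = Y,
                   method = "rf", ntree = 500,
                   tuneGrid = data.frame(mtry = c(2, 5, 10)),
                   trControl = ctrl)
# rf.sketch$bestTune then reports the selected mtry, as printed above.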

Testing the caret-trained random forest model by predicting on test.set without converting it to time slices: does caret convert it internally on its own?

I try prediction passing test.set as-is, to see what caret does internally. This is how test.set is passed in many caret examples; is it the same for time series?

pred.caret <- predict(model,test.set)
evaluate(pred.caret,test.set$junin.temp_min)
## $rmse
## [1] 1.12
## 
## $r2
## [1] 0.97
## 
## $sens
## [1] 0.77
## 
## $spec
## [1] 0.99
## 
## $prec
## [1] 0.93
## 
## $acc
## Accuracy 
##     0.96 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]     171     13
##    (0,50]       51   1532
##                                         
##                Accuracy : 0.9638        
##                  95% CI : (0.954, 0.972)
##     No Information Rate : 0.8744        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8221        
##  Mcnemar's Test P-Value : 3.746e-06     
##                                         
##             Sensitivity : 0.77027       
##             Specificity : 0.99159       
##          Pos Pred Value : 0.92935       
##          Neg Pred Value : 0.96778       
##               Precision : 0.92935       
##                  Recall : 0.77027       
##                      F1 : 0.84236       
##              Prevalence : 0.12564       
##          Detection Rate : 0.09677       
##    Detection Prevalence : 0.10413       
##       Balanced Accuracy : 0.88093       
##                                         
##        'Positive' Class : (-20,0]       
##                                         
## 
## $MAE
## [1] 0.84
plot(test.set$junin.temp_min, col = "red", ylab = "true (red) , pred (blue)", ylim = range(c(pred.caret,test.set$junin.temp_min)))
points(pred.caret, col = "blue") 
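
evaluate() is defined in the sourced utility scripts. Judging from its output it combines regression errors with a classification view obtained by cutting values at 0 degrees (frost vs. no frost); a rough, assumption-based equivalent:

# Rough sketch of what evaluate() appears to compute (assumption; the real
# definition is in the sourced scripts): regression errors plus a confusion
# matrix after discretizing predictions and observations at 0 degrees.
eval.sketch <- function(pred, obs, breaks = c(-20, 0, 50)) {
  cm <- caret::confusionMatrix(cut(pred, breaks), cut(obs, breaks))
  list(rmse = sqrt(mean((pred - obs)^2)),
       r2   = cor(pred, obs)^2,
       MAE  = mean(abs(pred - obs)),
       cm   = cm)   # sens, spec, prec and acc can be read off cm
}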

Testing the caret model, but passing the test.set data in timeSlices format: I try feeding test.set as time slices to the same caret-trained model as above.

timeSlicesTest <- createTimeSlices(1:nrow(test.set), 
                                   initialWindow = 1, horizon = 1, fixedWindow = TRUE)

xSlices <- timeSlicesTest[[1]]
ySlices <- timeSlicesTest[[2]]
head(xSlices)
## $Training0001
## [1] 1
## 
## $Training0002
## [1] 2
## 
## $Training0003
## [1] 3
## 
## $Training0004
## [1] 4
## 
## $Training0005
## [1] 5
## 
## $Training0006
## [1] 6
head(ySlices)
## $Testing0001
## [1] 2
## 
## $Testing0002
## [1] 3
## 
## $Testing0003
## [1] 4
## 
## $Testing0004
## [1] 5
## 
## $Testing0005
## [1] 6
## 
## $Testing0006
## [1] 7
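With initialWindow = 1 and horizon = 1 each slice holds a single row index, so the structure above simply pairs row i with row i + 1:

# with these settings xSlices index rows 1..(n-1) of test.set and ySlices rows 2..n
all(unlist(xSlices) == 1:(nrow(test.set) - 1)) # expected TRUE
all(unlist(ySlices) == 2:nrow(test.set))       # expected TRUE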
pred.caret2 <- predict(model,test.set[unlist(xSlices),])
evaluate(pred.caret2,test.set[unlist(ySlices),varpred])
## $rmse
## [1] 2.4
## 
## $r2
## [1] 0.87
## 
## $sens
## [1] 0.63
## 
## $spec
## [1] 0.97
## 
## $prec
## [1] 0.76
## 
## $acc
## Accuracy 
##     0.93 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]     140     44
##    (0,50]       82   1500
##                                           
##                Accuracy : 0.9287          
##                  95% CI : (0.9156, 0.9402)
##     No Information Rate : 0.8743          
##     P-Value [Acc > NIR] : 8.413e-14       
##                                           
##                   Kappa : 0.6497          
##  Mcnemar's Test P-Value : 0.0009799       
##                                           
##             Sensitivity : 0.63063         
##             Specificity : 0.97150         
##          Pos Pred Value : 0.76087         
##          Neg Pred Value : 0.94817         
##               Precision : 0.76087         
##                  Recall : 0.63063         
##                      F1 : 0.68966         
##              Prevalence : 0.12571         
##          Detection Rate : 0.07928         
##    Detection Prevalence : 0.10419         
##       Balanced Accuracy : 0.80107         
##                                           
##        'Positive' Class : (-20,0]         
##                                           
## 
## $MAE
## [1] 1.85
plot(test.set[unlist(ySlices),varpred], col = "red", ylab = "true (red) , pred (blue)", ylim = range(c(pred.caret2,test.set[unlist(ySlices),varpred])))
points(pred.caret2, col = "blue") 

Could it be that caret already re-converts the data into timeSlices internally, so that I am breaking the format?
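
One hedged way to probe this: if predict() on the caret model scores each row of newdata independently and does not re-window it, then pred.caret2 must equal pred.caret without its last element, and the worse metrics come only from comparing each prediction with the following day's observation (the ySlices shift), not from caret breaking the format.

# hedged check: TRUE here would mean caret did not re-window the data; the
# metric drop would then be due to the one-day shift in the reference values
all.equal(as.numeric(pred.caret2), as.numeric(pred.caret[unlist(xSlices)]))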

Training and predicting a randomForest model with my dataset-lagging method (shift by T)

Train randomForest with mtry = 2 and T = 1, using my dataset-lagging method. This method renames the previous time-step variables, so as far as the random forest is concerned additional variables are created; I think this is why the result differs.
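
The transform itself lives in the sourced scripts. As a minimal sketch of what a T = 1 lagging step of this kind appears to do (column suffixes inferred from the output below; an assumption, not the actual implementation of desfasar.dataset.T):

# Minimal sketch (assumption) of a one-step lagging transform: predictors are
# the previous day's readings ("_T_1"), targets are the current day's values
# of the prediction sensors ("_t").
lag.one <- function(df, targets) {
  X <- df[-nrow(df), ]               # rows 1..(n-1): previous day's readings
  Y <- df[-1, targets, drop = FALSE] # rows 2..n: current day's target values
  colnames(X) <- paste0(colnames(X), "_T_1")
  colnames(Y) <- paste0(targets, "_t")
  rownames(X) <- rownames(Y) <- NULL
  list(data = cbind(X, Y), vars = colnames(Y))
}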

train.data <- desfasar.dataset.T(T_value = 1,training.set,pred_sensores = pred_sensores)

print(train.data$vars)
## [1] "junin.temp_min_t"       "tunuyan.temp_min_t"    
## [3] "agua_amarga.temp_min_t" "las_paredes.temp_min_t"
## [5] "la_llave.temp_min_t"
print(head(train.data$data))
##   junin.temp_max_T_1 junin.humedad_max_T_1 junin.temp_med_T_1
## 1              38.50                    79           27.84583
## 2              38.58                    69           28.35833
## 3              31.53                    78           25.12500
## 4              37.85                    96           26.40417
## 5              34.82                    97           26.53333
## 6              33.38                    91           25.81250
##   junin.humedad_med_T_1 junin.temp_min_T_1 junin.humedad_min_T_1
## 1              37.12500              16.89                    13
## 2              41.66667              18.28                    16
## 3              53.83333              20.55                    26
## 4              58.58333              15.83                     1
## 5              66.95833              17.87                    42
## 6              56.04167              19.70                    28
##   tunuyan.temp_max_T_1 tunuyan.humedad_max_T_1 tunuyan.temp_med_T_1
## 1                33.41                      87             23.20833
## 2                35.16                      83             23.85833
## 3                29.28                      83             23.25417
## 4                34.57                      90             22.92917
## 5                34.10                      85             24.42083
## 6                32.40                      87             22.95000
##   tunuyan.humedad_med_T_1 tunuyan.temp_min_T_1 tunuyan.humedad_min_T_1
## 1                57.66667                13.65                      25
## 2                54.54167                13.65                      27
## 3                59.20833                17.48                      41
## 4                61.29167                11.47                      25
## 5                61.00000                15.83                       4
## 6                60.50000                15.55                      28
##   agua_amarga.temp_max_T_1 agua_amarga.humedad_max_T_1
## 1                    32.92                          61
## 2                    35.11                          50
## 3                    29.00                          74
## 4                    34.93                          60
## 5                    34.18                          73
## 6                    31.09                          77
##   agua_amarga.temp_med_T_1 agua_amarga.humedad_med_T_1
## 1                 24.41667                    41.37500
## 2                 25.96250                    37.20833
## 3                 23.44583                    53.33333
## 4                 24.66250                    42.62500
## 5                 25.50833                    48.41667
## 6                 23.49167                    49.08333
##   agua_amarga.temp_min_T_1 agua_amarga.humedad_min_T_1
## 1                    16.50                          25
## 2                    17.27                          25
## 3                    19.28                          37
## 4                    14.18                          20
## 5                    18.43                          35
## 6                    17.74                          30
##   las_paredes.temp_max_T_1 las_paredes.humedad_max_T_1
## 1                    35.94                          69
## 2                    41.04                          49
## 3                    30.78                          90
## 4                    36.25                          88
## 5                    37.62                          69
## 6                    32.89                          71
##   las_paredes.temp_med_T_1 las_paredes.humedad_med_T_1
## 1                 25.30000                    39.54167
## 2                 27.94583                    38.16667
## 3                 23.30000                    61.62500
## 4                 24.97083                    58.16667
## 5                 27.10833                    49.95833
## 6                 24.93750                    46.95833
##   las_paredes.temp_min_T_1 las_paredes.humedad_min_T_1
## 1                    13.85                          21
## 2                    19.15                          25
## 3                    16.96                          41
## 4                    15.88                          30
## 5                    19.75                          35
## 6                    15.42                          33
##   la_llave.temp_max_T_1 la_llave.humedad_max_T_1 la_llave.temp_med_T_1
## 1                 37.52                       78              26.22500
## 2                 39.31                       73              25.80870
## 3                 30.39                       91              22.90435
## 4                 36.79                       91              25.96667
## 5                 38.19                       71              26.63636
## 6                 34.00                       86              25.97500
##   la_llave.humedad_med_T_1 la_llave.temp_min_T_1 la_llave.humedad_min_T_1
## 1                 46.54167                 14.06                       28
## 2                 53.13043                 17.27                       36
## 3                 68.34783                 16.91                       46
## 4                 62.33333                 16.73                       35
## 5                 56.09091                 20.03                       41
## 6                 51.87500                 15.47                       38
##   junin.temp_min_t tunuyan.temp_min_t agua_amarga.temp_min_t
## 1            18.28              13.65                  17.27
## 2            20.55              17.48                  19.28
## 3            15.83              11.47                  14.18
## 4            17.87              15.83                  18.43
## 5            19.70              15.55                  17.74
## 6            20.49              13.21                  17.84
##   las_paredes.temp_min_t la_llave.temp_min_t
## 1                  19.15               17.27
## 2                  16.96               16.91
## 3                  15.88               16.73
## 4                  19.75               20.03
## 5                  15.42               15.47
## 6                  18.51               19.77
# train the random forest
model.rf1 <- randomForest(x = train.data$data[,-which(colnames(train.data$data) %in% train.data$vars)],y=train.data$data[,train.data$vars[1]], 
                      importance = TRUE, proximity=FALSE,
                      ntree=500, mtry=2)
# build the test set
test.data <- desfasar.dataset.T(T_value = 1,test.set,pred_sensores = pred_sensores)

# predict
pred.rf <- predict(model.rf1,test.data$data[,-which(colnames(test.data$data) %in% test.data$vars)] )
# evaluate the results
real <- test.data$data[,test.data$vars[1]]

df <- data.frame(pred=pred.rf, obs= real)

eee <- evaluate(pred.rf, real)
print(eee)
## $rmse
## [1] 2.06
## 
## $r2
## [1] 0.91
## 
## $sens
## [1] 0.55
## 
## $spec
## [1] 0.99
## 
## $prec
## [1] 0.88
## 
## $acc
## Accuracy 
##     0.93 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]     122     17
##    (0,50]      100   1527
##                                           
##                Accuracy : 0.9337          
##                  95% CI : (0.9211, 0.9449)
##     No Information Rate : 0.8743          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6412          
##  Mcnemar's Test P-Value : 3.432e-14       
##                                           
##             Sensitivity : 0.54955         
##             Specificity : 0.98899         
##          Pos Pred Value : 0.87770         
##          Neg Pred Value : 0.93854         
##               Precision : 0.87770         
##                  Recall : 0.54955         
##                      F1 : 0.67590         
##              Prevalence : 0.12571         
##          Detection Rate : 0.06908         
##    Detection Prevalence : 0.07871         
##       Balanced Accuracy : 0.76927         
##                                           
##        'Positive' Class : (-20,0]         
##                                           
## 
## $MAE
## [1] 1.61
plot(real, col = "red", ylab = "true (red) , pred (blue)", ylim = range(c(pred.rf,real)))
points(pred.rf, col = "blue") 
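
As a hedged consistency check: with T = 1 the lagged targets should be exactly the next-day reference values used earlier via ySlices, which would make this evaluation directly comparable to the previous one.

# hedged check: TRUE here would confirm both evaluations use the same targets
all.equal(unname(real), unname(test.set[unlist(ySlices), varpred]))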

Training randomForest (without caret) using timeSlices

Now I do use timeSlices, both to train the randomForest model and to predict.

timeSlices <- createTimeSlices(1:nrow(training.set), 
                               initialWindow = 1, horizon = 1, fixedWindow = TRUE)

str(timeSlices,max.level = 1)
## List of 2
##  $ train:List of 3750
##  $ test :List of 3750
Xx <- timeSlices[[1]] # predictor-row indices: rows 1..(n-1) of training.set
Yx <- timeSlices[[2]] # target-row indices: rows 2..n of training.set

# train on each day's readings to predict the next day's junin.temp_min
model.rf <- randomForest(x = training.set[unlist(Xx),],y=training.set[unlist(Yx),varpred], 
                         importance = TRUE, proximity=FALSE,
                         ntree=500, mtry=2)


pred <- predict(model.rf,test.set[unlist(xSlices),])

true <- test.set[unlist(ySlices),varpred]

plot(true, col = "red", ylab = "true (red) , pred (blue)", ylim = range(c(pred,true)))
points(pred, col = "blue") 

print(evaluate(pred, true))
## $rmse
## [1] 2.06
## 
## $r2
## [1] 0.91
## 
## $sens
## [1] 0.57
## 
## $spec
## [1] 0.99
## 
## $prec
## [1] 0.88
## 
## $acc
## Accuracy 
##     0.94 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]     126     18
##    (0,50]       96   1526
##                                          
##                Accuracy : 0.9354         
##                  95% CI : (0.923, 0.9465)
##     No Information Rate : 0.8743         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6543         
##  Mcnemar's Test P-Value : 5.525e-13      
##                                          
##             Sensitivity : 0.56757        
##             Specificity : 0.98834        
##          Pos Pred Value : 0.87500        
##          Neg Pred Value : 0.94081        
##               Precision : 0.87500        
##                  Recall : 0.56757        
##                      F1 : 0.68852        
##              Prevalence : 0.12571        
##          Detection Rate : 0.07135        
##    Detection Prevalence : 0.08154        
##       Balanced Accuracy : 0.77795        
##                                          
##        'Positive' Class : (-20,0]        
##                                          
## 
## $MAE
## [1] 1.61
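
Gathering the headline errors printed above makes the closing question easier to weigh. The first caret evaluation (RMSE 1.12, MAE 0.84) compares predictions against the same row's observations, so it is not directly comparable to the three one-step-ahead evaluations summarized here (values copied from the outputs above):

# headline errors of the three one-step-ahead evaluations, copied from the
# outputs printed earlier in this document
data.frame(
  approach = c("caret model, test.set shifted via xSlices/ySlices",
               "randomForest on lagged dataset (desfasar, T = 1)",
               "randomForest trained and tested on timeSlices"),
  rmse = c(2.40, 2.06, 2.06),
  MAE  = c(1.85, 1.61, 1.61)
)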

Which of all these approaches should we use?