library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
source("bnlearnRegression.R")
source("bnlearn-utils.R")
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 5.8
## Loading required package: graph
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:graph':
##
## degree, edges
source("dataset-processing.R")
# Fetch the "dacc" dataset: the first entry of `dataset` is handed to
# get.dataset(), and the result is kept in `dd` for the steps below.
dd <- get.dataset(dataset[1L])
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_date(format = ""),
## date = col_date(format = "")
## )
## See spec(...) for full column specifications.
# Drop the first column of the loaded data (the date column, per the
# original author's note) so only the sensor readings remain.
sensores <- dd$data[-1]
# Names of the variables to predict, as provided by the dataset object.
pred_sensores <- dd$pred
set.seed(3456)

# Chronological split (no shuffling): rows before `until` are used to learn
# the parameters, rows from `until` onwards are held out for testing.
porc_train <- 0.68
until <- round(nrow(sensores) * porc_train)
# Rows 1 .. until - 1. The previous `1:until-1` parsed as `(1:until) - 1`,
# i.e. 0:(until - 1), and only picked the right rows because index 0 is
# silently dropped. seq_len() states the intent directly.
training.set <- as.data.frame(sensores[seq_len(max(until - 1, 0)), ])
test.set <- as.data.frame(sensores[until:nrow(sensores), ])

# Predictors (all training columns) and the numeric target, which is the
# first variable listed in `pred_sensores`.
X <- training.set
varpred <- pred_sensores[1]
Y <- as.numeric(training.set[, varpred])
print(varpred)
## [1] "junin.temp_min"
#real <- test.set[,pred_sensores[1]]
Modelo entrenado usando caret + timeSlices
# Restore the objects saved in the .RData file into the workspace; the code
# below expects it to provide `model`. Then show the tuning parameters that
# were selected for it.
load(file = "../dacc--junin.temp_min--normal--all--1--rf.RData")
print(model[["bestTune"]])
## mtry
## 1 2
# Gives mtry = 2, with ntree = 500 by default; these are the parameters used later to train the models.
Pruebo la predicción pasando test.set tal como viene, para ver qué hace caret internamente. Así es como se pasa test.set en muchos ejemplos de caret. ¿Es lo mismo para series temporales?
# Predict with the loaded model on the unmodified test rows, then score the
# predictions against the observed junin.temp_min values of those same rows.
pred.caret <- predict(model, newdata = test.set)
evaluate(pred.caret, test.set[["junin.temp_min"]])
## $rmse
## [1] 1.12
##
## $r2
## [1] 0.97
##
## $sens
## [1] 0.77
##
## $spec
## [1] 0.99
##
## $prec
## [1] 0.93
##
## $acc
## Accuracy
## 0.96
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 171 13
## (0,50] 51 1532
##
## Accuracy : 0.9638
## 95% CI : (0.954, 0.972)
## No Information Rate : 0.8744
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8221
## Mcnemar's Test P-Value : 3.746e-06
##
## Sensitivity : 0.77027
## Specificity : 0.99159
## Pos Pred Value : 0.92935
## Neg Pred Value : 0.96778
## Precision : 0.92935
## Recall : 0.77027
## F1 : 0.84236
## Prevalence : 0.12564
## Detection Rate : 0.09677
## Detection Prevalence : 0.10413
## Balanced Accuracy : 0.88093
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 0.84
# Observed values in red with the predictions overlaid in blue; the y-axis
# range is widened to cover both series.
plot(
  test.set[["junin.temp_min"]],
  ylim = range(c(pred.caret, test.set[["junin.temp_min"]])),
  ylab = "true (red) , pred (blue)",
  col = "red"
)
points(x = pred.caret, col = "blue")
Testeando el modelo de caret, pero pasando los datos de test.set en formato timeSlices: pruebo pasando el test.set como timeSlices al mismo modelo entrenado por caret (el anterior).
# Build one-step-ahead index pairs over the test rows: with a fixed window of
# one row and a horizon of one, each "train" slice holds a single row index
# and the matching "test" slice holds the index of the following row.
# seq_len() is used instead of `1:nrow(...)`, which yields c(1, 0) when the
# data frame has no rows.
timeSlicesTest <- createTimeSlices(
  seq_len(nrow(test.set)),
  initialWindow = 1, horizon = 1, fixedWindow = TRUE
)
# Access the two components by name rather than by position.
xSlices <- timeSlicesTest[["train"]]
ySlices <- timeSlicesTest[["test"]]
head(xSlices)
## $Training0001
## [1] 1
##
## $Training0002
## [1] 2
##
## $Training0003
## [1] 3
##
## $Training0004
## [1] 4
##
## $Training0005
## [1] 5
##
## $Training0006
## [1] 6
# Show the first six target-index slices (each is the row after its input).
head(ySlices, n = 6L)
## $Testing0001
## [1] 2
##
## $Testing0002
## [1] 3
##
## $Testing0003
## [1] 4
##
## $Testing0004
## [1] 5
##
## $Testing0005
## [1] 6
##
## $Testing0006
## [1] 7
# Feed the rows selected by xSlices to the same caret model, but score the
# predictions against the target on the rows selected by ySlices (the
# following row in each pair).
pred.caret2 <- predict(object = model, newdata = test.set[unlist(xSlices), ])
evaluate(pred.caret2, test.set[[varpred]][unlist(ySlices)])
## $rmse
## [1] 2.4
##
## $r2
## [1] 0.87
##
## $sens
## [1] 0.63
##
## $spec
## [1] 0.97
##
## $prec
## [1] 0.76
##
## $acc
## Accuracy
## 0.93
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 140 44
## (0,50] 82 1500
##
## Accuracy : 0.9287
## 95% CI : (0.9156, 0.9402)
## No Information Rate : 0.8743
## P-Value [Acc > NIR] : 8.413e-14
##
## Kappa : 0.6497
## Mcnemar's Test P-Value : 0.0009799
##
## Sensitivity : 0.63063
## Specificity : 0.97150
## Pos Pred Value : 0.76087
## Neg Pred Value : 0.94817
## Precision : 0.76087
## Recall : 0.63063
## F1 : 0.68966
## Prevalence : 0.12571
## Detection Rate : 0.07928
## Detection Prevalence : 0.10419
## Balanced Accuracy : 0.80107
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 1.85
# Observed next-row targets in red with the sliced predictions overlaid in
# blue; the y-axis range covers both series.
plot(
  test.set[[varpred]][unlist(ySlices)],
  ylim = range(c(pred.caret2, test.set[[varpred]][unlist(ySlices)])),
  ylab = "true (red) , pred (blue)",
  col = "red"
)
points(x = pred.caret2, col = "blue")
¿Será que caret ya lo vuelve a convertir internamente en timeSlices y rompo el formato?
Entrenar randomForest con mtry=2 usando T=1 y mi método de desfasar el dataset. Este método renombra las variables temporales previas, por lo que, a los fines del modelo de random forest, se crean más variables. Por este motivo creo que el resultado es diferente.
# Build the lagged (T = 1) version of the training set with the project
# helper. The result exposes `vars` (names of the time-t target columns) and
# `data` (the lagged data frame); both are printed for inspection.
train.data <- desfasar.dataset.T(
  training.set,
  T_value = 1,
  pred_sensores = pred_sensores
)
print(train.data[["vars"]])
## [1] "junin.temp_min_t" "tunuyan.temp_min_t"
## [3] "agua_amarga.temp_min_t" "las_paredes.temp_min_t"
## [5] "la_llave.temp_min_t"
print(head(train.data[["data"]]))
## junin.temp_max_T_1 junin.humedad_max_T_1 junin.temp_med_T_1
## 1 38.50 79 27.84583
## 2 38.58 69 28.35833
## 3 31.53 78 25.12500
## 4 37.85 96 26.40417
## 5 34.82 97 26.53333
## 6 33.38 91 25.81250
## junin.humedad_med_T_1 junin.temp_min_T_1 junin.humedad_min_T_1
## 1 37.12500 16.89 13
## 2 41.66667 18.28 16
## 3 53.83333 20.55 26
## 4 58.58333 15.83 1
## 5 66.95833 17.87 42
## 6 56.04167 19.70 28
## tunuyan.temp_max_T_1 tunuyan.humedad_max_T_1 tunuyan.temp_med_T_1
## 1 33.41 87 23.20833
## 2 35.16 83 23.85833
## 3 29.28 83 23.25417
## 4 34.57 90 22.92917
## 5 34.10 85 24.42083
## 6 32.40 87 22.95000
## tunuyan.humedad_med_T_1 tunuyan.temp_min_T_1 tunuyan.humedad_min_T_1
## 1 57.66667 13.65 25
## 2 54.54167 13.65 27
## 3 59.20833 17.48 41
## 4 61.29167 11.47 25
## 5 61.00000 15.83 4
## 6 60.50000 15.55 28
## agua_amarga.temp_max_T_1 agua_amarga.humedad_max_T_1
## 1 32.92 61
## 2 35.11 50
## 3 29.00 74
## 4 34.93 60
## 5 34.18 73
## 6 31.09 77
## agua_amarga.temp_med_T_1 agua_amarga.humedad_med_T_1
## 1 24.41667 41.37500
## 2 25.96250 37.20833
## 3 23.44583 53.33333
## 4 24.66250 42.62500
## 5 25.50833 48.41667
## 6 23.49167 49.08333
## agua_amarga.temp_min_T_1 agua_amarga.humedad_min_T_1
## 1 16.50 25
## 2 17.27 25
## 3 19.28 37
## 4 14.18 20
## 5 18.43 35
## 6 17.74 30
## las_paredes.temp_max_T_1 las_paredes.humedad_max_T_1
## 1 35.94 69
## 2 41.04 49
## 3 30.78 90
## 4 36.25 88
## 5 37.62 69
## 6 32.89 71
## las_paredes.temp_med_T_1 las_paredes.humedad_med_T_1
## 1 25.30000 39.54167
## 2 27.94583 38.16667
## 3 23.30000 61.62500
## 4 24.97083 58.16667
## 5 27.10833 49.95833
## 6 24.93750 46.95833
## las_paredes.temp_min_T_1 las_paredes.humedad_min_T_1
## 1 13.85 21
## 2 19.15 25
## 3 16.96 41
## 4 15.88 30
## 5 19.75 35
## 6 15.42 33
## la_llave.temp_max_T_1 la_llave.humedad_max_T_1 la_llave.temp_med_T_1
## 1 37.52 78 26.22500
## 2 39.31 73 25.80870
## 3 30.39 91 22.90435
## 4 36.79 91 25.96667
## 5 38.19 71 26.63636
## 6 34.00 86 25.97500
## la_llave.humedad_med_T_1 la_llave.temp_min_T_1 la_llave.humedad_min_T_1
## 1 46.54167 14.06 28
## 2 53.13043 17.27 36
## 3 68.34783 16.91 46
## 4 62.33333 16.73 35
## 5 56.09091 20.03 41
## 6 51.87500 15.47 38
## junin.temp_min_t tunuyan.temp_min_t agua_amarga.temp_min_t
## 1 18.28 13.65 17.27
## 2 20.55 17.48 19.28
## 3 15.83 11.47 14.18
## 4 17.87 15.83 18.43
## 5 19.70 15.55 17.74
## 6 20.49 13.21 17.84
## las_paredes.temp_min_t la_llave.temp_min_t
## 1 19.15 17.27
## 2 16.96 16.91
## 3 15.88 16.73
## 4 19.75 20.03
## 5 15.42 15.47
## 6 18.51 19.77
# Train the random forest on the lagged predictors only: every column listed
# in `train.data$vars` (the time-t targets) is excluded from `x`, and the
# first of those targets is the response. A logical mask replaces
# `-which(...)`, which would select zero columns if no name matched, and
# `drop = FALSE` keeps `x` a data frame even with a single predictor.
model.rf1 <- randomForest(
  x = train.data$data[
    , !(colnames(train.data$data) %in% train.data$vars),
    drop = FALSE
  ],
  y = train.data$data[, train.data$vars[1]],
  importance = TRUE, proximity = FALSE,
  ntree = 500, mtry = 2
)
# Build the test set with the same lagging procedure.
test.data <- desfasar.dataset.T(T_value = 1, test.set, pred_sensores = pred_sensores)
# Predict from the lagged test predictors (target columns excluded as above).
pred.rf <- predict(
  model.rf1,
  test.data$data[
    , !(colnames(test.data$data) %in% test.data$vars),
    drop = FALSE
  ]
)
# Evaluate the predictions against the observed first target variable.
real <- test.data$data[, test.data$vars[1]]
df <- data.frame(pred = pred.rf, obs = real)
eee <- evaluate(pred.rf, real)
print(eee)
## $rmse
## [1] 2.06
##
## $r2
## [1] 0.91
##
## $sens
## [1] 0.55
##
## $spec
## [1] 0.99
##
## $prec
## [1] 0.88
##
## $acc
## Accuracy
## 0.93
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 122 17
## (0,50] 100 1527
##
## Accuracy : 0.9337
## 95% CI : (0.9211, 0.9449)
## No Information Rate : 0.8743
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6412
## Mcnemar's Test P-Value : 3.432e-14
##
## Sensitivity : 0.54955
## Specificity : 0.98899
## Pos Pred Value : 0.87770
## Neg Pred Value : 0.93854
## Precision : 0.87770
## Recall : 0.54955
## F1 : 0.67590
## Prevalence : 0.12571
## Detection Rate : 0.06908
## Detection Prevalence : 0.07871
## Balanced Accuracy : 0.76927
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 1.61
# Observed targets in red with the random-forest predictions overlaid in
# blue; the y-axis range covers both series.
plot(
  real,
  ylim = range(c(pred.rf, real)),
  ylab = "true (red) , pred (blue)",
  col = "red"
)
points(x = pred.rf, col = "blue")
Ahora sí uso timeSlices para entrenar el modelo randomForest y para predecir.
# One-step-ahead index pairs over the training rows (window of one row,
# horizon of one). seq_len() is used instead of `1:nrow(...)`, which yields
# c(1, 0) when the data frame has no rows.
timeSlices <- createTimeSlices(
  seq_len(nrow(training.set)),
  initialWindow = 1, horizon = 1, fixedWindow = TRUE
)
str(timeSlices, max.level = 1)
## List of 2
## $ train:List of 3750
## $ test :List of 3750
# Access the input and target index lists by name rather than by position.
Xx <- timeSlices[["train"]]
Yx <- timeSlices[["test"]]
# Fit the forest so that the rows indexed by Xx (all columns, including the
# target's own value on that row) predict the target on the rows indexed by Yx.
model.rf <- randomForest(
  x = training.set[unlist(Xx), ],
  y = training.set[unlist(Yx), varpred],
  importance = TRUE, proximity = FALSE,
  ntree = 500, mtry = 2
)
# Predict on the test set using the test slices built earlier in this script
# (`xSlices` / `ySlices` must already exist).
pred <- predict(model.rf, test.set[unlist(xSlices), ])
true <- test.set[unlist(ySlices), varpred]
# Observed values in red, predictions in blue, on a shared y-axis range.
plot(true, col = "red", ylab = "true (red) , pred (blue)", ylim = range(c(pred, true)))
points(pred, col = "blue")
print(evaluate(pred, true))
## $rmse
## [1] 2.06
##
## $r2
## [1] 0.91
##
## $sens
## [1] 0.57
##
## $spec
## [1] 0.99
##
## $prec
## [1] 0.88
##
## $acc
## Accuracy
## 0.94
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 126 18
## (0,50] 96 1526
##
## Accuracy : 0.9354
## 95% CI : (0.923, 0.9465)
## No Information Rate : 0.8743
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6543
## Mcnemar's Test P-Value : 5.525e-13
##
## Sensitivity : 0.56757
## Specificity : 0.98834
## Pos Pred Value : 0.87500
## Neg Pred Value : 0.94081
## Precision : 0.87500
## Recall : 0.56757
## F1 : 0.68852
## Prevalence : 0.12571
## Detection Rate : 0.07135
## Detection Prevalence : 0.08154
## Balanced Accuracy : 0.77795
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 1.61