To install the frost package, run:
# install.packages("devtools")
# library(devtools)
# install_github("anadiedrichs/frost")
# notebook published --> http://rpubs.com/adiedrichs/410193
# Select the column names that belong to the same sensor as a target variable.
#
# var: name of the variable to predict, e.g. "junin.temp_min"; the sensor name
#   is the text before its first ".".
# variables: column names (the set of variables) of the dataset.
# dataset_tmin_chaar: when TRUE, a "." is appended to the sensor name before
#   matching (workaround for a bug seen with that dataset).
# Returns the elements of `variables` that contain the sensor name.
vars.del.sensor <- function(var, variables, dataset_tmin_chaar = FALSE) {
  sensor_name <- unlist(strsplit(var, split = ".", fixed = TRUE))[1]
  if (dataset_tmin_chaar == TRUE) {
    # this is the part that differs, because of the bug
    sensor_name <- paste0(sensor_name, ".")
  }
  # literal (non-regex) substring match against every candidate name
  belongs_to_sensor <- grepl(sensor_name, variables, fixed = TRUE)
  # dropping the last (min_t) variable is intentionally disabled
  variables[belongs_to_sensor]
}
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
source("../dataset-processing.R")
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
source("../metrics.R")
# Load the "dacc" dataset (get.dataset is presumably defined in
# ../dataset-processing.R -- confirm).
dataset <- get.dataset("dacc")
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_date(format = ""),
## date = col_date(format = "")
## )
## See spec(...) for full column specifications.
data <- dataset$data
# Split the dataset into training and test parts.
### Training set and test dataset
# Fraction of the rows used for the training split.
porc_train <- 0.68
# Row index derived from that fraction; used as the split point.
until <- round(porc_train * nrow(data))
Consideramos la estación Junín
# Show the first target variable listed in dataset$pred.
print(dataset$pred[1])
## [1] "junin.temp_min"
# Keep only the columns that belong to the same sensor as that target.
vars <- vars.del.sensor(dataset$pred[1], colnames(data))
# Training rows are 1 .. until - 1. The previous `1:until-1` parsed as
# `(1:until) - 1`, i.e. 0:(until - 1), and selected the right rows only because
# R silently drops index 0; seq_len() states the intended range directly.
training.set <- data[seq_len(until - 1), vars] # used to learn the parameters
# Test rows are until .. nrow(data).
test.set <- data[until:nrow(data), vars]
library(frost)
Maldonado
# Dew point (calcDewPoint, mode "B") for the training and test partitions.
#
# Passing whole columns to calcDewPoint() produced the warning
# "In if (RH < 50) {: the condition has length > 1 and only the first element
# will be used": the branch was chosen from the first observation only and
# applied to every row. From R 4.2 that condition is an error instead of a
# warning. The function is therefore called once per observation.
#
# rh: relative-humidity values; temp: temperature values (same length).
# Returns a numeric vector with one dew point per observation; observations
# with a missing input yield NA instead of failing inside the scalar `if`.
dew.point.b <- function(rh, temp) {
  vapply(
    seq_along(rh),
    function(i) {
      if (is.na(rh[i]) || is.na(temp[i])) {
        return(NA_real_)
      }
      calcDewPoint(rh[i], temp[i], mode = "B")
    },
    numeric(1)
  )
}
dewpoint <- dew.point.b(training.set$junin.humedad_med, training.set$junin.temp_med)
dw.test <- dew.point.b(test.set$junin.humedad_med, test.set$junin.temp_med)
# NOTE(review): any evaluation output recorded in this file was produced with
# the previous whole-vector call; re-run the script to refresh it.
# Fit the Maldonado model (buildMdz) on the training partition.
model.mza <- buildMdz(
  dw = dewpoint,
  tempMax = training.set$junin.temp_max,
  tmin = training.set$junin.temp_min
)
# A vector of predictions is expected; if this call fails, sapply will be needed.
predmza <- predMdz(
  dw = dw.test,
  tempMax = test.set$junin.temp_max,
  model = model.mza
)
# Compare predictions with the test-set junin.temp_min column
# (evaluate is presumably defined in ../metrics.R -- confirm).
evaluate(predmza, test.set$junin.temp_min)
## $rmse
## [1] 17.44
##
## $r2
## [1] 0.79
##
## $sens
## [1] 0
##
## $spec
## [1] 1
##
## $prec
## [1] NA
##
## $acc
## Accuracy
## 0.87
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 0 0
## (0,50] 222 1545
##
## Accuracy : 0.8744
## 95% CI : (0.858, 0.8895)
## No Information Rate : 0.8744
## P-Value [Acc > NIR] : 0.5179
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8744
## Precision : NA
## Recall : 0.0000
## F1 : NA
## Prevalence : 0.1256
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 17.14
# Plot the Maldonado predictions (first argument) against the test-set junin.temp_min column.
plot(predmza,test.set$junin.temp_min)
Modelo FAO
# Fit the FAO model (buildFAO) on the training partition.
model.FAO <- buildFAO(
  dw = dewpoint,
  temp = training.set$junin.temp_med,
  tmin = training.set$junin.temp_min
)
# A vector of predictions is expected; if this call fails, sapply will be needed.
pred <- predFAO(
  model = model.FAO,
  t = test.set$junin.temp_med,
  dw = dw.test
)
# Compare the results with the test-set junin.temp_min column.
evaluate(pred, test.set$junin.temp_min)
## $rmse
## [1] 2.51
##
## $r2
## [1] 0.89
##
## $sens
## [1] 0.53
##
## $spec
## [1] 0.99
##
## $prec
## [1] 0.87
##
## $acc
## Accuracy
## 0.93
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 118 17
## (0,50] 104 1528
##
## Accuracy : 0.9315
## 95% CI : (0.9187, 0.9429)
## No Information Rate : 0.8744
## P-Value [Acc > NIR] : 3.206e-15
##
## Kappa : 0.6255
## Mcnemar's Test P-Value : 5.359e-15
##
## Sensitivity : 0.53153
## Specificity : 0.98900
## Pos Pred Value : 0.87407
## Neg Pred Value : 0.93627
## Precision : 0.87407
## Recall : 0.53153
## F1 : 0.66106
## Prevalence : 0.12564
## Detection Rate : 0.06678
## Detection Prevalence : 0.07640
## Balanced Accuracy : 0.76026
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 2.01
# Plot the FAO predictions (first argument) against the test-set junin.temp_min column.
plot(pred,test.set$junin.temp_min)
random forest
library(readr)
# Read previously stored predictions from a CSV file (path relative to the
# working directory); the "rf" in the file name presumably marks the
# random-forest run -- confirm.
dacc_junin_rf <- read_csv("dacc--junin.temp_min--normal--all--1--rf--Y-vs-Y_pred.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## y_real = col_double(),
## y_pred = col_double()
## )
# Score the stored predictions (y_pred) against the stored reference values (y_real).
evaluate(dacc_junin_rf$y_pred,dacc_junin_rf$y_real)
## $rmse
## [1] 1.12
##
## $r2
## [1] 0.97
##
## $sens
## [1] 0.77
##
## $spec
## [1] 0.99
##
## $prec
## [1] 0.93
##
## $acc
## Accuracy
## 0.96
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 171 13
## (0,50] 51 1532
##
## Accuracy : 0.9638
## 95% CI : (0.954, 0.972)
## No Information Rate : 0.8744
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8221
## Mcnemar's Test P-Value : 3.746e-06
##
## Sensitivity : 0.77027
## Specificity : 0.99159
## Pos Pred Value : 0.92935
## Neg Pred Value : 0.96778
## Precision : 0.92935
## Recall : 0.77027
## F1 : 0.84236
## Prevalence : 0.12564
## Detection Rate : 0.09677
## Detection Prevalence : 0.10413
## Balanced Accuracy : 0.88093
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 0.84
# Plot the stored y_pred column (first argument) against the stored y_real column.
plot(dacc_junin_rf$y_pred,dacc_junin_rf$y_real)
# Read previously stored predictions from a CSV file (path relative to the
# working directory); "bnReg" in the file name presumably marks a
# Bayesian-network regression run -- confirm.
dacc_junin_bn <- read_csv("dacc--junin.temp_min--normal--all--1--bnReg--Y-vs-Y_pred.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## y_real = col_double(),
## y_pred = col_double()
## )
# Score the stored predictions (y_pred) against the stored reference values (y_real).
evaluate(dacc_junin_bn$y_pred,dacc_junin_bn$y_real)
## $rmse
## [1] 1.17
##
## $r2
## [1] 0.97
##
## $sens
## [1] 0.84
##
## $spec
## [1] 0.99
##
## $prec
## [1] 0.9
##
## $acc
## Accuracy
## 0.97
##
## $cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction (-20,0] (0,50]
## (-20,0] 186 21
## (0,50] 36 1524
##
## Accuracy : 0.9677
## 95% CI : (0.9584, 0.9755)
## No Information Rate : 0.8744
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8488
## Mcnemar's Test P-Value : 0.06369
##
## Sensitivity : 0.8378
## Specificity : 0.9864
## Pos Pred Value : 0.8986
## Neg Pred Value : 0.9769
## Precision : 0.8986
## Recall : 0.8378
## F1 : 0.8671
## Prevalence : 0.1256
## Detection Rate : 0.1053
## Detection Prevalence : 0.1171
## Balanced Accuracy : 0.9121
##
## 'Positive' Class : (-20,0]
##
##
## $MAE
## [1] 0.88
# Plot the stored y_pred column (first argument) against the stored y_real column.
plot(dacc_junin_bn$y_pred,dacc_junin_bn$y_real)
Ignorar desde aquí hacia abajo
### AHORA dejando solo los dias claros/despejados
load dataset
#library(readr)
#df <- read_csv("~/phd-repos/datasets/darksky/competitor-frost-lib-iot-journal-fao-maldonado/dataset-FAO-test.csv",
# col_types = cols(time = col_character(),
# time_dw_temp = col_character()))
#train <- training.set[which(training.set$icon %in% c("clear-day","partly-cloudy-day")),]
#test <- test.set[which(test.set$icon %in% c("clear-day","partly-cloudy-day")),]