fao-model-dataset.R

To install the frost package, run:

# install.packages("devtools")
# library(devtools)
# install_github("anadiedrichs/frost")

# notebook published --> http://rpubs.com/adiedrichs/410193

#' Select the variables that belong to the same sensor as a target variable.
#'
#' The sensor name is the part of `var` before its first literal "."; every
#' entry of `variables` that contains that name as a literal substring is
#' returned, in its original order.
#'
#' @param var Name of the variable to predict, e.g. "junin.temp_min".
#' @param variables Candidate variable names, typically the dataset's
#'   column names.
#' @param dataset_tmin_chaar When TRUE, a "." is appended to the sensor name
#'   before matching. The original note says this differs because of a bug;
#'   presumably it avoids matching sensors that share a prefix -- TODO confirm.
#' @return The elements of `variables` that contain the sensor name.
vars.del.sensor <- function(var, variables, dataset_tmin_chaar = FALSE) {
  # Everything before the first literal dot identifies the sensor.
  pieces <- unlist(strsplit(var, split = ".", fixed = TRUE))
  sensor_id <- pieces[1]

  if (dataset_tmin_chaar == TRUE) {
    sensor_id <- paste0(sensor_id, ".")
  }

  # Keep every variable related to the sensor. The last (min_t) variable is
  # intentionally not dropped here.
  is_sensor_var <- grepl(sensor_id, variables, fixed = TRUE)
  variables[is_sensor_var]
}
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

source("../dataset-processing.R")

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

source("../metrics.R")

# Load the "dacc" dataset. get.dataset() is not defined in this file
# (presumably it comes from ../dataset-processing.R -- TODO confirm).
dataset <- get.dataset("dacc")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_date(format = ""),
##   date = col_date(format = "")
## )

## See spec(...) for full column specifications.

# Table of observations used for fitting and evaluating the models.
data <- dataset$data

# Split the dataset into training and test sets.
### Training set and test set
# Fraction of the rows assigned to the training split.
porc_train <- 0.68
# Row count scaled by the training fraction, rounded to a whole row index.
until <- round(nrow(data) * porc_train)

We consider the Junín station.

# Print the first entry of dataset$pred, the target variable used below.
print(dataset$pred[1])

## [1] "junin.temp_min"

# Columns that belong to the sensor of the first prediction target.
vars <- vars.del.sensor(dataset$pred[1], colnames(data))
# Training rows are 1 .. until - 1. The earlier `1:until-1` parsed as
# `(1:until) - 1`, i.e. 0:(until - 1), and only selected the right rows
# because a 0 index is silently dropped; seq_len() states the range directly.
training.set <- data[seq_len(until - 1), vars] # used to learn the parameters
# Test rows run from `until` to the last row.
test.set <- data[until:nrow(data), vars]

library(frost)

Maldonado

# Dew point (mode "B") from mean relative humidity and mean temperature.
#
# calcDewPoint() evaluates `if (RH < 50)`, which is a scalar test. Passing
# whole columns produced
#   "Warning in if (RH < 50) {: the condition has length > 1 and only the
#    first element will be used"
# so only the first observation decided the branch, and since R 4.2.0 a
# condition of length > 1 is an error. Calling the function once per
# observation keeps the test scalar.
# NOTE(review): rendered results further down were produced with the
# vectorised call and may differ slightly after this change.
dewpoint <- mapply(
  calcDewPoint,
  training.set$junin.humedad_med,
  training.set$junin.temp_med,
  MoreArgs = list(mode = "B"),
  USE.NAMES = FALSE
)

# Same per-observation computation for the test split.
dw.test <- mapply(
  calcDewPoint,
  test.set$junin.humedad_med,
  test.set$junin.temp_med,
  MoreArgs = list(mode = "B"),
  USE.NAMES = FALSE
)

# Fit the Maldonado model (frost::buildMdz) on the training split.
model.mza <- buildMdz(
  dw = dewpoint,
  tempMax = training.set$junin.temp_max,
  tmin = training.set$junin.temp_min
)

# A vector of predictions is expected here; if this call fails, predMdz()
# will have to be applied per observation (e.g. with sapply).
predmza <- predMdz(
  dw = dw.test,
  tempMax = test.set$junin.temp_max,
  model = model.mza
)

# Score the predictions against the observed minimum temperatures.
evaluate(predmza, test.set$junin.temp_min)

## $rmse
## [1] 17.44
## 
## $r2
## [1] 0.79
## 
## $sens
## [1] 0
## 
## $spec
## [1] 1
## 
## $prec
## [1] NA
## 
## $acc
## Accuracy 
##     0.87 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]       0      0
##    (0,50]      222   1545
##                                          
##                Accuracy : 0.8744         
##                  95% CI : (0.858, 0.8895)
##     No Information Rate : 0.8744         
##     P-Value [Acc > NIR] : 0.5179         
##                                          
##                   Kappa : 0              
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.0000         
##             Specificity : 1.0000         
##          Pos Pred Value :    NaN         
##          Neg Pred Value : 0.8744         
##               Precision :     NA         
##                  Recall : 0.0000         
##                      F1 :     NA         
##              Prevalence : 0.1256         
##          Detection Rate : 0.0000         
##    Detection Prevalence : 0.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : (-20,0]        
##                                          
## 
## $MAE
## [1] 17.14

# Scatter plot: Maldonado predictions (x) vs. observed minimum temperature (y).
plot(predmza, test.set$junin.temp_min)

FAO model

# Fit the FAO model (frost::buildFAO) on the training split.
model.FAO <- buildFAO(
  dw = dewpoint,
  temp = training.set$junin.temp_med,
  tmin = training.set$junin.temp_min
)

# A vector of predictions is expected here; if this call fails, predFAO()
# will have to be applied per observation (e.g. with sapply).
pred <- predFAO(
  model = model.FAO,
  t = test.set$junin.temp_med,
  dw = dw.test
)

# Compare the predictions with the observed minimum temperatures.
evaluate(pred, test.set$junin.temp_min)

## $rmse
## [1] 2.51
## 
## $r2
## [1] 0.89
## 
## $sens
## [1] 0.53
## 
## $spec
## [1] 0.99
## 
## $prec
## [1] 0.87
## 
## $acc
## Accuracy 
##     0.93 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]     118     17
##    (0,50]      104   1528
##                                           
##                Accuracy : 0.9315          
##                  95% CI : (0.9187, 0.9429)
##     No Information Rate : 0.8744          
##     P-Value [Acc > NIR] : 3.206e-15       
##                                           
##                   Kappa : 0.6255          
##  Mcnemar's Test P-Value : 5.359e-15       
##                                           
##             Sensitivity : 0.53153         
##             Specificity : 0.98900         
##          Pos Pred Value : 0.87407         
##          Neg Pred Value : 0.93627         
##               Precision : 0.87407         
##                  Recall : 0.53153         
##                      F1 : 0.66106         
##              Prevalence : 0.12564         
##          Detection Rate : 0.06678         
##    Detection Prevalence : 0.07640         
##       Balanced Accuracy : 0.76026         
##                                           
##        'Positive' Class : (-20,0]         
##                                           
## 
## $MAE
## [1] 2.01

# Scatter plot: FAO predictions (x) vs. observed minimum temperature (y).
plot(pred, test.set$junin.temp_min)

Random forest

# readr provides read_csv().
library(readr)
# Read the stored real-vs-predicted values of the "rf" run for junin.temp_min
# (path is relative to the working directory). The first CSV column has no
# header, so readr names it X1; the other columns are y_real and y_pred.
dacc_junin_rf <- read_csv("dacc--junin.temp_min--normal--all--1--rf--Y-vs-Y_pred.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   y_real = col_double(),
##   y_pred = col_double()
## )

# Score the stored random-forest predictions against the real values.
evaluate(dacc_junin_rf$y_pred, dacc_junin_rf$y_real)

## $rmse
## [1] 1.12
## 
## $r2
## [1] 0.97
## 
## $sens
## [1] 0.77
## 
## $spec
## [1] 0.99
## 
## $prec
## [1] 0.93
## 
## $acc
## Accuracy 
##     0.96 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]     171     13
##    (0,50]       51   1532
##                                         
##                Accuracy : 0.9638        
##                  95% CI : (0.954, 0.972)
##     No Information Rate : 0.8744        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8221        
##  Mcnemar's Test P-Value : 3.746e-06     
##                                         
##             Sensitivity : 0.77027       
##             Specificity : 0.99159       
##          Pos Pred Value : 0.92935       
##          Neg Pred Value : 0.96778       
##               Precision : 0.92935       
##                  Recall : 0.77027       
##                      F1 : 0.84236       
##              Prevalence : 0.12564       
##          Detection Rate : 0.09677       
##    Detection Prevalence : 0.10413       
##       Balanced Accuracy : 0.88093       
##                                         
##        'Positive' Class : (-20,0]       
##                                         
## 
## $MAE
## [1] 0.84

# Scatter plot: random-forest predictions (x) vs. real values (y).
plot(dacc_junin_rf$y_pred, dacc_junin_rf$y_real)

# Read the stored real-vs-predicted values of the "bnReg" run for
# junin.temp_min (path is relative to the working directory). The first CSV
# column has no header, so readr names it X1.
dacc_junin_bn <- read_csv("dacc--junin.temp_min--normal--all--1--bnReg--Y-vs-Y_pred.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   y_real = col_double(),
##   y_pred = col_double()
## )

# Score the stored bnReg predictions against the real values.
evaluate(dacc_junin_bn$y_pred, dacc_junin_bn$y_real)

## $rmse
## [1] 1.17
## 
## $r2
## [1] 0.97
## 
## $sens
## [1] 0.84
## 
## $spec
## [1] 0.99
## 
## $prec
## [1] 0.9
## 
## $acc
## Accuracy 
##     0.97 
## 
## $cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction (-20,0] (0,50]
##    (-20,0]     186     21
##    (0,50]       36   1524
##                                           
##                Accuracy : 0.9677          
##                  95% CI : (0.9584, 0.9755)
##     No Information Rate : 0.8744          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8488          
##  Mcnemar's Test P-Value : 0.06369         
##                                           
##             Sensitivity : 0.8378          
##             Specificity : 0.9864          
##          Pos Pred Value : 0.8986          
##          Neg Pred Value : 0.9769          
##               Precision : 0.8986          
##                  Recall : 0.8378          
##                      F1 : 0.8671          
##              Prevalence : 0.1256          
##          Detection Rate : 0.1053          
##    Detection Prevalence : 0.1171          
##       Balanced Accuracy : 0.9121          
##                                           
##        'Positive' Class : (-20,0]         
##                                           
## 
## $MAE
## [1] 0.88

# Scatter plot: bnReg predictions (x) vs. real values (y).
plot(dacc_junin_bn$y_pred, dacc_junin_bn$y_real)

Ignore everything from here down.

### AHORA dejando solo los dias claros/despejados

Load the dataset

#library(readr)
#df <- read_csv("~/phd-repos/datasets/darksky/competitor-frost-lib-iot-journal-fao-maldonado/dataset-FAO-test.csv", 
#                             col_types = cols(time = col_character(), 
#                                              time_dw_temp = col_character()))

#train <- training.set[which(training.set$icon %in% c("clear-day","partly-cloudy-day")),]
#test <-  test.set[which(test.set$icon  %in% c("clear-day","partly-cloudy-day")),]

fao-model-dataset.R

ana

Thu Feb 14 02:20:17 2019