Two dataset used:
# Dataset Station vs. Cmorph bilinear
station_vs_cmorph_mza=read.csv(
file="/home/hadoop/paula/experiments/experiment3-cmorph-correction/station-vs-cmorph-mza-daily.csv",sep=',',header=F)
names(station_vs_cmorph_mza)<-c("ts","station","cmorph")
# Dataset Station vs. 4 Cmorph points
mza_cmorph_station_fit_data=read.csv(file="/home/hadoop/paula/experiments/experiment3-cmorph-correction/mza_cmorph_station_fit.csv")
Bilinear extrapolation shows a Pearson Correlation Coeficient of 0.0612153
plot(rollapply(station_vs_cmorph_mza$station,5,mean,by=5),type='l',col='red',ylab='precip',main="Station vs CMORPH cummulative events for MZA (avg 5 data points)",xaxt='n')
lines(rollapply(station_vs_cmorph_mza$cmorph,5,mean,by=5),col='blue')
legend("topleft",legend=c("station","cmorph"),col=c("red","blue"),cex=0.8,bty="n",lty=1)
plot(station_vs_cmorph_mza$cmorph,station_vs_cmorph_mza$station,col='skyblue',main="predictions vs. observations",ylab="prediction",xlab="observation")
Selecting only the 90% of the dataset 2 for training. However, the 100% dataset is used for testing.
set.seed(212312)
trainindex <- createDataPartition(mza_cmorph_station_fit_data$station, p = 0.9, list=FALSE, times=1)
trainset <- mza_cmorph_station_fit_data[trainindex,-c(1)]
testset_full=mza_cmorph_station_fit_data %>% filter(ts<=20141231)
testset= testset_full %>% select(cmorph1,cmorph2,cmorph3,cmorph4)
ctrl_fast <- trainControl(method="cv",
repeats=2,
number=10,
# summaryFunction=twoClassSummary,
verboseIter=T,
classProbs=F,
savePredictions = TRUE,
allowParallel = TRUE)
# Random Forest
rfFit <- train(station ~ .,
data = trainset,
metric='RMSE',
method = "rf",
trControl = ctrl_fast)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
Model and crossvalidation results
rfFit
## Random Forest
##
## 1645 samples
## 4 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1481, 1480, 1480, 1480, 1481, 1481, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 6.733634 0.09897565
## 3 6.837732 0.09676287
## 4 6.961563 0.09364781
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
rfFit$resample
## RMSE Rsquared Resample
## 1 5.018049 0.00709514 Fold02
## 2 5.004449 0.05762942 Fold01
## 3 5.155153 0.16826051 Fold03
## 4 8.129372 0.01725069 Fold06
## 5 6.407142 0.12019942 Fold05
## 6 9.168591 0.03705400 Fold04
## 7 5.454564 0.22172733 Fold07
## 8 10.811764 0.04414834 Fold10
## 9 6.905912 0.27436783 Fold09
## 10 5.281344 0.04202387 Fold08
bwplot(~RMSE,rfFit$resample,xlab="crossvalidation RMSE over 10 folds")
rfpredicts=predict(rfFit,testset)
predict_data=as.data.frame(cbind(observation=testset_full$station,prediction=rfpredicts))
Random Forest model shows a Pearson Correlation Coeficiente of 0.7931453
plot(rollapply(predict_data$observation,5,mean,by=5),type='l',col='red',ylab='precip',main="Station vs RF predictions for MZA (avg 5 data points)")
lines( rollapply(predict_data$prediction,5,mean,by=5),col='blue')
legend("topleft",legend=c("station","RF predictions"),col=c("red","blue"),cex=0.8,bty="n",lty=1)
plot(predict_data,col='skyblue',main="predictions vs. observations")