Loading datasets

Two datasets are used:

# Dataset 1: Station vs. CMORPH bilinear
station_vs_cmorph_mza <- read.csv(
  file = "/home/hadoop/paula/experiments/experiment3-cmorph-correction/station-vs-cmorph-mza-daily.csv",
  sep = ',', header = FALSE)
names(station_vs_cmorph_mza) <- c("ts", "station", "cmorph")

# Dataset 2: Station vs. 4 CMORPH points
mza_cmorph_station_fit_data <- read.csv(
  file = "/home/hadoop/paula/experiments/experiment3-cmorph-correction/mza_cmorph_station_fit.csv")

Bilinear extrapolation results from David

Bilinear extrapolation shows a Pearson correlation coefficient of 0.0612153.
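This value can be recomputed with cor(); a minimal sketch, assuming the reported coefficient was taken between the station and cmorph columns of dataset 1 loaded above:

# Sketch: Pearson correlation between station observations and the bilinearly
# interpolated CMORPH values (assumes the reported coefficient was computed on
# these two columns of dataset 1).
cor(station_vs_cmorph_mza$station, station_vs_cmorph_mza$cmorph, method = "pearson")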

Time series plot, averaging every 5 data points

library(zoo)  # provides rollapply
plot(rollapply(station_vs_cmorph_mza$station, 5, mean, by = 5), type = 'l', col = 'red', ylab = 'precip',
     main = "Station vs CMORPH cumulative events for MZA (avg 5 data points)", xaxt = 'n')
lines(rollapply(station_vs_cmorph_mza$cmorph, 5, mean, by = 5), col = 'blue')
legend("topleft", legend = c("station", "cmorph"), col = c("red", "blue"), cex = 0.8, bty = "n", lty = 1)

Scatter plot of observed vs. predicted

plot(station_vs_cmorph_mza$cmorph, station_vs_cmorph_mza$station, col = 'skyblue',
     main = "predictions vs. observations", xlab = "prediction (CMORPH)", ylab = "observation (station)")

Evaluation of Random Forest

Dataset setup.

Only 90% of dataset 2 is used for training; the full (100%) dataset is used for testing.

library(caret)
library(dplyr)

set.seed(212312)
# Hold out 10% of dataset 2; train on the remaining 90% (drop the ts column).
trainindex <- createDataPartition(mza_cmorph_station_fit_data$station, p = 0.9, list = FALSE, times = 1)
trainset <- mza_cmorph_station_fit_data[trainindex, -c(1)]

# Test on the full dataset (records with ts <= 2014-12-31), using only the 4 CMORPH predictors.
testset_full <- mza_cmorph_station_fit_data %>% filter(ts <= 20141231)
testset <- testset_full %>% select(cmorph1, cmorph2, cmorph3, cmorph4)

Random Forest tuning with 10-fold cross-validation and RMSE as the selection metric

ctrl_fast <- trainControl(method = "cv",        # 10-fold cross-validation
                          number = 10,
                          # summaryFunction = twoClassSummary,
                          verboseIter = TRUE,
                          classProbs = FALSE,   # regression task, no class probabilities
                          savePredictions = TRUE,
                          allowParallel = TRUE)
# Random Forest
rfFit <- train(station ~ .,
               data = trainset,
               metric='RMSE',
               method = "rf",
               trControl = ctrl_fast)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set

Cross-validation results

Model and cross-validation results

rfFit
## Random Forest 
## 
## 1645 samples
##    4 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1481, 1480, 1480, 1480, 1481, 1481, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared  
##   2     6.733634  0.09897565
##   3     6.837732  0.09676287
##   4     6.961563  0.09364781
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 2.
rfFit$resample
##         RMSE   Rsquared Resample
## 1   5.018049 0.00709514   Fold02
## 2   5.004449 0.05762942   Fold01
## 3   5.155153 0.16826051   Fold03
## 4   8.129372 0.01725069   Fold06
## 5   6.407142 0.12019942   Fold05
## 6   9.168591 0.03705400   Fold04
## 7   5.454564 0.22172733   Fold07
## 8  10.811764 0.04414834   Fold10
## 9   6.905912 0.27436783   Fold09
## 10  5.281344 0.04202387   Fold08
bwplot(~RMSE, rfFit$resample, xlab = "cross-validation RMSE over 10 folds")

Predictions on the complete dataset (including the 10% held out during training)

rfpredicts <- predict(rfFit, testset)
predict_data <- data.frame(observation = testset_full$station, prediction = rfpredicts)

The Random Forest model shows a Pearson correlation coefficient of 0.7931453.
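A minimal sketch for recomputing this value, assuming it is the Pearson correlation between the station observations and the Random Forest predictions assembled in predict_data above:

# Sketch: Pearson correlation between observed station values and RF predictions
# (assumes the reported coefficient was computed on predict_data as built above).
cor(predict_data$observation, predict_data$prediction, method = "pearson")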

Time series plot, averaging every 5 data points

plot(rollapply(predict_data$observation, 5, mean, by = 5), type = 'l', col = 'red', ylab = 'precip',
     main = "Station vs RF predictions for MZA (avg 5 data points)")
lines(rollapply(predict_data$prediction, 5, mean, by = 5), col = 'blue')
legend("topleft", legend = c("station", "RF predictions"), col = c("red", "blue"), cex = 0.8, bty = "n", lty = 1)

Scatter plot of observed vs. predicted

plot(predict_data,col='skyblue',main="predictions vs. observations")