library(lubridate)
library(tidyverse)
library(pROC)
library(plotROC)
library(caret)
library(doMC)
registerDoMC(cores=8)
winter and summer data was merged in one file
data %>% group_by(eng_season) %>% summarise(total=n())
Creating FNL lags for 6, 12, 18 and 24 hs before the target_day
lags<- data %>% select(-eng_season, -target_timestamp, -target_year, -target_day, -target_hour,-eng_cyclic_day, -eng_nino_year, -eng_nino_month, -eng_zonda,-target_month, -precip_3hr_sum) %>% imap_dfc(~set_names(map(1:4, lag, x = .x),
paste(.y, 'lag', c("6","12","18","24"), sep = '_')))
trainset<-lags %>% bind_cols(data %>% select(eng_season, target_timestamp, target_year, target_day, eng_cyclic_day, eng_nino_year, eng_nino_month, eng_zonda,target_month), .) %>% slice(7:nrow(data))
# considering only timestamp at 00 hour
trainset<-trainset[seq(1,nrow(trainset),4),]
# Removing the hours from timestamp
trainset<-trainset %>% mutate(target_timestamp=as.integer(target_timestamp/100))
head(trainset)
The cmorph daily file contains the 4 closest geo-points to Mendoza Aerodrome station. Taken from From ./2.selected/cmorph_mza_daily/cmorph_paste_mza_dly
cmorph_daily<-read_delim("data/cmorph_mza_dly",delim =" ")
Parsed with column specification:
cols(
`1998010100` = col_integer(),
`0.496000` = col_double(),
`0.340000` = col_double(),
`0.168000` = col_double(),
`0.157000` = col_double()
)
names(cmorph_daily)<-c("target_timestamp","cmorph1","cmorph2","cmorph3","cmorph4")
head(cmorph_daily)
Merging Cmoprh daily with previous trainset
trainset_merged <- inner_join(trainset,cmorph_daily %>% mutate(target_timestamp=as.integer(target_timestamp/100)),by="target_timestamp")
The two closest point to MWO are cmorph1 and cmorph3
library(ggmap)
library(sp)
mza_map <- get_map(location = "mendoza", zoom = 10,maptype = "roadmap",color = "bw")
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=mendoza&zoom=10&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=mendoza&sensor=false
mza_cmorph_geo_data=read.csv("/home/harpo/Dropbox/ongoing-work/meteo/scripts/R/cmorph_mza_data_geo.csv",header=T,sep=",")
selected_mza_cmorph_geo_data<-mza_cmorph_geo_data %>%
filter(cuadrante=="5_5" | cuadrante=="5_6" | cuadrante=="6_6" | cuadrante=="6_5" )
selected_mza_cmorph_geo_data<-cbind(nombre=c("cmorph1","cmorph2","cmorph3","cmorph4"),selected_mza_cmorph_geo_data)
gg<-ggmap(mza_map)+
geom_point(aes(x=lon - 360 ,y=lat),data=selected_mza_cmorph_geo_data,color='red')+
geom_text(aes(x=lon - 360.03 ,y=lat+0.02,label=cuadrante),size=3,data=selected_mza_cmorph_geo_data,color='red')+
geom_text(aes(x=lon - 360.03 ,y=lat+0.04,label=nombre),size=3,data=selected_mza_cmorph_geo_data,color='red')+
geom_point(y=-32.8331,x=-68.7715,size=3,color='blue')
gg
threshold=0.0
The point corresponding to CMORPH3 5_6 was chosen as the target precipitation volume
trainset_labeled <- trainset_merged %>%
mutate(precip_reg=cmorph3,precip=ifelse(precip_reg>threshold,'yes','no'),eng_season_num=ifelse(eng_season=="summer",1,0)) %>%
select(-cmorph1, -cmorph2, -cmorph3, -cmorph4, -eng_season)
trainset_labeled %>% group_by(eng_season_num,precip) %>% summarise(total=n()) %>% ggplot()+
geom_col(aes(x=as.factor(eng_season_num),y=total,fill=precip))+
theme_bw()
trainset_labeled<-trainset_labeled %>%
select( -contains("lag_24") ) %>%
select( -contains("lag_18") ) %>%
#select( -contains("lag_12") ) %>%
select(-target_year, -target_timestamp ,-eng_zonda)
#%>%
#filter(eng_season_num==1) %>%
#select(-eng_season_num)
#names(trainset_labeled)
as.data.frame(names(trainset_labeled_selected))
set.seed(12121)
trainset_labeled_selected$precip<-as.factor(trainset_labeled_selected$precip)
train_index <- createDataPartition(trainset_labeled_selected$precip , p=0.80, list=FALSE)
data_train <- trainset_labeled_selected[ train_index,]
data_test <- trainset_labeled_selected[-train_index,]
data_train <- data_train %>% select(-precip_reg)
#data_reg <- data_reg %>% slice(1:nrow(data_reg)) # check here for lag
# add precipitation numerical value to test
#data_test$precip_reg <- trainset_merged[-train_index,]$cmorph1
train_formula<-formula(precip ~ .)
nrow(data_train)
[1] 2171
registerDoMC(cores=8)
ctrl_fast <- trainControl(method="repeatedcv",
repeats=1,
number=5,
summaryFunction=twoClassSummary,
verboseIter=F,
#preProcOptions = list(pcaComp = 10),
classProbs=TRUE,
allowParallel = T)
nnGrid <- expand.grid(decay = c(0), size=c(10,20,50,100,200,300))
svmGrid <- expand.grid(sigma = c(0.001,0.00001), C=c(0.1,0.001,0.5))
kerasGrid <- expand.grid(
size=c(20,50,2000,5000),
lambda=c(0),
batch_size=c(1024),
decay=c(0),
activation=c('tanh','sigmoid','relu'),
lr=c(0.001), rho=c(0.9)
)
ctrl_fast$sampling<-"smote"
rfFit <- caret::train(train_formula,
data = data_train,
metric="ROC",
#preProcess=c("pca"),
#method = "mlpWeightDecay",
#method = "svmRadial",
#method ="mlpKerasDecay",
method = "rf",
#tuneGrid=kerasGrid,
tuneLength=11,
verbose=0,
#epochs=100,
trControl = ctrl_fast)
rfFit
Random Forest
2171 samples
112 predictors
2 classes: 'no', 'yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 1 times)
Summary of sample sizes: 1736, 1737, 1737, 1737, 1737
Addtional sampling using SMOTE
Resampling results across tuning parameters:
mtry ROC Sens Spec
2 0.7545842 0.8185177 0.4931619
13 0.7554712 0.8382199 0.4592636
24 0.7542969 0.8318298 0.4693746
35 0.7485711 0.8381943 0.4111631
46 0.7396687 0.8366071 0.4177674
57 0.7474437 0.8440525 0.4626534
68 0.7443988 0.8259773 0.4386908
79 0.7361201 0.8323631 0.4143776
90 0.7456732 0.8270355 0.4522501
101 0.7455349 0.8392865 0.4044418
112 0.7402596 0.8243660 0.4586791
ROC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 13.
ggplot(rfFit)+theme_bw()
THe list of the most relevant features used by the RandomForest Classifier
varImp(rfFit, scale = F)
rf variable importance
only 20 most important variables shown (out of 112)
Overall
eng_season_num 28.70
fnl_cin_3__1_t0_lag_12 27.89
fnl_cape_3__1_t0_lag_12 22.89
fnl_cape_3__3_t0_lag_12 21.44
target_month 18.02
fnl_cin_3__1_t0_lag_6 17.93
fnl_sealevelpress_3__3_t0_lag_6 16.26
fnl_cin_3__2_t0_lag_12 15.86
fnl_cape_3__1_t0_lag_6 14.88
fnl_sealevelpress_1__3_t0_lag_6 14.35
fnl_cin_3__3_t0_lag_12 14.27
fnl_cape_3__2_t0_lag_12 13.39
fnl_vwind_850_1__3_t0_lag_12 12.44
fnl_vwind_850_1__2_t0_lag_12 12.21
fnl_absvorticity_500_1__3_t0_lag_12 12.06
fnl_cape_3__2_t0_lag_6 11.97
fnl_vwind_850_1__1_t0_lag_12 11.77
fnl_vwind_850_3__3_t0_lag_12 11.70
fnl_vwind_850_2__3_t0_lag_12 11.43
fnl_uwind_850_1__1_t0_lag_6 11.26
data_test_results<-cbind(data_test,predicted_precip_probs=predict(rfFit,data_test,type='prob'))
data_test_results<-cbind(data_test_results,predicted_precip=ifelse(data_test_results$predicted_precip_probs.yes >=0.4,'yes','no'))
cm<-caret::confusionMatrix(as.factor(data_test_results$predicted_precip),data_test_results$precip,positive="yes")
cm
Confusion Matrix and Statistics
Reference
Prediction no yes
no 339 20
yes 130 52
Accuracy : 0.7227
95% CI : (0.6829, 0.7601)
No Information Rate : 0.8669
P-Value [Acc > NIR] : 1
Kappa : 0.2703
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.72222
Specificity : 0.72281
Pos Pred Value : 0.28571
Neg Pred Value : 0.94429
Prevalence : 0.13309
Detection Rate : 0.09612
Detection Prevalence : 0.33641
Balanced Accuracy : 0.72252
'Positive' Class : yes
#plot(roc(data_test$profile,predsrprofilerobsamp$Malicious))
ggplot(cbind(predict(rfFit,data_test,type='prob'),class=data_test$precip),
aes(m = yes, d = factor(class, labels=c("no","yes"),levels = c("no", "yes")))) +
geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') +
geom_abline(intercept = 0, slope = 1,colour='red')+
theme_bw()
FP_days<- as.data.frame(data_test_results) %>% filter(precip == 'no' & predicted_precip == 'yes')
TP_days<- as.data.frame(data_test_results) %>% filter(precip == 'yes' & predicted_precip == 'yes')
FN_days<- as.data.frame(data_test_results) %>% filter(precip == 'yes' & predicted_precip == 'no')
TN_days<- as.data.frame(data_test_results) %>% filter(precip == 'no' & predicted_precip == 'no')
rbind(data.frame(variable=rep("FN",length(FN_days$precip_reg)),value=FN_days$precip_reg),
data.frame(variable=rep("TP",length(TP_days$precip_reg)),value=TP_days$precip_reg)
#data.frame(variable=rep("TN",length(TN_days$precip_reg)),value=TN_days$precip_reg),
#data.frame(variable=rep("FP",length(FP_days$precip_reg)),value=FP_days$precip_reg)
) %>%
ggplot()+
geom_boxplot(aes(x=variable,y=value,fill=variable))+
ylab("precipitation volume")+xlab("results")+
theme_bw()