library(lubridate)
library(tidyverse)
library(pROC)
library(plotROC)
library(caret)
library(doMC)
registerDoMC(cores=8)
data %>% group_by(eng_season) %>% summarise(total=n())
Creating FNL lags for 6, 12, 18 and 24 hs before the target_day
lags<- data %>% select(-eng_season, -target_timestamp, -target_year, -target_day, -target_hour,-eng_cyclic_day, -eng_nino_year, -eng_nino_month, -eng_zonda,-target_month, -precip_3hr_sum) %>% imap_dfc(~set_names(map(1:4, lag, x = .x),
paste(.y, 'lag', c("6","12","18","24"), sep = '_')))
trainset<-lags %>% bind_cols(data %>% select(eng_season, target_timestamp, target_year, target_day, eng_cyclic_day, eng_nino_year, eng_nino_month, eng_zonda,target_month), .) %>% slice(7:nrow(data))
# considering only timestamp at 00 hour
trainset<-trainset[seq(1,nrow(trainset),4),]
# Removing the hours from timestamp
trainset<-trainset %>% mutate(target_timestamp=as.integer(target_timestamp/100))
head(trainset)
The cmorph daily file contains the 4 closestgeo-points to Mendoza Aerodrome station. Taken from From ./2.selected/cmorph_mza_daily/cmorph_paste_mza_dly
Merging Cmoprh daily with previous trainset
trainset_merged <- inner_join(trainset,cmorph_daily %>% mutate(target_timestamp=as.integer(target_timestamp/100)),by="target_timestamp")
The two closest point to MWO are cmorph1 and cmorph3
threshold=0.0
The point corresponding to CMORPH3 5_6 was chosen as the target precipitation volume
threshold=0.0
trainset_labeled <- trainset_merged %>%
mutate(precip_reg=cmorph3,precip=ifelse(precip_reg>threshold,'yes','no'),eng_season_num=ifelse(eng_season=="summer",1,0)) %>%
select(-cmorph1, -cmorph2, -cmorph3, -cmorph4, -eng_season)
trainset_labeled %>% group_by(eng_season_num,precip) %>% summarise(total=n()) %>% ggplot()+
geom_col(aes(x=as.factor(eng_season_num),y=total,fill=precip))+
theme_bw()
trainset_labeled<-trainset_labeled %>%
select( -contains("lag_24") ) %>%
#select( -contains("lag_18") ) %>%
#select( -contains("lag_12") ) %>%
select(-target_year, -target_timestamp ,-eng_zonda)
#%>%
#filter(eng_season_num==1) %>%
#select(-eng_season_num)
names(trainset_labeled)
names(trainset_labeled_selected)
[1] "target_day" "eng_nino_year" "eng_nino_month"
[4] "target_month" "fnl_absvorticity_500_1__1_t0_lag_6" "fnl_absvorticity_500_1__1_t0_lag_12"
[7] "fnl_absvorticity_500_1__1_t0_lag_18" "fnl_absvorticity_500_2__1_t0_lag_6" "fnl_absvorticity_500_2__1_t0_lag_12"
[10] "fnl_absvorticity_500_2__1_t0_lag_18" "fnl_absvorticity_500_3__1_t0_lag_6" "fnl_absvorticity_500_3__1_t0_lag_12"
[13] "fnl_absvorticity_500_3__1_t0_lag_18" "fnl_absvorticity_500_1__2_t0_lag_6" "fnl_absvorticity_500_1__2_t0_lag_12"
[16] "fnl_absvorticity_500_1__2_t0_lag_18" "fnl_absvorticity_500_2__2_t0_lag_6" "fnl_absvorticity_500_2__2_t0_lag_12"
[19] "fnl_absvorticity_500_2__2_t0_lag_18" "fnl_absvorticity_500_3__2_t0_lag_6" "fnl_absvorticity_500_3__2_t0_lag_12"
[22] "fnl_absvorticity_500_3__2_t0_lag_18" "fnl_absvorticity_500_1__3_t0_lag_6" "fnl_absvorticity_500_1__3_t0_lag_12"
[25] "fnl_absvorticity_500_1__3_t0_lag_18" "fnl_absvorticity_500_2__3_t0_lag_6" "fnl_absvorticity_500_2__3_t0_lag_12"
[28] "fnl_absvorticity_500_2__3_t0_lag_18" "fnl_absvorticity_500_3__3_t0_lag_6" "fnl_absvorticity_500_3__3_t0_lag_12"
[31] "fnl_absvorticity_500_3__3_t0_lag_18" "fnl_temperature_500_1__3_t0_lag_6" "fnl_uwind_850_1__1_t0_lag_6"
[34] "fnl_uwind_850_2__1_t0_lag_6" "fnl_uwind_850_2__1_t0_lag_12" "fnl_uwind_850_3__1_t0_lag_6"
[37] "fnl_uwind_850_3__1_t0_lag_12" "fnl_uwind_850_2__2_t0_lag_18" "fnl_uwind_850_3__2_t0_lag_6"
[40] "fnl_uwind_850_3__2_t0_lag_12" "fnl_uwind_850_3__2_t0_lag_18" "fnl_uwind_850_1__3_t0_lag_6"
[43] "fnl_uwind_850_1__3_t0_lag_12" "fnl_uwind_850_1__3_t0_lag_18" "fnl_uwind_850_2__3_t0_lag_6"
[46] "fnl_uwind_850_2__3_t0_lag_12" "fnl_uwind_850_2__3_t0_lag_18" "fnl_uwind_850_3__3_t0_lag_6"
[49] "fnl_uwind_850_3__3_t0_lag_12" "fnl_uwind_850_3__3_t0_lag_18" "fnl_vwind_850_1__1_t0_lag_6"
[52] "fnl_vwind_850_1__1_t0_lag_12" "fnl_vwind_850_1__1_t0_lag_18" "fnl_vwind_850_2__1_t0_lag_6"
[55] "fnl_vwind_850_2__1_t0_lag_12" "fnl_vwind_850_2__1_t0_lag_18" "fnl_vwind_850_3__1_t0_lag_6"
[58] "fnl_vwind_850_3__1_t0_lag_12" "fnl_vwind_850_1__2_t0_lag_6" "fnl_vwind_850_1__2_t0_lag_12"
[61] "fnl_vwind_850_2__2_t0_lag_6" "fnl_vwind_850_2__2_t0_lag_12" "fnl_vwind_850_2__2_t0_lag_18"
[64] "fnl_vwind_850_3__2_t0_lag_6" "fnl_vwind_850_3__2_t0_lag_12" "fnl_vwind_850_3__2_t0_lag_18"
[67] "fnl_vwind_850_1__3_t0_lag_6" "fnl_vwind_850_1__3_t0_lag_12" "fnl_vwind_850_2__3_t0_lag_6"
[70] "fnl_vwind_850_2__3_t0_lag_12" "fnl_vwind_850_2__3_t0_lag_18" "fnl_vwind_850_3__3_t0_lag_6"
[73] "fnl_vwind_850_3__3_t0_lag_12" "fnl_vwind_850_3__3_t0_lag_18" "fnl_cape_1__1_t0_lag_6"
[76] "fnl_cape_1__1_t0_lag_12" "fnl_cape_1__1_t0_lag_18" "fnl_cape_2__1_t0_lag_6"
[79] "fnl_cape_2__1_t0_lag_12" "fnl_cape_2__1_t0_lag_18" "fnl_cape_3__1_t0_lag_6"
[82] "fnl_cape_3__1_t0_lag_12" "fnl_cape_3__1_t0_lag_18" "fnl_cape_1__2_t0_lag_6"
[85] "fnl_cape_1__2_t0_lag_12" "fnl_cape_1__2_t0_lag_18" "fnl_cape_2__2_t0_lag_6"
[88] "fnl_cape_2__2_t0_lag_12" "fnl_cape_2__2_t0_lag_18" "fnl_cape_3__2_t0_lag_6"
[91] "fnl_cape_3__2_t0_lag_12" "fnl_cape_3__2_t0_lag_18" "fnl_cape_1__3_t0_lag_6"
[94] "fnl_cape_1__3_t0_lag_12" "fnl_cape_1__3_t0_lag_18" "fnl_cape_2__3_t0_lag_6"
[97] "fnl_cape_2__3_t0_lag_12" "fnl_cape_2__3_t0_lag_18" "fnl_cape_3__3_t0_lag_6"
[100] "fnl_cape_3__3_t0_lag_12" "fnl_cape_3__3_t0_lag_18" "fnl_sealevelpress_1__1_t0_lag_18"
[103] "fnl_sealevelpress_3__1_t0_lag_18" "fnl_sealevelpress_1__3_t0_lag_6" "fnl_sealevelpress_3__3_t0_lag_6"
[106] "fnl_relhum_850_1__1_t0_lag_18" "fnl_relhum_850_2__1_t0_lag_6" "fnl_cin_1__1_t0_lag_6"
[109] "fnl_cin_1__1_t0_lag_12" "fnl_cin_1__1_t0_lag_18" "fnl_cin_2__1_t0_lag_6"
[112] "fnl_cin_2__1_t0_lag_12" "fnl_cin_2__1_t0_lag_18" "fnl_cin_3__1_t0_lag_6"
[115] "fnl_cin_3__1_t0_lag_12" "fnl_cin_3__1_t0_lag_18" "fnl_cin_1__2_t0_lag_6"
[118] "fnl_cin_1__2_t0_lag_12" "fnl_cin_1__2_t0_lag_18" "fnl_cin_2__2_t0_lag_6"
[121] "fnl_cin_2__2_t0_lag_12" "fnl_cin_2__2_t0_lag_18" "fnl_cin_3__2_t0_lag_6"
[124] "fnl_cin_3__2_t0_lag_12" "fnl_cin_3__2_t0_lag_18" "fnl_cin_1__3_t0_lag_6"
[127] "fnl_cin_1__3_t0_lag_12" "fnl_cin_1__3_t0_lag_18" "fnl_cin_2__3_t0_lag_6"
[130] "fnl_cin_2__3_t0_lag_12" "fnl_cin_2__3_t0_lag_18" "fnl_cin_3__3_t0_lag_6"
[133] "fnl_cin_3__3_t0_lag_12" "fnl_cin_3__3_t0_lag_18" "fnl_divergence_200_1__1_t0_lag_6"
[136] "fnl_divergence_200_1__1_t0_lag_12" "fnl_divergence_200_1__1_t0_lag_18" "fnl_divergence_200_2__1_t0_lag_6"
[139] "fnl_divergence_200_2__1_t0_lag_12" "fnl_divergence_200_2__1_t0_lag_18" "fnl_divergence_200_3__1_t0_lag_6"
[142] "fnl_divergence_200_3__1_t0_lag_12" "fnl_divergence_200_3__1_t0_lag_18" "fnl_divergence_200_1__2_t0_lag_6"
[145] "fnl_divergence_200_1__2_t0_lag_12" "fnl_divergence_200_1__2_t0_lag_18" "fnl_divergence_200_2__2_t0_lag_6"
[148] "fnl_divergence_200_2__2_t0_lag_12" "fnl_divergence_200_2__2_t0_lag_18" "fnl_divergence_200_3__2_t0_lag_6"
[151] "fnl_divergence_200_3__2_t0_lag_12" "fnl_divergence_200_3__2_t0_lag_18" "fnl_divergence_200_1__3_t0_lag_6"
[154] "fnl_divergence_200_1__3_t0_lag_12" "fnl_divergence_200_1__3_t0_lag_18" "fnl_divergence_200_2__3_t0_lag_6"
[157] "fnl_divergence_200_2__3_t0_lag_12" "fnl_divergence_200_2__3_t0_lag_18" "fnl_divergence_200_3__3_t0_lag_6"
[160] "fnl_divergence_200_3__3_t0_lag_12" "fnl_divergence_200_3__3_t0_lag_18" "precip_reg"
[163] "precip" "eng_season_num"
set.seed(12121)
trainset_labeled_selected$precip<-as.factor(trainset_labeled_selected$precip)
train_index <- createDataPartition(trainset_labeled_selected$precip , p=0.80, list=FALSE)
data_train <- trainset_labeled_selected[ train_index,]
data_test <- trainset_labeled_selected[-train_index,]
data_train <- data_train %>% select(-precip_reg)
#data_reg <- data_reg %>% slice(1:nrow(data_reg)) # check here for lag
# add precipitation numerical value to test
#data_test$precip_reg <- trainset_merged[-train_index,]$cmorph1
train_formula<-formula(precip ~ .)
names(data_train)
nrow(data_train)
ctrl_fast <- trainControl(method="repeatedcv",
repeats=1,
number=5,
summaryFunction=twoClassSummary,
verboseIter=F,
#preProcOptions = list(pcaComp = 10),
classProbs=TRUE,
allowParallel = T)
nnGrid <- expand.grid(decay = c(0), size=c(10,20,50,100,200,300))
svmGrid <- expand.grid(sigma = c(0.001,0.00001), C=c(0.1,0.001,0.5))
kerasGrid <- expand.grid(
size=c(20,50,2000,5000),
lambda=c(0),
batch_size=c(1024),
decay=c(0),
activation=c('tanh','sigmoid','relu'),
lr=c(0.001), rho=c(0.9)
)
ctrl_fast$sampling<-"smote"
rfFit <- caret::train(train_formula,
data = data_train,
#metric="ROC",
#preProcess=c("pca"),
#method = "mlpWeightDecay",
#method = "svmRadial",
#method ="mlpKerasDecay",
method = "rf",
#tuneGrid=kerasGrid,
tuneLength=11,
verbose=0,
#epochs=100,
trControl = ctrl_fast)
rfFit
Random Forest
2171 samples
162 predictors
2 classes: 'no', 'yes'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 1 times)
Summary of sample sizes: 1737, 1736, 1737, 1737, 1737
Addtional sampling using SMOTE
Resampling results across tuning parameters:
mtry ROC Sens Spec
2 0.7373925 0.8069821 0.4784894
18 0.7341498 0.8230763 0.4131433
34 0.7272402 0.8225162 0.4238430
50 0.7319841 0.8302847 0.3967049
66 0.7351522 0.8352816 0.4156238
82 0.7313635 0.8203001 0.4345428
98 0.7215759 0.8114204 0.4129582
114 0.7249941 0.8308464 0.4264347
130 0.7242694 0.8253001 0.4102184
146 0.7191343 0.8158757 0.4263976
162 0.7250953 0.8247276 0.4184006
ROC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
ggplot(rfFit)+theme_bw()
THe list of the most relevant features used by the RandomForest Classifier
varImp(rfFit, scale = F)
rf variable importance
only 20 most important variables shown (out of 162)
Overall
fnl_sealevelpress_1__3_t0_lag_6 12.885
fnl_sealevelpress_3__1_t0_lag_18 11.787
fnl_sealevelpress_1__1_t0_lag_18 11.400
fnl_sealevelpress_3__3_t0_lag_6 11.377
fnl_temperature_500_1__3_t0_lag_6 11.305
fnl_vwind_850_1__2_t0_lag_12 11.081
fnl_cape_3__3_t0_lag_12 10.328
fnl_vwind_850_1__3_t0_lag_6 10.325
fnl_cape_3__2_t0_lag_12 10.241
fnl_cin_3__1_t0_lag_18 10.179
fnl_cape_3__3_t0_lag_18 10.136
fnl_vwind_850_1__3_t0_lag_12 10.133
fnl_cape_3__2_t0_lag_18 10.101
fnl_uwind_850_3__1_t0_lag_12 9.882
fnl_vwind_850_1__2_t0_lag_6 9.813
fnl_vwind_850_1__1_t0_lag_18 9.708
fnl_uwind_850_3__3_t0_lag_12 9.672
fnl_cin_3__2_t0_lag_12 9.621
fnl_cin_2__3_t0_lag_12 9.557
fnl_uwind_850_2__1_t0_lag_12 9.506
cm
Confusion Matrix and Statistics
Reference
Prediction no yes
no 290 22
yes 160 69
Accuracy : 0.6636
95% CI : (0.622, 0.7033)
No Information Rate : 0.8318
P-Value [Acc > NIR] : 1
Kappa : 0.2509
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.7582
Specificity : 0.6444
Pos Pred Value : 0.3013
Neg Pred Value : 0.9295
Prevalence : 0.1682
Detection Rate : 0.1275
Detection Prevalence : 0.4233
Balanced Accuracy : 0.7013
'Positive' Class : yes
#plot(roc(data_test$profile,predsrprofilerobsamp$Malicious))
ggplot(cbind(predsrprofilerobsamp,class=data_test$precip),
aes(m = yes, d = factor(class, labels=c("no","yes"),levels = c("no", "yes")))) +
geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') +
geom_abline(intercept = 0, slope = 1,colour='red')+
theme_bw()