International Seminar on Statistics with R
Universidade Federal da Bahia - UFBA
Escola Politécnica
Programa de Pós-Graduação em Engenharia Industrial - PEI/UFBA
Table A - Table containing rainfall information [mm] for the first one hundred observed days.
| V(i-1) | V(i-2) | Conditional V(i-1)+V(i-2) | Conditional V(i-1)+V(i-2)+V(i-3) |
|---|---|---|---|
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 7.0 | 0.0 | 0.0 | 0.0 |
| 12.0 | 7.0 | 19.0 | 0.0 |
| 1.0 | 12.0 | 13.0 | 20.0 |
| 0.0 | 1.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 32.0 | 0.0 | 0.0 | 0.0 |
| 4.0 | 32.0 | 36.0 | 0.0 |
| 6.0 | 4.0 | 10.0 | 42.0 |
| 0.0 | 6.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 16.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 16.0 | 0.0 | 0.0 |
| 6.0 | 0.0 | 0.0 | 0.0 |
| 2.0 | 6.0 | 8.0 | 0.0 |
| 17.0 | 2.0 | 19.0 | 25.0 |
| 0.1 | 17.0 | 17.1 | 19.1 |
| 0.9 | 0.1 | 1.0 | 18.0 |
| 1.0 | 0.9 | 1.9 | 2.0 |
| 11.0 | 1.0 | 12.0 | 12.9 |
| 2.0 | 11.0 | 13.0 | 14.0 |
| 40.0 | 2.0 | 42.0 | 53.0 |
| 4.0 | 40.0 | 44.0 | 46.0 |
| 8.0 | 4.0 | 12.0 | 52.0 |
| 2.0 | 8.0 | 10.0 | 14.0 |
| 0.0 | 2.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.1 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.1 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.6 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.6 | 0.0 | 0.0 |
| 1.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 1.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.1 | 0.0 | 0.0 | 0.0 |
| 0.5 | 0.1 | 0.6 | 0.0 |
| 0.7 | 0.5 | 1.2 | 1.3 |
| 0.0 | 0.7 | 0.0 | 0.0 |
| 0.4 | 0.0 | 0.0 | 0.0 |
| 6.0 | 0.4 | 6.4 | 0.0 |
| 1.0 | 6.0 | 7.0 | 7.4 |
| 0.0 | 1.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.8 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.8 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 34.0 | 0.0 | 0.0 | 0.0 |
| 0.2 | 34.0 | 34.2 | 0.0 |
| 0.0 | 0.2 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 13.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 13.0 | 0.0 | 0.0 |
| 1.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 1.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 5.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 5.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 7.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 7.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 79.0 | 0.0 | 0.0 | 0.0 |
| 0.3 | 79.0 | 79.3 | 0.0 |
| 15.0 | 0.3 | 15.3 | 94.3 |
| 0.0 | 15.0 | 0.0 | 0.0 |
| 24.9 | 0.0 | 0.0 | 0.0 |
| 10.0 | 24.9 | 34.9 | 0.0 |
| 8.0 | 10.0 | 18.0 | 42.9 |
| 0.4 | 8.0 | 8.4 | 18.4 |
| 10.0 | 0.4 | 10.4 | 18.4 |
| 12.0 | 10.0 | 22.0 | 22.4 |
| 0.0 | 12.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.2 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.2 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 0.2 | 0.0 | 0.0 | 0.0 |
| 0.0 | 0.2 | 0.0 | 0.0 |
| 0.0 | 0.0 | 0.0 | 0.0 |
| 13.0 | 0.0 | 0.0 | 0.0 |
| 8.0 | 13.0 | 21.0 | 0.0 |
| 22.0 | 8.0 | 30.0 | 43.0 |
The script presented here accompanies the full paper submitted to SER IV, entitled “Modelagem Preditiva de Extravasamento em Bacia de Contenção de Efluentes” (Predictive Modeling of Overflow in an Effluent Containment Basin).
The KNN (k-nearest neighbors) and Random Forest (rf) algorithms were used to predict overflow (failure) events in an industrial effluent containment basin. The models proposed here are classification models, returning a predicted status of normal operation or failure for basin CET 09.
The libraries (packages) described in the chunk below were used in the development of this work.
# CLASSIFICATION MODEL
if (!require("pacman")) install.packages("pacman")
pacman::p_load(dplyr, tidyr, ggplot2, caret, cowplot, randomForest,
reshape2, pROC, readxl, matrixStats, plotly, data.table,
psych, wesanderson, ggridges, lubridate, pals, GGally,
scales, RColorBrewer, grid, ROSE, forcats, tibble, lemon)
setwd("C:/Users/BRENNER BIASI/Desktop/Mestrado/SER UFF/Classificacao")
Three data sets were used in this work: one with rainfall information, one with the containment basin level history, and another with the history of pump activations for discharging the basin.
dados_chuva <- read.csv("C:/Users/BRENNER BIASI/Desktop/Mestrado/Novo trab/knn/dados_clusters.txt", header = T,
sep = ";")
Hist_Nivel_jan.set <- read.csv("C:/Users/BRENNER BIASI/Desktop/Mestrado/Time series/R/DADOS/MH/Hist_Nivel_jan-set.txt", header = T,
sep = ";",
stringsAsFactors = FALSE)
Hist_pump_jan.set <- read.csv("C:/Users/BRENNER BIASI/Desktop/Mestrado/Time series/R/DADOS/MH/Hist_pump_jan-set.txt", header = T,
sep = ";",
stringsAsFactors = FALSE)
To streamline the aesthetics of the plots, a base theme was developed to be used with ggplot2.
tema <- theme_bw() +
theme(axis.title = element_text(color = "black", size = 18, face = "bold"),
axis.text.x = element_text(color = "black", size = 16, face = "bold"),
axis.text.y = element_text(color = "black", size = 16, face = "bold"),
legend.text = element_text(color = "black", size = 12, face = "bold"),
legend.title = element_text(color = "black", size = 14, face = "bold"),
plot.title = element_text(color = "black", size = 18, face = "bold"))
temag <- theme_bw() +
theme(axis.title = element_text(color = "black", size = 20, face = "bold"),
axis.text.x = element_text(color = "black", size = 18, face = "bold"),
axis.text.y = element_text(color = "black", size = 18, face = "bold"),
legend.text = element_text(color = "black", size = 14, face = "bold"),
legend.title = element_text(color = "black", size = 16, face = "bold"),
plot.title = element_text(color = "black", size = 16, face = "bold"))
# Breaks for the x axis
datebreaks_m <- seq(as.Date("2011-01-01"),
as.Date("2011-09-14"),
by = "1 month")
Initially, the data were manipulated to restructure the data sets, simplifying further manipulation, the basic descriptive analysis of CET 09, and the construction of the predictive models.
Datax <- Hist_Nivel_jan.set %>%
dplyr::select(Periodo, CET_09) # Level history, 0 - 100%
CET_09 <- psych::describe(Datax$CET_09)
CET_09 # Descriptive statistics of the effluent level in CET 09
## vars n mean sd median trimmed mad min max range skew
## X1 1 370080 43.28 33.76 29.39 40.98 30.39 3.71 100 96.29 0.51
## kurtosis se
## X1 -1.34 0.06
# Basin level over time
ggplot2::ggplot(Datax,
aes(x = as.Date(Periodo),
y = CET_09)) +
geom_line(aes(group = 1),
size = 1) +
geom_point(aes(group = 1),
size = 0.5) +
scale_x_date(breaks = datebreaks_m,
labels = scales::date_format("%Y-%m-%d")) +
xlab("Período em Análise") + ylab("Nível %") +
tema + theme(axis.text.x = element_text(angle = 90))
Dataz <- Hist_pump_jan.set %>%
dplyr::select(Periodo, CET_09) %>% # Pump activation history: 0 = OFF, 1/2 = pumps ON
dplyr::rename("PUMP_09" = CET_09)
PUMP_09 <- psych::describe(Dataz$PUMP_09)
PUMP_09 # Descriptive statistics of pump activations
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 370080 0.58 0.5 1 0.6 0 0 2 2 -0.26 -1.73 0
###
Structure of the data.
Datax <- Datax %>%
dplyr::left_join(Dataz, by = "Periodo")
dplyr::glimpse(Datax)
## Observations: 370,080
## Variables: 3
## $ Periodo <chr> "2011-01-01 00:00:00", "2011-01-01 00:01:00", "2011-01...
## $ CET_09 <dbl> 10.90, 10.90, 10.90, 10.91, 10.91, 10.91, 10.91, 10.91...
## $ PUMP_09 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Some code in the next chunks appears commented out (#), but this device was used only to reduce the number of chunks throughout the document. The code is fully runnable once the comments are removed.
# Violin plot of the level and pump activations
Datax %>%
reshape2::melt(id.vars = "Periodo") %>%
ggplot2::ggplot() +
geom_violin(aes(x = variable, y = value,
fill = variable),
alpha = 0.85, show.legend = F) +
facet_wrap(~variable, scales = "free") +
scale_fill_manual(values = pals::tol(n = 2)) +
xlab("Variável") + ylab("Valor") +
tema
# Datax %>%
# reshape2::melt(id.vars = "Periodo") %>%
# ggplot2::ggplot() +
# stat_boxplot(aes(x = variable, y = value),
# size = 1,
# geom ='errorbar') +
# geom_boxplot(aes(x = variable, y = value,
# fill = variable),
# show.legend = F) +
# facet_wrap(~variable, scales = "free") +
# scale_fill_manual(values = pals::tol(n = 2)) +
# xlab("Variável") + ylab("Valor") +
# tema
K <- round(1 + 3.322 * log10(nrow(Datax))) # Number of bins (Sturges' rule)
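For the 370,080 one-minute records in Datax, Sturges' rule gives \(K = 1 + 3.322\log_{10}(370080) \approx 19\), which matches the 19 bins used in the histograms below.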
# Distribution of the level over the whole series
# Low levels = probably dry weather / standby
pp1 <- Datax %>%
ggplot2::ggplot() +
geom_histogram(aes(x = CET_09, y = ..density..),
bins = 19,
colour = "black", fill = "gray") +
geom_density(aes(x = CET_09),
alpha = .2, fill = "#FF6666") +
xlab("Bacia CET 09") + ylab("Densidade") +
ggtitle("Distribuição de Dados de Nível %") +
tema
pp1
# Number of pump activations over the whole series
# The pump was on for most of the time
Datax %>%
ggplot2::ggplot() +
geom_bar(aes(x = PUMP_09, fill = PUMP_09),
stat = "count", colour = "black") +
scale_fill_manual(values = pals::tol(n = 3)) +
tema
PUMP_09 <- as.data.frame(table(Datax$PUMP_09))
cxp1 <- stats::chisq.test(PUMP_09$Freq)
cxp1
##
## Chi-squared test for given probabilities
##
## data: PUMP_09$Freq
## X-squared = 194710, df = 2, p-value < 2.2e-16
# The test statistic is Q = 194710, where Q = sum((O_i - E_i)^2 / E_i)
# and E_i = n/k is the expected count under the equal-frequency null.
# With two degrees of freedom (k - 1 = 2) we obtain p-value < 2.2e-16,
# so we reject the null hypothesis that the pump states occur with
# equal frequencies.
# Removing the lowest-frequency state
PUMP_09 <- PUMP_09 %>%
dplyr::slice(1:2)
cxp1 <- stats::chisq.test(PUMP_09$Freq)
cxp1
##
## Chi-squared test for given probabilities
##
## data: PUMP_09$Freq
## X-squared = 8860.6, df = 1, p-value < 2.2e-16
# Since p < 2.2e-16, we again reject the null hypothesis that the two
# remaining states occur with equal frequencies.
# Attributes per month
Datax_m <- Datax %>%
dplyr::mutate(Periodo = as.Date(Periodo, format = "%Y-%m-%d"),
Mes = lubridate::month(Periodo),
Mes = as.factor(Mes))
levels(Datax_m$Mes) <- c("JAN", "FEV", "MAR", "ABR", "MAI",
"JUN", "JUL", "AGO", "SET")
# Distribution of the level across the months
# January, April, May and August show the highest probability of high levels.
ggplot2::ggplot(Datax_m,
aes(x = CET_09, y = Mes, fill = ..x..)) +
geom_density_ridges_gradient(scale = 3,
rel_min_height = 0.01,
gradient_lwd = 1.) +
scale_x_continuous(expand = c(0.01, 0)) +
scale_y_discrete(expand = c(0.01, 0)) +
scale_fill_gradientn(colours = c("#00A08A", "blue"),
name = "Nível %") +
xlab("Nível CET 09") + ylab("Mês") +
ggtitle("Distribuição do Nível por Mês") +
theme_ridges() +
tema
# Pump activations
Datax_m <- Datax_m %>%
dplyr::mutate(PUMP_09 = as.factor(PUMP_09))
levels(Datax_m$PUMP_09) <- c("OFF", "1 ON", "2 ON")
p_save_pump <- Datax_m %>%
ggplot2::ggplot() +
geom_bar(aes(x = PUMP_09),
stat = "count",
colour = "black") +
facet_wrap(~Mes) +
xlab("CMB Bacia CET 09") + ylab("Status [min/mês]") +
tema
p_save_pump
# Basin information
# Volume <- 6800 # Usable capacity in m3
Qin <- 142 # Mean inflow rate, m3/h
Qdes <- 285 # Discharge rate per pump, m3/h
limiar_nivel <- 95 # Maximum level considered in the mass balance
m3hm <- 60 # Conversion from m3/h to m3/min
Data.nivel <- Hist_Nivel_jan.set %>%
dplyr::select(Periodo, CET_09) %>%
dplyr::mutate(CET_09 = ifelse(CET_09 > limiar_nivel, (Qin/m3hm), 0)) # Overflow condition
Data_disc <- Hist_pump_jan.set %>%
dplyr::select(Periodo, CET_09) %>%
dplyr::mutate(Qc9 = (CET_09*Qdes/m3hm)) %>% # Discharge flow
dplyr::select(-CET_09)
Bacia_c09 <- Data.nivel %>%
dplyr::inner_join(Data_disc, by = "Periodo") %>%
dplyr::mutate(overc9 = ifelse(CET_09 > Qc9, CET_09, 0), # Overflowed volume [m3]
pumpc9 = ifelse(Qc9 > 0, 1, Qc9), # Pump ON/OFF [min]
tempoverc9 = ifelse(CET_09 > Qc9, 1, 0)) %>% # Time in overflow [min]
dplyr::select(-CET_09, -Qc9)
# Hourly analysis, 00:00 - 23:59
hora_over <- Bacia_c09 %>%
dplyr::mutate(Hora = base::substr(Periodo, 12, 13),
Hora = as.factor(Hora)) %>%
dplyr::select(overc9, Hora) %>%
dplyr::mutate(overc9 = ifelse(overc9 > 0, 1, NA)) %>%
reshape2::melt(id.vars = "Hora") %>%
base::table() %>%
as.data.frame() %>%
dplyr::select(-value, -variable) %>%
dplyr::mutate(F_Acumu = (base::cumsum(Freq) / sum(Freq) * 100))
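The hour is extracted with base::substr on the fixed character positions 12-13 of the timestamp string. A minimal equivalent sketch with lubridate (already loaded above), shown only as an alternative, would be:
# Hypothetical alternative: parse the timestamp instead of slicing characters
lubridate::hour(as.POSIXct("2011-01-01 13:45:00")) # returns 13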
# ggplot2::ggplot(hora_over) +
# geom_bar(aes(x = Hora,
# y = Freq,
# fill = Hora),
# show.legend = F,
# stat = "identity",
# col = "black") +
# ylab("Frequência") + ggtitle("Frequência de Extravasamento por Hora - CET 09") +
# scale_fill_manual(values = pals::stepped(n = 24)) +
# tema + theme(axis.text.x = element_text(color = "black",
# size = 9,
# face = "bold",
# angle = 90))
ploth2 <- ggplot2::ggplot(hora_over) +
geom_bar(aes(x = Hora,
y = Freq,
fill = Hora),
show.legend = F,
stat = "identity",
col = "black") +
ylab("Frequência") + ggtitle("Freq. de Extravasamento por Hora") +
scale_fill_grey() +
tema + theme(axis.text.x = element_text(color = "black",
size = 9,
face = "bold",
angle = 90))
hora_pump <- Bacia_c09 %>%
dplyr::mutate(Hora = base::substr(Periodo, 12, 13),
Hora = as.factor(Hora)) %>%
dplyr::select(pumpc9, Hora) %>%
dplyr::mutate(pumpc9 = ifelse(pumpc9 > 0, 1, NA)) %>%
reshape2::melt(id.vars = "Hora") %>%
table() %>%
as.data.frame() %>%
dplyr::select(-value) %>%
dplyr::rename("Bacia" = variable) %>%
dplyr::mutate(F_Acumu = (base::cumsum(Freq) / sum(Freq) * 100))
# ggplot2::ggplot(hora_pump) +
# geom_bar(aes(x = Hora,
# y = Freq,
# fill = Hora),
# show.legend = F,
# stat = "identity",
# col = "black") +
# ylab("Frequência") + ggtitle("Frequência de Acionamentos de Bomba por Hora - CET 09") +
# scale_fill_manual(values = pals::stepped(n = 24)) +
# tema + theme(axis.text.x = element_text(color = "black",
# size = 9,
# face = "bold",
# angle = 90))
# Plot of pump-activation frequency by hour of the day
ploth3 <- ggplot2::ggplot(hora_pump) +
geom_bar(aes(x = Hora,
y = Freq,
fill = Hora),
show.legend = F,
stat = "identity",
col = "black") +
ylab("Frequência") + ggtitle("Freq. de Acionamentos de Bomba por Hora") +
scale_fill_grey() +
tema + theme(axis.text.x = element_text(color = "black",
size = 9,
face = "bold",
angle = 90))
plot_fre_ovfl <- cowplot::plot_grid(ploth2, ploth3, align = "hv", labels = c('(A)', '(B)'))
plot_fre_ovfl
###
Bacia_c09 <- Bacia_c09 %>%
dplyr::mutate(Periodo = as.Date(Periodo)) %>%
data.table::data.table()
Bacia2 <- Bacia_c09[,list(Ovfc9 = sum(overc9), # Overflowed volume [m3]
pumpc9 = sum(pumpc9), # Pump ON time [min]
tempoverc9 = sum(tempoverc9)), # Time in overflow [min]
by = c("Periodo")]
Bacia2 <- Bacia2 %>%
dplyr::mutate(Dia = seq(1, nrow(Bacia2), 1),
StatusC9 = ifelse(tempoverc9 > 0, "Extravasando", "Normal")) # Class labels
Bacia2 <- dados_chuva %>%
dplyr::select(Dia, Vol01Dia, Vol02Dias,
Vol03Dias, Anterior) %>%
dplyr::inner_join(Bacia2, by = "Dia")
# Daily standard deviation and daily maximum levels in the basin
DP_nivel <- Datax %>%
dplyr::rename("DP_CET09" = CET_09) %>%
dplyr::mutate(Periodo = as.Date(Periodo)) %>%
data.table::data.table()
DP_nivel <- DP_nivel[,list(DP_CET09 = round(sqrt(var(DP_CET09)), 3)),
by = c("Periodo")]
##
maximos <- Datax %>%
dplyr::rename("MAX_CET09" = CET_09) %>%
dplyr::mutate(Periodo = as.Date(Periodo)) %>%
data.table::data.table()
# MAX_CET09 -> daily maximum level in basin CET 09
maximos <- maximos[,list(MAX_CET09 = max(MAX_CET09)),
by = c("Periodo")]
#
maximosH11 <- Datax %>%
dplyr::rename("MAX11H_CET09" = CET_09) %>%
dplyr::mutate(Hora = substr(Periodo, 12, 13),
Periodo = as.Date(Periodo)) %>%
dplyr::filter(Hora == 23) %>%
data.table::data.table()
# maximosH11 -> maximum level during the daily 23:00-23:59 window in basin CET 09
maximosH11 <- maximosH11[,list(MAX11H_CET09 = max(MAX11H_CET09)),
by = c("Periodo")]
# Daily median level
MEDIANA_C9 <- Datax %>%
dplyr::rename("MEDIANA_CET09" = CET_09) %>%
dplyr::mutate(Periodo = as.Date(Periodo)) %>%
data.table::data.table()
MEDIANA_C9 <- MEDIANA_C9[,list(MEDIANA_CET09 = median(MEDIANA_CET09)),
by = c("Periodo")]
# Daily range of the level
RANGE_C9 <- Datax %>%
dplyr::rename("RANGE_C9" = CET_09) %>%
dplyr::mutate(Periodo = as.Date(Periodo)) %>%
data.table::data.table()
RANGE_C9 <- RANGE_C9[,list(minn = min(RANGE_C9),
maxx = max(RANGE_C9)),
by = c("Periodo")]
RANGE_C9 <- RANGE_C9 %>%
dplyr::mutate(RANGE_C9 = maxx - minn) %>%
dplyr::select(Periodo, RANGE_C9)
df <- maximosH11 %>%
dplyr::inner_join(MEDIANA_C9, by = "Periodo") %>%
dplyr::inner_join(maximos, by = "Periodo") %>%
dplyr::inner_join(RANGE_C9, by = "Periodo") %>%
dplyr::inner_join(DP_nivel, by = "Periodo") %>%
dplyr::inner_join(Bacia2, by = "Periodo") %>%
dplyr::select(-Dia)
head(df, 8)
## Periodo MAX11H_CET09 MEDIANA_CET09 MAX_CET09 RANGE_C9 DP_CET09
## 1 2011-01-01 10.23 10.270 11.20 1.06 0.377
## 2 2011-01-02 14.87 9.450 14.87 7.11 1.641
## 3 2011-01-03 29.73 17.190 29.73 14.84 4.470
## 4 2011-01-04 59.16 33.580 59.16 29.43 10.200
## 5 2011-01-05 66.75 65.510 69.95 10.78 2.843
## 6 2011-01-06 62.46 64.390 65.76 3.87 0.778
## 7 2011-01-07 58.53 57.720 61.88 8.32 1.991
## 8 2011-01-08 67.80 56.885 73.45 23.36 7.664
## Vol01Dia Vol02Dias Vol03Dias Anterior Ovfc9 pumpc9 tempoverc9 StatusC9
## 1 0 0 0 0 0 72 0 Normal
## 2 0 0 0 0 0 642 0 Normal
## 3 0 0 0 0 0 56 0 Normal
## 4 0 0 0 0 0 0 0 Normal
## 5 0 0 0 0 0 278 0 Normal
## 6 0 0 0 0 0 412 0 Normal
## 7 7 0 0 0 0 552 0 Normal
## 8 12 19 0 7 0 592 0 Normal
Data manipulation and descriptive analysis of the observed daily rainfall.
chuva <- dados_chuva %>%
dplyr::select(Vol01Dia, Vol02Dias, Vol03Dias, Anterior) %>%
cbind(Periodo = df$Periodo) %>%
dplyr::mutate(Periodo = as.Date(Periodo, format = "%Y-%m-%d"),
Mes = lubridate::month(Periodo),
Mes = as.factor(Mes),
Vol01Dia = as.numeric(Vol01Dia))
chuvaE <- psych::describe(chuva$Vol01Dia)
# kurtosis = 21.45
chuva %>%
dplyr::select(-Periodo, -Mes) %>%
ggplot2::ggplot() +
geom_histogram(aes(x = Vol01Dia, y = ..density..),
bins = 19,
colour = "black", fill = "grey") +
geom_density(aes(x = Vol01Dia),
alpha = .2, fill = "#FF6666") +
ggtitle("Distribuição de Dados de Chuva Observada") +
xlab("Volume Diário de Chuva Acumulada [mm]") +
ylab("Densidade") +
tema
levels(chuva$Mes) <- c("JAN", "FEV", "MAR", "ABR", "MAI",
"JUN", "JUL", "AGO", "SET")
chuva_mensal <- chuva %>%
dplyr::select(Vol01Dia, Mes) %>%
data.table::data.table()
# Monthly accumulated rainfall
chuva_mensal <- chuva_mensal[,list(Vol01Dia = sum(Vol01Dia)), # Monthly accumulated rainfall
by = c("Mes")]
chuva_mensal <- ggplot2::ggplot(chuva_mensal) +
geom_bar(aes(x = Mes,
y = Vol01Dia,
fill = Mes),
stat = "identity") +
scale_fill_grey(name = "Legenda") +
xlab("Mês") + ylab("Vol. mês [mm]") +
tema + theme(axis.text.x = element_text(angle = 90))
chuva_mensal
ggplot2::ggplot(chuva,
aes(x = Vol01Dia, y = Mes)) +
geom_density_ridges_gradient(aes(fill = ..x..),
scale = 3,
size = 0.3) +
scale_fill_gradientn(colours = c("#00A08A", "blue"),
name = "Chuva [mm]") +
xlab("Índice Pluviométrico Diário [mm]") + ylab("Mês") +
theme_ridges() +
tema
chuvax <- chuva
chuvax <- psych::describe(chuvax$Vol01Dia)
chuvax
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 257 5.71 12.45 0.8 2.77 1.19 0 98 98 4.13 21.45
## se
## X1 0.78
chuvax <- chuva %>%
dplyr::select(-Periodo, -Mes) %>%
dplyr::rename("Vol. chuva (i-1)"= Vol01Dia,
"Vol. condicional (i-1)+(i-2)" = Vol02Dias,
"Vol. condicional (i-1)+(i-2)+(i-3)"= Vol03Dias,
"Vol. chuva (i-2)" = Anterior)
GGally::ggpairs(chuvax) +
labs(title = "Correlação, dispersão e distribuição de dados de chuva diária [mm]") +
theme(axis.text.x = element_text(size = 5)) +
theme_bw(base_size = 8)
chuvax <- chuva %>%
dplyr::select(-Mes) %>%
reshape2::melt(id.vars = c("Periodo")) %>%
dplyr::rename("Dia" = variable,
"Vol" = value) %>%
dplyr::mutate(Periodo = as.POSIXct(Periodo),
Periodo = as.Date(Periodo))
levels(chuvax$Dia) <- c("Vol. chuva (i-1)", "Vol. condicional (i-1)+(i-2)",
"Vol. condicional (i-1)+(i-2)+(i-3)", "Vol. chuva (i-2)")
chuvax %>%
ggplot2::ggplot(aes(x = Periodo, y = Vol)) +
geom_area(aes(fill = Dia), col = "black") +
facet_wrap(~Dia) +
scale_x_date(breaks = datebreaks_m,
labels = scales::date_format("%Y-%m-%d")) +
xlab("Período em Análise") + ylab("Volume Chuva [mm/Dia]") +
ggtitle("Ocorrência de Chuvas [mm]") +
tema + theme(axis.text.x = element_text(angle = 90),
legend.position = "bottom")
Performance check of the containment basin against the overflow threshold: overflow should not occur in dry weather (no rain) or under low rainfall depths (\(< 2 mm\)).
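As a quick sanity check of this criterion (a minimal sketch over the df built above), the days with overflow despite rainfall below the 2 mm threshold can be counted directly:
# Days that overflowed even though daily rainfall stayed below 2 mm
df %>%
dplyr::filter(Vol01Dia < 2, Ovfc9 > 0) %>%
nrow()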
pcx1 <- df %>%
ggplot2::ggplot() +
geom_vline(aes(xintercept = 2),
linetype = 2,
size = 1.5,
colour = "red") +
geom_point(aes(x = Vol01Dia,
y = Ovfc9),
shape = 21, col = "blue",
fill = "black",
stroke = 0.1, alpha = 0.9, size = 3) +
scale_x_continuous(breaks = c(0, 2, 10, 25, 50, 75, 100),
limits = c(0, 100)) +
xlab("Vol. Chuva Diária [mm]") +
ylab("Transbordamentos [m³]") +
tema
plotly::ggplotly(pcx1)
# Overflows occurring even with rainfall volumes as low as 2 mm
# indicate that the equipment cannot assimilate the inflow.
# https://doi.org/10.1016/j.scitotenv.2014.10.087
pcx2 <- df %>%
ggplot2::ggplot() +
geom_vline(aes(xintercept = 2),
linetype = 2,
size = 1.5,
colour = "red") +
geom_point(aes(x = Vol01Dia,
y = tempoverc9),
shape = 21, col = "blue",
fill = "black",
stroke = 0.1, alpha = 0.9, size = 3) +
scale_x_continuous(breaks = c(0, 2, 10, 25, 50, 75, 100),
limits = c(0, 100)) +
xlab("Vol. Chuva Diária [mm]") +
ylab("Tempo em Transbordamento [min/dia de ocorrência]") +
tema
plotly::ggplotly(pcx2)
psave_lim <- cowplot::plot_grid(pcx1, pcx2, align = "hv")
psave_lim
Data preparation and construction of the daily accumulated rainfall plot.
df_ch <- df %>%
dplyr::select(Periodo, Vol01Dia)
px1 <- ggplot2::ggplot(df_ch) +
geom_bar(aes(x = Periodo, y = Vol01Dia),
stat="identity", fill = "blue", col = "blue3") +
geom_hline(aes(yintercept = 0), colour="blue4") +
scale_y_reverse(name = "[mm]") +
ggtitle("Precipitação Pluviométrica [mm/dia]") +
tema + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
Adjustment of the daily median level. For this series, which mixes several distribution shapes and parameter values, the median is a good metric: it preserves the behavior of the level over time without loss of information.
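A toy illustration of this robustness (hypothetical values, not taken from the series): a single short spike drags the mean upward while the median stays representative of the typical level.
nivel_dia <- c(10, 10, 11, 12, 95) # hypothetical minute readings with one spike
mean(nivel_dia) # 27.6, pulled up by the spike
median(nivel_dia) # 11, still representative of the day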
df_c9 <- df %>%
dplyr::select(Periodo, MEDIANA_CET09, StatusC9) %>%
reshape2::melt(c("Periodo", "StatusC9"))
px2 <- ggplot2::ggplot(df_c9) +
geom_line(aes(x = Periodo,
y = value,
group = 1),
size = 1, alpha = 0.9) +
geom_point(aes(x = Periodo,
y = value,
col = StatusC9),
size = 0.9, alpha = 1) +
scale_color_manual(values = c("red", "black"),
name = "Legenda") +
scale_x_date(name = "Período em Análise",
breaks = datebreaks_m,
labels = date_format("%b/%y")) +
scale_y_continuous(name = "Nível %",
limits = c(0, 100),
breaks = seq(0, 100, by = 25)) +
ggtitle("Mediana Diária do Nível %") +
guides(colour = guide_legend(override.aes = list(size = 2))) +
tema + theme(legend.position = "bottom")
px2
df_c9 <- df %>%
dplyr::select(Periodo, pumpc9)
Plot of the pump run time per day.
px3 <- ggplot2::ggplot(df_c9) +
geom_line(aes(x = Periodo,
y = pumpc9,
group = 1),
size = 1, alpha = 0.7) +
geom_point(aes(x = Periodo,
y = pumpc9,
group = 1),
size = 0.3) +
scale_x_date(name = "Período em Análise") +
scale_y_continuous(name = "[min/dia]") +
ggtitle("Tempo Bomba Ligada [min/dia] ") +
tema + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
px <- cowplot::plot_grid(px1, px3, px2, align = "hv",
ncol = 1, rel_heights = c(0.45, 0.45, 1)) +
theme(plot.background = element_rect(color = "black"))
psave_3 <- px
px
# title <- cowplot::ggdraw() + cowplot::draw_label("Mediana do Nível % e Chuva Acumulada Diária [mm]",
# fontface='bold')
#
# plot_grid(title, px, ncol = 1, rel_heights = c(0.1, 1))
Since the data of the phenomenon under analysis are nonlinear, and rainfall occurrence and intensity are stochastic, predictive models were built with machine learning, using the Random Forest (RF) and KNN algorithms. To model the overflow phenomenon, three scenarios were considered, performing a forward selection of the variables.
However, the first step was to check the balance between the classes.
# unbalanced
unb <- as.data.frame(table(df$StatusC9))
unb
## Var1 Freq
## 1 Extravasando 37
## 2 Normal 220
paste0("Extravasando = ", round(unb[1,2]/sum(unb$Freq),3)*100,"%")
## [1] "Extravasando = 14.4%"
paste0("Op. Normal = ", round(unb[2,2]/sum(unb$Freq),3)*100,"%")
## [1] "Op. Normal = 85.6%"
# Data set adjustment
# Using the information from day i-1 to predict the status of day i
df_check <- df %>% # alignment check: pairs each date with the next day's status
dplyr::mutate(check = Periodo) %>%
dplyr::select(Periodo, check, StatusC9) %>%
dplyr::mutate(check = dplyr::lead(check, 1L),
StatusC9 = dplyr::lead(StatusC9, 1L))
df_modelo <- df %>%
dplyr::mutate(check = Periodo) %>%
dplyr::mutate(check = dplyr::lead(check, 1L),
StatusC9 = dplyr::lead(StatusC9, 1L)) %>%
na.omit() %>%
dplyr::mutate(StatusC9 = as.factor(StatusC9))
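To make the day i-1 to day i alignment explicit, dplyr::lead simply shifts a vector one position forward, so each row ends up paired with the following day's status (a minimal illustration):
dplyr::lead(c("Normal", "Normal", "Extravasando"), 1L)
# returns "Normal" "Extravasando" NA -- the trailing NA is what na.omit() removes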
# Search grid for the best k
grid <- expand.grid(k = 1:25)
# This interval covers the usual empirical optimum, k ~ sqrt(n):
# sqrt(nrow(df_modelo)) is about 16, well inside the 1:25 grid
Models built with the variables described for scenario 1.
# Data set for scenario 1
modelo1 <- df_modelo %>%
dplyr::select(MEDIANA_CET09, MAX_CET09, DP_CET09, RANGE_C9,
Vol01Dia, StatusC9)
For each fitted model it will be possible to view the errors, the confusion/contingency matrix, and the prediction-evaluation metrics provided by the caret package itself. For all models, a threshold of \(p>0.5\) was used to flag an overflow occurrence.
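Note that caret::confusionMatrix takes the first factor level as the positive class; with the alphabetical levels used here that is Extravasando, consistent with the 'Positive' Class line in the outputs below, as a quick check confirms:
levels(df_modelo$StatusC9) # "Extravasando" "Normal"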
knn.
KNN model on the original data, without resampling.
# knn original m1 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE)
model1_orig_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model1_orig_knn
## k-Nearest Neighbors
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8457602 0.3480780
## 2 0.8199123 0.2543356
## 3 0.8293567 0.3178866
## 4 0.8557310 0.3693473
## 5 0.8759649 0.4458664
## 6 0.8657018 0.4035748
## 7 0.8762573 0.4224722
## 8 0.8654386 0.3801175
## 9 0.8654386 0.3885640
## 10 0.8704386 0.3966721
## 11 0.8754386 0.4139135
## 12 0.8604386 0.3537429
## 13 0.8654386 0.3945051
## 14 0.8601754 0.3631109
## 15 0.8601754 0.3631109
## 16 0.8704386 0.4404510
## 17 0.8704386 0.4404510
## 18 0.8704386 0.4320046
## 19 0.8651754 0.4090569
## 20 0.8451754 0.3263121
## 21 0.8451754 0.2780362
## 22 0.8604386 0.3820835
## 23 0.8446491 0.2050344
## 24 0.8551754 0.2096920
## 25 0.8496491 0.1809951
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
# plot(model1_orig_knn)
# plot(model1_orig_knn, print.thres = 0.5, type = "S")
final_m1_knn_ori <- data.frame(Observado = test$StatusC9,
predict(model1_orig_knn, newdata = test, type = "prob"))
final_m1_knn_ori$predict <- ifelse(final_m1_knn_ori$Extravasando > 0.5, "Extravasando", "Normal")
final_m1_knn_ori$predict <- as.factor(final_m1_knn_ori$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm_m1_knn_ori <- caret::confusionMatrix(final_m1_knn_ori$predict, test$StatusC9)
cm_m1_knn_ori
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 4 3
## Normal 5 51
##
## Accuracy : 0.873
## 95% CI : (0.765, 0.9435)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.4454
##
## Kappa : 0.4286
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.44444
## Specificity : 0.94444
## Pos Pred Value : 0.57143
## Neg Pred Value : 0.91071
## Prevalence : 0.14286
## Detection Rate : 0.06349
## Detection Prevalence : 0.11111
## Balanced Accuracy : 0.69444
##
## 'Positive' Class : Extravasando
##
KNN model with undersampling.
# KNN 'under' m1 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "down")
model1_down_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model1_down_knn
## k-Nearest Neighbors
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7219883 0.2654723
## 2 0.7572515 0.3488258
## 3 0.7835380 0.4166307
## 4 0.7527193 0.3773274
## 5 0.8251462 0.5101479
## 6 0.7883041 0.3847893
## 7 0.7732456 0.3979395
## 8 0.8143567 0.4361424
## 9 0.7926901 0.4254737
## 10 0.7976901 0.4278678
## 11 0.7932164 0.4589435
## 12 0.7940936 0.4156918
## 13 0.7893567 0.4408807
## 14 0.7574269 0.4117262
## 15 0.7793567 0.4334973
## 16 0.7782749 0.4360145
## 17 0.7726901 0.4310197
## 18 0.7838304 0.4321168
## 19 0.7571637 0.4122224
## 20 0.7571637 0.4115826
## 21 0.7619006 0.4132948
## 22 0.7838304 0.4422755
## 23 0.7616374 0.4119332
## 24 0.7571637 0.4005608
## 25 0.7677193 0.4198392
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# plot(model1_down_knn)
# plot(model1_down_knn, print.thres = 0.5, type = "S")
final1_down_knn <- data.frame(Observado = test$StatusC9,
predict(model1_down_knn, newdata = test, type = "prob"))
final1_down_knn$predict <- ifelse(final1_down_knn$Extravasando > 0.5, "Extravasando", "Normal")
final1_down_knn$predict <- as.factor(final1_down_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm1_down_knn <- caret::confusionMatrix(final1_down_knn$predict, test$StatusC9)
cm1_down_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 9 8
## Normal 0 46
##
## Accuracy : 0.873
## 95% CI : (0.765, 0.9435)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.44539
##
## Kappa : 0.6216
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 1.0000
## Specificity : 0.8519
## Pos Pred Value : 0.5294
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.2698
## Balanced Accuracy : 0.9259
##
## 'Positive' Class : Extravasando
##
KNN model with oversampling.
# KNN 'up' m1 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "up")
model1_up_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model1_up_knn
## k-Nearest Neighbors
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using up-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8302047 0.2756892
## 2 0.8046784 0.3377178
## 3 0.7583041 0.3137063
## 4 0.7535673 0.3450524
## 5 0.7430117 0.3437471
## 6 0.7427485 0.3436587
## 7 0.7224561 0.3278827
## 8 0.7224854 0.2995383
## 9 0.7538304 0.3790445
## 10 0.7585088 0.4056364
## 11 0.7674561 0.3720318
## 12 0.7440936 0.3468519
## 13 0.7577193 0.3558252
## 14 0.7790643 0.4206839
## 15 0.7838304 0.3997435
## 16 0.7741228 0.3835833
## 17 0.7999123 0.4430723
## 18 0.8001754 0.4409596
## 19 0.7996491 0.4318265
## 20 0.7729825 0.3975916
## 21 0.7949123 0.4362453
## 22 0.7943860 0.4263689
## 23 0.7993860 0.4342749
## 24 0.8035380 0.4351431
## 25 0.7935673 0.4240969
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
# plot(model1_up_knn)
# plot(model1_up_knn, print.thres = 0.5, type = "S")
final1_up_knn <- data.frame(Observado = test$StatusC9,
predict(model1_up_knn, newdata = test, type = "prob"))
final1_up_knn$predict <- ifelse(final1_up_knn$Extravasando > 0.5, "Extravasando", "Normal")
final1_up_knn$predict <- as.factor(final1_up_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm1_up_knn <- caret::confusionMatrix(final1_up_knn$predict, test$StatusC9)
cm1_up_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 6 4
## Normal 3 50
##
## Accuracy : 0.8889
## 95% CI : (0.7844, 0.9541)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.3057
##
## Kappa : 0.5664
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.66667
## Specificity : 0.92593
## Pos Pred Value : 0.60000
## Neg Pred Value : 0.94340
## Prevalence : 0.14286
## Detection Rate : 0.09524
## Detection Prevalence : 0.15873
## Balanced Accuracy : 0.79630
##
## 'Positive' Class : Extravasando
##
KNN model with ROSE.
# KNN 'rose' ----
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "rose")
model1_r_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model1_r_knn
## k-Nearest Neighbors
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using ROSE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8040936 0.3114802
## 2 0.7680117 0.3271554
## 3 0.7727193 0.3406117
## 4 0.8354386 0.4852551
## 5 0.8143567 0.4256848
## 6 0.7835965 0.3770804
## 7 0.8098830 0.4737488
## 8 0.7990936 0.3956600
## 9 0.8451462 0.5018523
## 10 0.7835673 0.3910869
## 11 0.7885380 0.4268851
## 12 0.8146199 0.4822169
## 13 0.7882456 0.4084584
## 14 0.8249123 0.4781986
## 15 0.8518129 0.5702730
## 16 0.8090643 0.4654193
## 17 0.8043567 0.4505869
## 18 0.8038304 0.4444237
## 19 0.7938012 0.4226346
## 20 0.7988012 0.4520727
## 21 0.7893567 0.4526069
## 22 0.8251462 0.5075308
## 23 0.7893567 0.4280918
## 24 0.8088012 0.4407426
## 25 0.8049123 0.4474972
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 15.
# plot(model1_r_knn)
# plot(model1_r_knn, print.thres = 0.5, type = "S")
final1_r_knn <- data.frame(Observado = test$StatusC9,
predict(model1_r_knn, newdata = test, type = "prob"))
final1_r_knn$predict <- ifelse(final1_r_knn$Extravasando > 0.5, "Extravasando", "Normal")
final1_r_knn$predict <- as.factor(final1_r_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm1_r_knn <- confusionMatrix(final1_r_knn$predict, test$StatusC9)
cm1_r_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 9 5
## Normal 0 49
##
## Accuracy : 0.9206
## 95% CI : (0.8244, 0.9737)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.09773
##
## Kappa : 0.7368
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 1.0000
## Specificity : 0.9074
## Pos Pred Value : 0.6429
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.2222
## Balanced Accuracy : 0.9537
##
## 'Positive' Class : Extravasando
##
rf.
RF model on the original data, without resampling.
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE)
model1_orig_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model1_orig_rf
## Random Forest
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8818129 0.4668418
## 3 0.8768129 0.4543333
## 5 0.8820760 0.4623226
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
# plot(model1_orig_rf)
# plot(model1_orig_rf, print.thres = 0.5, type = "S")
final1_orig_rf <- data.frame(Observado = test$StatusC9,
predict(model1_orig_rf, newdata = test, type = "prob"))
final1_orig_rf$predict <- ifelse(final1_orig_rf$Extravasando > 0.5, "Extravasando", "Normal")
final1_orig_rf$predict <- as.factor(final1_orig_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm1_orig_rf <- confusionMatrix(final1_orig_rf$predict, test$StatusC9)
cm1_orig_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 7 2
## Normal 2 52
##
## Accuracy : 0.9365
## 95% CI : (0.8453, 0.9824)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.04297
##
## Kappa : 0.7407
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.7778
## Specificity : 0.9630
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.9630
## Prevalence : 0.1429
## Detection Rate : 0.1111
## Detection Prevalence : 0.1429
## Balanced Accuracy : 0.8704
##
## 'Positive' Class : Extravasando
##
RF model with undersampling.
# RF 'under' m1 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "down")
model1_down_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model1_down_rf
## Random Forest
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8348830 0.5150832
## 3 0.8246199 0.4882011
## 5 0.8409942 0.4863122
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
# plot(model1_down_rf)
# plot(model1_down_rf, print.thres = 0.5, type = "S")
final1_down_rf <- data.frame(Observado = test$StatusC9,
predict(model1_down_rf, newdata = test, type = "prob"))
final1_down_rf$predict <- ifelse(final1_down_rf$Normal > 0.5, "Normal", "Extravasando")
final1_down_rf$predict <- as.factor(final1_down_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm1_down_rf <- confusionMatrix(final1_down_rf$predict, test$StatusC9)
cm1_down_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 9 11
## Normal 0 43
##
## Accuracy : 0.8254
## 95% CI : (0.709, 0.9095)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.818553
##
## Kappa : 0.5276
##
## Mcnemar's Test P-Value : 0.002569
##
## Sensitivity : 1.0000
## Specificity : 0.7963
## Pos Pred Value : 0.4500
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.3175
## Balanced Accuracy : 0.8981
##
## 'Positive' Class : Extravasando
##
RF model with oversampling.
# RF 'up' m1 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "up")
model1_up_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model1_up_rf
## Random Forest
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using up-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8873392 0.5045829
## 3 0.8873392 0.5045829
## 5 0.8817836 0.4790469
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# plot(model1_up_rf)
# plot(model1_up_rf, print.thres = 0.5, type = "S")
final1_up_rf <- data.frame(Observado = test$StatusC9,
predict(model1_up_rf, newdata = test, type = "prob"))
final1_up_rf$predict <- ifelse(final1_up_rf$Extravasando > 0.5, "Extravasando", "Normal")
final1_up_rf$predict <- as.factor(final1_up_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm1_up_rf <- confusionMatrix(final1_up_rf$predict, test$StatusC9)
cm1_up_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 8 3
## Normal 1 51
##
## Accuracy : 0.9365
## 95% CI : (0.8453, 0.9824)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.04297
##
## Kappa : 0.7627
##
## Mcnemar's Test P-Value : 0.61708
##
## Sensitivity : 0.8889
## Specificity : 0.9444
## Pos Pred Value : 0.7273
## Neg Pred Value : 0.9808
## Prevalence : 0.1429
## Detection Rate : 0.1270
## Detection Prevalence : 0.1746
## Balanced Accuracy : 0.9167
##
## 'Positive' Class : Extravasando
##
RF model with ROSE.
# RF 'rose' m1 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo1$StatusC9, p = 0.75, list = FALSE)
train <- modelo1[index, ]
test <- modelo1[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "rose")
model1_r_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model1_r_rf
## Random Forest
##
## 193 samples
## 5 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (5), centered (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using ROSE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8043567 0.4475872
## 3 0.8251754 0.4922365
## 5 0.7946491 0.4224102
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# plot(model1_r_rf)
# plot(model1_r_rf, print.thres = 0.5, type = "S")
final1_r_rf <- data.frame(Observado = test$StatusC9,
predict(model1_r_rf, newdata = test, type = "prob"))
final1_r_rf$predict <- ifelse(final1_r_rf$Extravasando > 0.5, "Extravasando", "Normal")
final1_r_rf$predict <- as.factor(final1_r_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm1_r_rf <- confusionMatrix(final1_r_rf$predict, test$StatusC9)
cm1_r_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 9 8
## Normal 0 46
##
## Accuracy : 0.873
## 95% CI : (0.765, 0.9435)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.44539
##
## Kappa : 0.6216
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 1.0000
## Specificity : 0.8519
## Pos Pred Value : 0.5294
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.2698
## Balanced Accuracy : 0.9259
##
## 'Positive' Class : Extravasando
##
Preparation of the data set for building the scenario 2 models.
modelo2 <- df_modelo %>%
dplyr::select(MEDIANA_CET09, MAX_CET09, DP_CET09, RANGE_C9,
Vol01Dia, StatusC9,
Vol02Dias, Vol03Dias, Anterior, pumpc9)
Construction of the predictive models.
knn.
KNN model on the original data, without resampling.
# knn original m2 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE)
model2_orig_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model2_orig_knn
## k-Nearest Neighbors
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8604386 0.373935671
## 2 0.8399415 0.247780209
## 3 0.8554386 0.351932111
## 4 0.8607310 0.360828042
## 5 0.8654678 0.291630794
## 6 0.8554678 0.277049252
## 7 0.8857310 0.349341346
## 8 0.8757310 0.324964027
## 9 0.8857310 0.349341346
## 10 0.8754678 0.295219925
## 11 0.8754678 0.278020908
## 12 0.8549415 0.140838764
## 13 0.8602047 0.140414730
## 14 0.8499415 0.100912474
## 15 0.8499415 0.100912474
## 16 0.8552047 0.106798877
## 17 0.8602047 0.114906985
## 18 0.8602047 0.114906985
## 19 0.8604678 0.083552124
## 20 0.8604678 0.045714286
## 21 0.8552047 0.000000000
## 22 0.8502047 -0.008108108
## 23 0.8502047 -0.008108108
## 24 0.8502047 -0.008108108
## 25 0.8552047 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# plot(model2_orig_knn)
# plot(model2_orig_knn, print.thres = 0.5, type = "S")
final_m2_knn_ori <- data.frame(Observado = test$StatusC9,
predict(model2_orig_knn, newdata = test, type = "prob"))
final_m2_knn_ori$predict <- ifelse(final_m2_knn_ori$Extravasando > 0.5, "Extravasando", "Normal")
final_m2_knn_ori$predict <- as.factor(final_m2_knn_ori$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm_m2_knn_ori <- caret::confusionMatrix(final_m2_knn_ori$predict, test$StatusC9)
cm_m2_knn_ori
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 3 1
## Normal 6 53
##
## Accuracy : 0.8889
## 95% CI : (0.7844, 0.9541)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.3057
##
## Kappa : 0.4096
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.33333
## Specificity : 0.98148
## Pos Pred Value : 0.75000
## Neg Pred Value : 0.89831
## Prevalence : 0.14286
## Detection Rate : 0.04762
## Detection Prevalence : 0.06349
## Balanced Accuracy : 0.65741
##
## 'Positive' Class : Extravasando
##
KNN model with undersampling.
# KNN 'under' m2 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "down")
model2_down_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model2_down_knn
## k-Nearest Neighbors
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7275146 0.3051389
## 2 0.7422515 0.3270325
## 3 0.7569298 0.3266529
## 4 0.7324854 0.3409779
## 5 0.7998830 0.4410100
## 6 0.8210526 0.4752704
## 7 0.7627193 0.3933176
## 8 0.8096491 0.4485293
## 9 0.7826901 0.4377804
## 10 0.7835380 0.3974333
## 11 0.7829532 0.4412477
## 12 0.7727193 0.4150506
## 13 0.7735673 0.4122965
## 14 0.7887719 0.4654369
## 15 0.7876901 0.4479515
## 16 0.7780117 0.4154033
## 17 0.7879825 0.4481874
## 18 0.8038012 0.4589723
## 19 0.7882456 0.4450741
## 20 0.7932456 0.4547791
## 21 0.7879825 0.4481631
## 22 0.8138012 0.4816816
## 23 0.7932456 0.4509858
## 24 0.8185088 0.4838230
## 25 0.8095906 0.4761403
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 6.
# plot(model2_down_knn)
# plot(model2_down_knn, print.thres = 0.5, type = "S")
final2_down_knn <- data.frame(Observado = test$StatusC9,
predict(model2_down_knn, newdata = test, type = "prob"))
final2_down_knn$predict <- ifelse(final2_down_knn$Extravasando > 0.5, "Extravasando", "Normal")
final2_down_knn$predict <- as.factor(final2_down_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm2_down_knn <- caret::confusionMatrix(final2_down_knn$predict, test$StatusC9)
cm2_down_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 7 5
## Normal 2 49
##
## Accuracy : 0.8889
## 95% CI : (0.7844, 0.9541)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.3057
##
## Kappa : 0.6016
##
## Mcnemar's Test P-Value : 0.4497
##
## Sensitivity : 0.7778
## Specificity : 0.9074
## Pos Pred Value : 0.5833
## Neg Pred Value : 0.9608
## Prevalence : 0.1429
## Detection Rate : 0.1111
## Detection Prevalence : 0.1905
## Balanced Accuracy : 0.8426
##
## 'Positive' Class : Extravasando
##
KNN model with oversampling.
# KNN 'up' m2----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "up")
model2_up_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model2_up_knn
## k-Nearest Neighbors
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using up-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8348830 0.3258801
## 2 0.8246199 0.3444388
## 3 0.7996491 0.3602300
## 4 0.7688596 0.3517557
## 5 0.7428070 0.3401001
## 6 0.7475439 0.3687846
## 7 0.7111404 0.3407968
## 8 0.7061404 0.3204530
## 9 0.7061988 0.3087804
## 10 0.7171930 0.3414329
## 11 0.7158772 0.3040469
## 12 0.7430409 0.3322239
## 13 0.7211404 0.2995495
## 14 0.7474561 0.3467629
## 15 0.7316667 0.3116439
## 16 0.7530117 0.3807705
## 17 0.7638012 0.3855849
## 18 0.7538304 0.3729674
## 19 0.7361404 0.3070553
## 20 0.7490936 0.3624218
## 21 0.7474561 0.3669236
## 22 0.7729825 0.4032428
## 23 0.7835673 0.3977579
## 24 0.7738012 0.4153541
## 25 0.7524854 0.3831014
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
# plot(model2_up_knn)
# plot(model2_up_knn, print.thres = 0.5, type = "S")
final2_up_knn <- data.frame(Observado = test$StatusC9,
predict(model2_up_knn, newdata = test, type = "prob"))
final2_up_knn$predict <- ifelse(final2_up_knn$Extravasando > 0.5, "Extravasando", "Normal")
final2_up_knn$predict <- as.factor(final2_up_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm2_up_knn <- caret::confusionMatrix(final2_up_knn$predict, test$StatusC9)
cm2_up_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 5 2
## Normal 4 52
##
## Accuracy : 0.9048
## 95% CI : (0.8041, 0.9642)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.1860
##
## Kappa : 0.5714
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.55556
## Specificity : 0.96296
## Pos Pred Value : 0.71429
## Neg Pred Value : 0.92857
## Prevalence : 0.14286
## Detection Rate : 0.07937
## Detection Prevalence : 0.11111
## Balanced Accuracy : 0.75926
##
## 'Positive' Class : Extravasando
##
KNN model with ROSE.
# KNN 'rose' m2 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "rose")
model2_r_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model2_r_knn
## k-Nearest Neighbors
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using ROSE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8554678 0.3954751
## 2 0.8338304 0.2856434
## 3 0.8293860 0.3104005
## 4 0.8140936 0.2403190
## 5 0.8657018 0.4342187
## 6 0.8246491 0.2527696
## 7 0.8152339 0.2438735
## 8 0.8340643 0.3435023
## 9 0.8507018 0.3611746
## 10 0.8243567 0.3149141
## 11 0.8454678 0.4502157
## 12 0.8396491 0.3607766
## 13 0.8188012 0.3864216
## 14 0.8652047 0.4478241
## 15 0.8709942 0.3571308
## 16 0.8457602 0.4170751
## 17 0.8446199 0.4157717
## 18 0.8601754 0.4235092
## 19 0.8404678 0.3634873
## 20 0.8598830 0.3977461
## 21 0.8659649 0.4917839
## 22 0.8690643 0.4769531
## 23 0.8412865 0.3753766
## 24 0.8490936 0.3831323
## 25 0.8504678 0.4039870
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 15.
# plot(model2_r_knn)
# plot(model2_r_knn, print.thres = 0.5, type = "S")
final2_r_knn <- data.frame(Observado = test$StatusC9,
predict(model2_r_knn, newdata = test, type = "prob"))
final2_r_knn$predict <- ifelse(final2_r_knn$Extravasando > 0.5, "Extravasando", "Normal")
final2_r_knn$predict <- as.factor(final2_r_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm2_r_knn <- caret::confusionMatrix(final2_r_knn$predict, test$StatusC9)
cm2_r_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 4 0
## Normal 5 54
##
## Accuracy : 0.9206
## 95% CI : (0.8244, 0.9737)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.09773
##
## Kappa : 0.5783
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.44444
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.91525
## Prevalence : 0.14286
## Detection Rate : 0.06349
## Detection Prevalence : 0.06349
## Balanced Accuracy : 0.72222
##
## 'Positive' Class : Extravasando
##
rf.
RF model on the original data, without resampling.
# RF 'original' m2 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE)
model2_orig_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model2_orig_rf
## Random Forest
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8920760 0.4968054
## 5 0.8967836 0.5010313
## 9 0.8967836 0.5010313
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
# plot(model2_orig_rf)
# plot(model2_orig_rf, print.thres = 0.5, type = "S")
final2_orig_rf <- data.frame(Observado = test$StatusC9,
predict(model2_orig_rf, newdata = test, type = "prob"))
final2_orig_rf$predict <- ifelse(final2_orig_rf$Extravasando > 0.5, "Extravasando", "Normal")
final2_orig_rf$predict <- as.factor(final2_orig_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm2_orig_rf <- caret::confusionMatrix(final2_orig_rf$predict, test$StatusC9)
cm2_orig_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 5 2
## Normal 4 52
##
## Accuracy : 0.9048
## 95% CI : (0.8041, 0.9642)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.1860
##
## Kappa : 0.5714
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.55556
## Specificity : 0.96296
## Pos Pred Value : 0.71429
## Neg Pred Value : 0.92857
## Prevalence : 0.14286
## Detection Rate : 0.07937
## Detection Prevalence : 0.11111
## Balanced Accuracy : 0.75926
##
## 'Positive' Class : Extravasando
##
RF model with undersampling.
# RF 'under' m2 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "down")
model2_down_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model2_down_rf
## Random Forest
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8398830 0.5160191
## 5 0.8343567 0.5179935
## 9 0.8609649 0.5499552
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
# plot(model2_down_rf)
# plot(model2_down_rf, print.thres = 0.5, type = "S")
final2_down_rf <- data.frame(Observado = test$StatusC9,
predict(model2_down_rf, newdata = test, type = "prob"))
final2_down_rf$predict <- ifelse(final2_down_rf$Extravasando > 0.5, "Extravasando", "Normal")
final2_down_rf$predict <- as.factor(final2_down_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm2_down_rf <- caret::confusionMatrix(final2_down_rf$predict, test$StatusC9)
cm2_down_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 7 6
## Normal 2 48
##
## Accuracy : 0.873
## 95% CI : (0.765, 0.9435)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.4454
##
## Kappa : 0.5625
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.7778
## Specificity : 0.8889
## Pos Pred Value : 0.5385
## Neg Pred Value : 0.9600
## Prevalence : 0.1429
## Detection Rate : 0.1111
## Detection Prevalence : 0.2063
## Balanced Accuracy : 0.8333
##
## 'Positive' Class : Extravasando
##
RF model with oversampling.
# RF 'up' m2 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "up")
model2_up_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model2_up_rf
## Random Forest
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using up-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8865205 0.4862377
## 5 0.9070468 0.5458972
## 9 0.9123099 0.6045885
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
# plot(model2_up_rf)
# plot(model2_up_rf, print.thres = 0.5, type = "S")
final2_up_rf <- data.frame(Observado = test$StatusC9,
predict(model2_up_rf, newdata = test, type = "prob"))
final2_up_rf$predict <- ifelse(final2_up_rf$Extravasando > 0.5, "Extravasando", "Normal")
final2_up_rf$predict <- as.factor(final2_up_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm2_up_rf <- caret::confusionMatrix(final2_up_rf$predict, test$StatusC9)
cm2_up_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 6 4
## Normal 3 50
##
## Accuracy : 0.8889
## 95% CI : (0.7844, 0.9541)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.3057
##
## Kappa : 0.5664
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.66667
## Specificity : 0.92593
## Pos Pred Value : 0.60000
## Neg Pred Value : 0.94340
## Prevalence : 0.14286
## Detection Rate : 0.09524
## Detection Prevalence : 0.15873
## Balanced Accuracy : 0.79630
##
## 'Positive' Class : Extravasando
##
RF model with ROSE.
# RF 'rose' m2 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo2$StatusC9, p = 0.75, list = FALSE)
train <- modelo2[index, ]
test <- modelo2[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "rose")
model2_r_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model2_r_rf
## Random Forest
##
## 193 samples
## 9 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using ROSE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8188304 0.4126961
## 5 0.8459942 0.5287402
## 9 0.8046199 0.4459668
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
# plot(model2_r_rf)
# plot(model2_r_rf, print.thres = 0.5, type = "S")
final2_r_rf <- data.frame(Observado = test$StatusC9,
predict(model2_r_rf, newdata = test, type = "prob"))
final2_r_rf$predict <- ifelse(final2_r_rf$Extravasando > 0.5, "Extravasando", "Normal")
final2_r_rf$predict <- as.factor(final2_r_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm2_r_rf <- caret::confusionMatrix(final2_r_rf$predict, test$StatusC9)
cm2_r_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 9 9
## Normal 0 45
##
## Accuracy : 0.8571
## 95% CI : (0.7461, 0.9325)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.587669
##
## Kappa : 0.5882
##
## Mcnemar's Test P-Value : 0.007661
##
## Sensitivity : 1.0000
## Specificity : 0.8333
## Pos Pred Value : 0.5000
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.2857
## Balanced Accuracy : 0.9167
##
## 'Positive' Class : Extravasando
##
Preparation of the data set for building the models of scenario 3.
modelo3 <- df_modelo %>%
dplyr::select(MEDIANA_CET09, MAX_CET09, DP_CET09, RANGE_C9,
Vol01Dia, StatusC9,
Vol02Dias, Vol03Dias, Anterior, pumpc9,
Ovfc9, tempoverc9, MAX11H_CET09)
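As a quick sanity check (a side sketch, not in the original script), the class imbalance that motivates the resampling strategies can be confirmed directly on the response:
table(modelo3$StatusC9) # absolute counts per class
round(prop.table(table(modelo3$StatusC9)), 2) # class proportions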
Construction of the predictive models.
knn.
KNN model on the original data, without resampling.
# knn original m3 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE)
model3_orig_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model3_orig_knn
## k-Nearest Neighbors
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8712573 0.3887241
## 2 0.8818129 0.4853511
## 3 0.8812573 0.4216539
## 4 0.8707310 0.3607798
## 5 0.9015205 0.4323362
## 6 0.9065205 0.4464639
## 7 0.9115205 0.4605917
## 8 0.9065205 0.4464639
## 9 0.9165205 0.4919185
## 10 0.9112573 0.4605243
## 11 0.9115205 0.4605917
## 12 0.9007310 0.3977360
## 13 0.9007310 0.4232438
## 14 0.9009942 0.3978034
## 15 0.8954678 0.3918496
## 16 0.9062865 0.4617360
## 17 0.9062865 0.4617360
## 18 0.9115497 0.4931302
## 19 0.9062865 0.4617360
## 20 0.9062865 0.4617360
## 21 0.9012865 0.4304093
## 22 0.9012865 0.4304093
## 23 0.9012865 0.4304093
## 24 0.9015497 0.3930347
## 25 0.9065497 0.4389807
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# plot(model3_orig_knn)
# plot(model3_orig_knn, print.thres = 0.5, type = "S")
final_m3_knn_ori <- data.frame(Observado = test$StatusC9,
predict(model3_orig_knn, newdata = test, type = "prob"))
final_m3_knn_ori$predict <- ifelse(final_m3_knn_ori$Extravasando > 0.5, "Extravasando", "Normal")
final_m3_knn_ori$predict <- as.factor(final_m3_knn_ori$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm_m3_knn_ori <- caret::confusionMatrix(final_m3_knn_ori$predict, test$StatusC9)
cm_m3_knn_ori
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 2 0
## Normal 7 54
##
## Accuracy : 0.8889
## 95% CI : (0.7844, 0.9541)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.30569
##
## Kappa : 0.3288
##
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 0.22222
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.88525
## Prevalence : 0.14286
## Detection Rate : 0.03175
## Detection Prevalence : 0.03175
## Balanced Accuracy : 0.61111
##
## 'Positive' Class : Extravasando
##
KNN model with undersampling.
# KNN 'under' m3 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "down")
model3_down_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model3_down_knn
## k-Nearest Neighbors
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7580409 0.3487116
## 2 0.7414620 0.2899809
## 3 0.8135380 0.4627791
## 4 0.7746491 0.4098027
## 5 0.8454386 0.4930722
## 6 0.8357310 0.4782013
## 7 0.7835088 0.4516003
## 8 0.8407310 0.5067918
## 9 0.8295906 0.5198483
## 10 0.8407018 0.4687983
## 11 0.8132164 0.4602499
## 12 0.8354386 0.5244490
## 13 0.8615497 0.5730035
## 14 0.8145906 0.4833847
## 15 0.8615205 0.5101008
## 16 0.8557018 0.5598614
## 17 0.8393275 0.4664166
## 18 0.8615205 0.5759126
## 19 0.8454386 0.4833937
## 20 0.8654094 0.5754428
## 21 0.8654094 0.5693514
## 22 0.8548830 0.5235823
## 23 0.8651462 0.5547538
## 24 0.8848830 0.5509329
## 25 0.8812573 0.6057154
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 24.
# plot(model3_down_knn)
# plot(model3_down_knn, print.thres = 0.5, type = "S")
final3_down_knn <- data.frame(Observado = test$StatusC9,
predict(model3_down_knn, newdata = test, type = "prob"))
final3_down_knn$predict <- ifelse(final3_down_knn$Extravasando > 0.5, "Extravasando", "Normal")
final3_down_knn$predict <- as.factor(final3_down_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm3_down_knn <- caret::confusionMatrix(final3_down_knn$predict, test$StatusC9)
cm3_down_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 9 5
## Normal 0 49
##
## Accuracy : 0.9206
## 95% CI : (0.8244, 0.9737)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.09773
##
## Kappa : 0.7368
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 1.0000
## Specificity : 0.9074
## Pos Pred Value : 0.6429
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.2222
## Balanced Accuracy : 0.9537
##
## 'Positive' Class : Extravasando
##
KNN model with oversampling.
# KNN 'up' m3----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "up")
model3_up_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model3_up_knn
## k-Nearest Neighbors
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using up-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8667836 0.4418303
## 2 0.8454386 0.4874792
## 3 0.8201754 0.4300384
## 4 0.8099415 0.4526167
## 5 0.7893860 0.4224052
## 6 0.7694152 0.3934421
## 7 0.7724561 0.4084034
## 8 0.7264035 0.2877401
## 9 0.7366959 0.3122513
## 10 0.7577193 0.3776564
## 11 0.7563743 0.3315763
## 12 0.7322222 0.2992497
## 13 0.7411111 0.3091463
## 14 0.7677193 0.3799019
## 15 0.7619591 0.3376831
## 16 0.7677485 0.3695775
## 17 0.7738304 0.3822091
## 18 0.7793860 0.3969683
## 19 0.7722222 0.3535007
## 20 0.7990643 0.4494331
## 21 0.7788304 0.3923367
## 22 0.7935380 0.4286329
## 23 0.8146491 0.4343551
## 24 0.8038012 0.4313335
## 25 0.8090936 0.4637515
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
# plot(model3_up_knn)
# plot(model3_up_knn, print.thres = 0.5, type = "S")
final3_up_knn <- data.frame(Observado = test$StatusC9,
predict(model3_up_knn, newdata = test, type = "prob"))
final3_up_knn$predict <- ifelse(final3_up_knn$Extravasando > 0.5, "Extravasando", "Normal")
final3_up_knn$predict <- as.factor(final3_up_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm3_up_knn <- caret::confusionMatrix(final3_up_knn$predict, test$StatusC9)
cm3_up_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 4 0
## Normal 5 54
##
## Accuracy : 0.9206
## 95% CI : (0.8244, 0.9737)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.09773
##
## Kappa : 0.5783
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.44444
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.91525
## Prevalence : 0.14286
## Detection Rate : 0.06349
## Detection Prevalence : 0.06349
## Balanced Accuracy : 0.72222
##
## 'Positive' Class : Extravasando
##
KNN model with ROSE.
# KNN 'rose' m3 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "rose")
model3_r_knn <- caret::train(StatusC9 ~ .,
data = train,
method = "knn",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
tuneGrid = grid)
}
model3_r_knn
## k-Nearest Neighbors
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using ROSE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8921053 0.5081957
## 2 0.9015205 0.5133778
## 3 0.8762865 0.4351727
## 4 0.8757602 0.4263846
## 5 0.8973684 0.5440734
## 6 0.8863158 0.4507099
## 7 0.8821053 0.4584897
## 8 0.8865497 0.5039848
## 9 0.9020760 0.5322804
## 10 0.8809942 0.4431366
## 11 0.8715497 0.5296963
## 12 0.9070760 0.5985232
## 13 0.8762573 0.4886031
## 14 0.9068421 0.5485597
## 15 0.8873684 0.5178448
## 16 0.9023684 0.5889053
## 17 0.9021053 0.5332075
## 18 0.8968421 0.4963275
## 19 0.8865497 0.4833413
## 20 0.9173684 0.5959896
## 21 0.9120760 0.5521169
## 22 0.9220760 0.6037322
## 23 0.8923684 0.5293369
## 24 0.8801754 0.4724861
## 25 0.8971053 0.5291455
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 22.
# plot(model3_r_knn)
# plot(model3_r_knn, print.thres = 0.5, type = "S")
final3_r_knn <- data.frame(Observado = test$StatusC9,
predict(model3_r_knn, newdata = test, type = "prob"))
final3_r_knn$predict <- ifelse(final3_r_knn$Extravasando > 0.5, "Extravasando", "Normal")
final3_r_knn$predict <- as.factor(final3_r_knn$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm3_r_knn <- caret::confusionMatrix(final3_r_knn$predict, test$StatusC9)
cm3_r_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 3 1
## Normal 6 53
##
## Accuracy : 0.8889
## 95% CI : (0.7844, 0.9541)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.3057
##
## Kappa : 0.4096
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.33333
## Specificity : 0.98148
## Pos Pred Value : 0.75000
## Neg Pred Value : 0.89831
## Prevalence : 0.14286
## Detection Rate : 0.04762
## Detection Prevalence : 0.06349
## Balanced Accuracy : 0.65741
##
## 'Positive' Class : Extravasando
##
rf.
RF model on the original data, without resampling.
# RF 'original' m3 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE)
model3_orig_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model3_orig_rf
## Random Forest
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9070760 0.5442280
## 7 0.9273392 0.6297374
## 12 0.9267836 0.6450107
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
# plot(model3_orig_rf)
# plot(model3_orig_rf, print.thres = 0.5, type = "S")
final3_orig_rf <- data.frame(Observado = test$StatusC9,
predict(model3_orig_rf, newdata = test, type = "prob"))
final3_orig_rf$predict <- ifelse(final3_orig_rf$Extravasando > 0.5, "Extravasando", "Normal")
final3_orig_rf$predict <- as.factor(final3_orig_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm3_orig_rf <- caret::confusionMatrix(final3_orig_rf$predict, test$StatusC9)
cm3_orig_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 5 1
## Normal 4 53
##
## Accuracy : 0.9206
## 95% CI : (0.8244, 0.9737)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.09773
##
## Kappa : 0.6237
##
## Mcnemar's Test P-Value : 0.37109
##
## Sensitivity : 0.55556
## Specificity : 0.98148
## Pos Pred Value : 0.83333
## Neg Pred Value : 0.92982
## Prevalence : 0.14286
## Detection Rate : 0.07937
## Detection Prevalence : 0.09524
## Balanced Accuracy : 0.76852
##
## 'Positive' Class : Extravasando
##
RF model with undersampling. For this routine, whose high performance was already known beforehand, the processing time is measured with the Sys.time function. The computer used to build the models has an Intel Core i7 3610QM (3rd generation) 2.3 GHz processor and 8 GB of DDR3 1600 MHz RAM.
start.time <- Sys.time()
# RF 'under' m3 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "down")
model3_down_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model3_down_rf
## Random Forest
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8706725 0.5536617
## 7 0.8704094 0.5975907
## 12 0.8509649 0.5427073
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# plot(model3_down_rf)
# plot(model3_down_rf, print.thres = 0.5, type = "S")
final3_down_rf <- data.frame(Observado = test$StatusC9,
predict(model3_down_rf, newdata = test, type = "prob"))
final3_down_rf$predict <- ifelse(final3_down_rf$Extravasando > 0.5, "Extravasando", "Normal")
final3_down_rf$predict <- as.factor(final3_down_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm3_down_rf <- caret::confusionMatrix(final3_down_rf$predict, test$StatusC9)
end.time <- Sys.time()
cm3_down_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 9 6
## Normal 0 48
##
## Accuracy : 0.9048
## 95% CI : (0.8041, 0.9642)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.18596
##
## Kappa : 0.6957
##
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 1.0000
## Specificity : 0.8889
## Pos Pred Value : 0.6000
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.2381
## Balanced Accuracy : 0.9444
##
## 'Positive' Class : Extravasando
##
RF model with oversampling.
# RF 'up' m3 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "up")
model3_up_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model3_up_rf
## Random Forest
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using up-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9117836 0.5955454
## 7 0.9167836 0.6175197
## 12 0.9223099 0.6450192
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
# plot(model3_up_rf)
# plot(model3_up_rf, print.thres = 0.5, type = "S")
final3_up_rf <- data.frame(Observado = test$StatusC9,
predict(model3_up_rf, newdata = test, type = "prob"))
final3_up_rf$predict <- ifelse(final3_up_rf$Extravasando > 0.5, "Extravasando", "Normal")
final3_up_rf$predict <- as.factor(final3_up_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm3_up_rf <- caret::confusionMatrix(final3_up_rf$predict, test$StatusC9)
cm3_up_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 5 2
## Normal 4 52
##
## Accuracy : 0.9048
## 95% CI : (0.8041, 0.9642)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.1860
##
## Kappa : 0.5714
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.55556
## Specificity : 0.96296
## Pos Pred Value : 0.71429
## Neg Pred Value : 0.92857
## Prevalence : 0.14286
## Detection Rate : 0.07937
## Detection Prevalence : 0.11111
## Balanced Accuracy : 0.75926
##
## 'Positive' Class : Extravasando
##
RF model with ROSE.
# RF 'rose' m3 ----
{
set.seed(1)
index <- caret::createDataPartition(modelo3$StatusC9, p = 0.75, list = FALSE)
train <- modelo3[index, ]
test <- modelo3[-index, ]
ctrl <- caret::trainControl(method = "cv",
number = 10,
classProbs = T,
savePredictions = T,
verboseIter = FALSE,
sampling = "rose")
model3_r_rf <- caret::train(StatusC9 ~ .,
data = train,
method = "rf",
preProcess = c("scale", "center"),
metric = "Accuracy",
trControl = ctrl,
ntree = 1500)
}
model3_r_rf
## Random Forest
##
## 193 samples
## 12 predictor
## 2 classes: 'Extravasando', 'Normal'
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 174, 173, 174, 173, 173, 174, ...
## Addtional sampling using ROSE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8917836 0.5552787
## 7 0.9167836 0.6088947
## 12 0.8873392 0.6265651
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
# plot(model3_r_rf)
# plot(model3_r_rf, print.thres = 0.5, type = "S")
final3_r_rf <- data.frame(Observado = test$StatusC9,
predict(model3_r_rf, newdata = test, type = "prob"))
final3_r_rf$predict <- ifelse(final3_r_rf$Extravasando > 0.5, "Extravasando", "Normal")
final3_r_rf$predict <- as.factor(final3_r_rf$predict)
test$StatusC9 <- as.factor(test$StatusC9)
cm3_r_rf <- caret::confusionMatrix(final3_r_rf$predict, test$StatusC9)
cm3_r_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 7 2
## Normal 2 52
##
## Accuracy : 0.9365
## 95% CI : (0.8453, 0.9824)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.04297
##
## Kappa : 0.7407
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.7778
## Specificity : 0.9630
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.9630
## Prevalence : 0.1429
## Detection Rate : 0.1111
## Detection Prevalence : 0.1429
## Balanced Accuracy : 0.8704
##
## 'Positive' Class : Extravasando
##
To better understand the results obtained by the models, and with the aim of making the best choice among them, the exploratory analysis of the problem was revisited, in order to better judge the balance of the computed accuracy indices and the relative weight of the confusion-matrix scores.
# Descriptive summary of the data
# Days
length(df_ch$Vol01Dia) # 257 observed days in total
nrow(df_ch %>% dplyr::filter(Vol01Dia > 0)) # 154 days with rainfall
154/257 # ~0.6
nrow(df_ch %>% dplyr::filter(Vol01Dia > 2)) # 93 days with rainfall above 2 mm
93/257 # ~0.36
nrow(df_ch %>% dplyr::filter(Vol01Dia > 10)) # 40 days with rainfall above 10 mm
40/257 # ~0.15
nrow(df_ch %>% dplyr::filter(Vol01Dia > 15)) # 29 days with rainfall above 15 mm
29/257 # ~0.11
nrow(df_ch %>% dplyr::filter(Vol01Dia > 25)) # 14 days with rainfall above 25 mm
14/257 # ~0.05
nrow(df_ch %>% dplyr::filter(Vol01Dia > 50)) # 5 days with rainfall above 50 mm
5/257 # ~0.02
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 1 & Vol01Dia > 0))
# 19
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 2 & Vol01Dia > 0))
# 12
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 3 & Vol01Dia > 0))
# 14
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 4 & Vol01Dia > 0))
# 24
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 5 & Vol01Dia > 0))
# 27
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 6 & Vol01Dia > 0))
# 19
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 7 & Vol01Dia > 0))
# 17
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 8 & Vol01Dia > 0))
# 18
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 9 & Vol01Dia > 0))
# 4
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 1 & Vol01Dia > 10))
# 6
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 2 & Vol01Dia > 10))
# 1
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 3 & Vol01Dia > 10))
# 5
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 4 & Vol01Dia > 10))
# 7
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 5 & Vol01Dia > 10))
# 9
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 6 & Vol01Dia > 10))
# 10
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 7 & Vol01Dia > 10))
# 0
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 8 & Vol01Dia > 10))
# 2
nrow(df_ch %>% dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
dplyr::filter(Mes == 9 & Vol01Dia > 10))
# 0
nrow(df %>% dplyr::filter(StatusC9 == "Extravasando")) # 37 days with overflow occurrences
37/257 # 0.14
nrow(df %>% dplyr::filter(StatusC9 == "Normal")) # 220 days of normal operation
220/257 # 0.86
nrow(df %>% dplyr::filter(Vol01Dia > 0 & StatusC9 == "Extravasando"))
# 34 overflows occurred in the presence of any amount of rainfall
34/154 # 0.22
nrow(df %>% dplyr::filter(Vol01Dia >= 2 & StatusC9 == "Extravasando"))
# 31 overflows occurred with rainfall of 2 mm or more
31/93 # 0.3333
nrow(df %>% dplyr::filter(Vol01Dia >= 5 & StatusC9 == "Extravasando"))
# 20 overflows occurred with rainfall of 5 mm or more
20/40 # 0.5
nrow(df %>% dplyr::filter(Vol01Dia >= 15 & StatusC9 == "Extravasando"))
# 9 overflows occurred with rainfall of 15 mm or more
9/29 # 0.31
nrow(df %>% dplyr::filter(Vol01Dia >= 25 & StatusC9 == "Extravasando"))
# 6 overflows occurred with rainfall of 25 mm or more
6/14 # 0.43
nrow(df %>% dplyr::filter(Vol01Dia >= 50 & StatusC9 == "Extravasando"))
# 3 overflows occurred with rainfall of 50 mm or more
3/5 # 0.6
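As a side note, the repeated per-month filters above can be collapsed into a single grouped summary (a sketch using the same df_ch columns):
# Sketch: rainy days (> 0 mm) and heavy-rain days (> 10 mm) per month
df_ch %>%
  dplyr::mutate(Mes = lubridate::month(Periodo)) %>%
  dplyr::group_by(Mes) %>%
  dplyr::summarise(dias_chuva = sum(Vol01Dia > 0),
                   dias_chuva_10mm = sum(Vol01Dia > 10))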
Having chosen the best models, all accuracy-related metrics are plotted together so that the best model can, tentatively, be indicated.
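The chunks below build one data frame of metrics per chosen model; a compact helper doing the same extraction from any caret confusionMatrix would look like this (a sketch only; the explicit frames follow):
# Sketch: collect the plotted metrics from a confusionMatrix object
collect_metrics <- function(cm, cenario, resamp) {
  data.frame(Acuracia = cm$overall["Accuracy"],
             kappa = cm$overall["Kappa"],
             Sentit = cm$byClass["Sensitivity"],
             Especi = cm$byClass["Specificity"],
             F1 = cm$byClass["F1"],
             Cenario = cenario,
             resamp = resamp)
}
# e.g. collect_metrics(cm1_up_rf, 1, "over")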
# RF oversampling model - scenario 1 ----
cm1_up_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extravasando Normal
## Extravasando 8 3
## Normal 1 51
##
## Accuracy : 0.9365
## 95% CI : (0.8453, 0.9824)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.04297
##
## Kappa : 0.7627
##
## Mcnemar's Test P-Value : 0.61708
##
## Sensitivity : 0.8889
## Specificity : 0.9444
## Pos Pred Value : 0.7273
## Neg Pred Value : 0.9808
## Prevalence : 0.1429
## Detection Rate : 0.1270
## Detection Prevalence : 0.1746
## Balanced Accuracy : 0.9167
##
## 'Positive' Class : Extravasando
##
df_rf_c1_ov <- data.frame(Acuracia = cm1_up_rf$overall[1],
kappa = cm1_up_rf$overall[2],
Sentit = cm1_up_rf$byClass[1],
Especi = cm1_up_rf$byClass[2],
F1 = cm1_up_rf$byClass[7])
# Computing the ORSS index
# Function for the ORSS (odds ratio skill score, Yule's Q)
orss <- function(x){
# caret::confusionMatrix tables are Prediction x Reference; expanding
# them with as.data.frame() (column-major) yields the four cells below
a <- as.data.frame(x$table)[1,3] # true positives (hits)
b <- as.data.frame(x$table)[2,3] # false negatives (misses)
c <- as.data.frame(x$table)[3,3] # false positives (false alarms)
d <- as.data.frame(x$table)[4,3] # true negatives
# ORSS = (ad - bc) / (ad + bc)
result <- round(((a*d) - (b*c)) / ((a*d) + (b*c)), 3)
return(as.numeric(result))
}
#
orss_rf_c1_op <- orss(cm1_up_rf)
df_rf_c1_ov <- df_rf_c1_ov %>%
dplyr::mutate(ORSS = orss_rf_c1_op,
Cenario = 1,
resamp = "over")
# RF undersampling model - scenario 3 ----
df_rf_c3_un <- data.frame(Acuracia = cm3_down_rf$overall[1],
kappa = cm3_down_rf$overall[2],
Sentit = cm3_down_rf$byClass[1],
Especi = cm3_down_rf$byClass[2],
F1 = cm3_down_rf$byClass[7])
orss_rf_c3_un <- orss(cm3_down_rf)
df_rf_c3_un <- df_rf_c3_un %>%
dplyr::mutate(ORSS = orss_rf_c3_un,
Cenario = 3,
resamp = "under")
# RF ROSE model - scenario 3 ----
df_rf_c3_rose <- data.frame(Acuracia = cm3_r_rf$overall[1],
kappa = cm3_r_rf$overall[2],
Sentit = cm3_r_rf$byClass[1],
Especi = cm3_r_rf$byClass[2],
F1 = cm3_r_rf$byClass[7])
orss_rf_c3_rose <- orss(cm3_r_rf)
df_rf_c3_rose <- df_rf_c3_rose %>%
dplyr::mutate(ORSS = orss_rf_c3_rose,
Cenario = 3,
resamp = "ROSE")
models <- df_rf_c1_ov %>%
dplyr::bind_rows(df_rf_c3_un, df_rf_c3_rose) %>%
reshape2::melt(id.vars = c("Cenario", "resamp")) %>%
dplyr::rename("Metrica" = variable,
"Valor" = value) %>%
dplyr::mutate(Cenario = as.factor(Cenario),
resamp = as.factor(resamp))
levels(models$Cenario) <- c("Cenário 1", "Cenário 3")
levels(models$resamp) <- c("Oversampling", "ROSE", "Undersampling")
levels(models$Metrica) <- c("Acurácia", "Kappa", "Sensibilidade",
"Especificidade", "F1", "ORSS")
pmodelo <- ggplot2::ggplot(models) +
geom_point(aes(x = Metrica,
y = Valor,
fill = resamp),
shape = 21, alpha = 0.7, stroke = 1.5,
size = 5) +
facet_wrap(~Cenario, nrow = 1) +
scale_fill_manual(name = "Legenda",
values = wesanderson::wes_palette("Darjeeling1")) +
#scale_y_continuous(limits = c(0, 1)) +
guides(fill = guide_legend(override.aes = list(size = 4))) +
xlab("Métrica") + ylab("Valor") +
temag + theme(axis.text.x = element_text(angle = 90),
legend.position = "bottom",
strip.text = element_text(size = 20, face = "bold"))
p_save_modeloc <- pmodelo
pmodelo
plotly::ggplotly(pmodelo)
Regarding the RF model in scenario 3 with undersampling, the elapsed processing time:
time.taken <- end.time - start.time
paste0("O tempo de processamento foi de :", time.taken, "s .")
## [1] "O tempo de processamento foi de :3.27318787574768s ."
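Since subtracting two Sys.time() values yields a difftime whose units R picks automatically, pinning the units makes the printed message robust (a small side sketch):
time.taken.s <- difftime(end.time, start.time, units = "secs")
paste0("Processing time: ", round(as.numeric(time.taken.s), 2), " s.")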
dis_chu <- chuva %>%
dplyr::select(-Periodo, -Mes)
p1 <- ggplot2::ggplot(dis_chu) +
geom_histogram(aes(x = Vol01Dia, y = ..density..),
bins = 19,
colour = "black") +
geom_density(aes(x = Vol01Dia),
alpha = .2, fill = "#FF6666") +
scale_fill_grey() +
xlab("Volume Diário de Chuva Acumulada [mm]") +
ylab("Densidade") +
tema
levels(chuva$Mes) <- c("JAN", "FEV", "MAR", "ABR", "MAI",
"JUN", "JUL", "AGO", "SET")
p2 <- ggplot2::ggplot(chuva,
aes(x = Vol01Dia, y = Mes)) +
geom_density_ridges_gradient(aes(fill = ..x..),
scale = 3,
size = 0.3) +
scale_fill_gradient(low = "gray48", high = "gray23", name = "Chuva [mm]") +
xlab("Índice Pluviométrico Diário [mm]") + ylab("Mês") +
theme_ridges() +
tema + theme(legend.position = "bottom")
# cowplot::plot_grid(p1, p2, align = "vh", labels = c('(A)', '(B)'))
p1 <- ggplot2::ggplot(Datax_m,
aes(x = CET_09, y = Mes, fill = ..x..)) +
geom_density_ridges_gradient(scale = 3,
size = 0.3) +
scale_x_continuous(expand = c(0.01, 0)) +
scale_y_discrete(expand = c(0.01, 0)) +
scale_fill_gradient(low = "gray48", high = "gray23", name = "Nível %") +
xlab("Nível CET 09") + ylab("Mês") +
theme_ridges() +
tema + theme(legend.position = "bottom")
psave1 <- cowplot::plot_grid(p2, p1, align = "vh", labels = c('(A)', '(B)'))
psave1
niv_cmb <- Datax %>%
dplyr::select(-PUMP_09) %>%
reshape2::melt(id.vars = "Periodo")
levels(niv_cmb$variable) <- "CET 09"
p1 <- ggplot2::ggplot(niv_cmb) +
geom_violin(aes(x = variable,
y = value),
col = "black", alpha = 0.85, fill = "gray13") +
ylab("% Nível") +
tema + theme(axis.title.x = element_blank(),
axis.ticks.x = element_blank())
niv_cmb <- Datax %>%
dplyr::select(-CET_09) %>%
reshape2::melt(id.vars = "Periodo")
levels(niv_cmb$variable) <- "CMB CET 09"
p2 <- ggplot2::ggplot(niv_cmb) +
geom_violin(aes(x = variable,
y = value),
col = "black", alpha = 0.85, fill = "gray13") +
scale_y_continuous(breaks = seq(0, 2, 1),
labels = c("OFF", "1 ON", "2 ON")) +
ylab("CMB") +
tema + theme(axis.title.x = element_blank(),
axis.ticks.x = element_blank())
p2
cowplot::plot_grid(p1, p2, align = "vh", labels = c('(A)', '(B)'))
RF model with the scenario 3 variables and undersampling.
# Variable importance
# Gini index ----
# caret::varImp rescales the randomForest importance (mean decrease in Gini) to 0-100
plot(varImp(model3_down_rf))
gini <- varImp(model3_down_rf)$importance %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
dplyr::arrange(Overall) %>%
dplyr::mutate(rowname = forcats::fct_inorder(rowname))
levels(gini$rowname) <- c("Chuva (Vci-1) [mm]", "Chuva (Vci-1 + Vci-2 + Vci-3) [mm]",
"Dif. Niv. Diário", # Range_c9
"DP % Nível/dia", # standard deviation
"Tempo de Op. do CMB/dia", "Chuva (Vc i-2) [mm]",
"Chuva (Vci-1 + Vci-2) [mm]",
"Vol. de Extra. [m³/dia]", # daily overflow volume
"Tempo Extra. [min/dia]", # time in overflow [min/day]
"Mediana Nível %",
"Máx. % Nível/dia", "Máx. %Nível 23hr")
gini <- ggplot2::ggplot(gini) +
geom_col(aes(x = rowname,
y = Overall,
fill = rowname),
col = "black", show.legend = F) +
coord_flip() +
scale_fill_grey() +
xlab("Variáveis") +
ggtitle("Random Forest - Cenário 3 com subamostragem") +
temag + theme(plot.title = element_text(color = "black", size = 22, face = "bold"))
gini
# ROC curve - AUC ----
# levels(model3_down_rf$pred$pred)
roc1 <- data.frame(preditor = model3_down_rf$pred$pred) %>%
dplyr::mutate(preditor = ifelse(preditor == "Extravasando", 1, 0))
# levels(model3_down_rf$pred$obs)
roc2 <- data.frame(Obs = model3_down_rf$pred$obs) %>%
dplyr::mutate(Obs = ifelse(Obs == "Extravasando", 1, 0))
# pROC::roc takes the observed response first and the predictor second
myRoc <- pROC::roc(response = roc2$Obs, predictor = roc1$preditor)
# plot(myRoc)
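A ROC built from the cross-validated class probabilities (stored because classProbs and savePredictions were enabled) is usually more informative than one built from hard 0/1 predictions; a sketch assuming the pROC package, restricted to the winning mtry:
best_pred <- dplyr::filter(model3_down_rf$pred, mtry == model3_down_rf$bestTune$mtry)
roc_prob <- pROC::roc(response = best_pred$obs,
                      predictor = best_pred$Extravasando,
                      levels = c("Normal", "Extravasando"), direction = "<")
pROC::auc(roc_prob)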
# Random forest errors across trees (OOB) -----
# plot(model3_down_rf$finalModel, main = "OOB - RF com undersampling")
ccc <- model3_down_rf$finalModel
ccc <- as.data.frame(ccc[["err.rate"]])
df_oob <- ccc %>%
dplyr::mutate(tree = 1:1500) %>%
reshape2::melt(id.vars = "tree")
# OOB error with ggplot2
plot_oob <- ggplot2::ggplot(df_oob) +
geom_line(aes(x = tree, y = value, col = variable),
size = 1) +
xlab("N° Árvores") + ylab("Erro") +
labs(col = "Legenda") +
tema + theme(legend.position = "bottom")
plot_oob
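For reference, the OOB error of the final forest at the last tree (overall and per class) can be read directly from the underlying randomForest object:
tail(model3_down_rf$finalModel$err.rate, 1) # columns: OOB, Extravasando, Normal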
# Plot incorporating the model into the level series ----
#
final_model3_down_rf <- data.frame(Observado = modelo3$StatusC9,
predict(model3_down_rf, newdata = modelo3,
type = "prob"))
final_model3_down_rf$predict <- ifelse(final_model3_down_rf$Extravasando > 0.5,
"Extravasando", "Normal")
final_model3_down_rf <- final_model3_down_rf %>%
dplyr::mutate(Prob = ifelse(predict == "Extravasando", Extravasando,
ifelse(predict == "Normal", Normal, NA)),
Acertos = as.factor(ifelse(predict == Observado, "Correta", "Incorreta")),
Periodo = as.Date(df_modelo$Periodo),
Mediana_nivel = df_modelo$MEDIANA_CET09)
pprob <- ggplot2::ggplot(final_model3_down_rf) +
geom_hline(aes(yintercept = 0.5), col = "red") +
geom_point(aes(x = Periodo,
y = Prob,
fill = Acertos),
shape = 21, stroke = 0.2,
alpha = 0.7, size = 3) +
scale_y_continuous(limits = c(0, 1)) +
scale_fill_manual(values = pals::tol(n = 2),
name = "Legenda - Predição") +
# scale_fill_manual(values = pals::tol(n = 2),
# name = "Legenda - Predição",
# labels = c("Incorreta", "Correta")) +
guides(fill = guide_legend(override.aes = list(size = 2))) +
ylab("Probalidade") + xlab("Período em Análise") +
ggtitle("Aplicação do Modelo") +
tema + theme(legend.position = "bottom")
plotly::ggplotly(pprob)
px2 <- px2 + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
px <- cowplot::plot_grid(px2, pprob,
align = "hv", ncol = 1)
px
pprob <- ggplot2::ggplot(final_model3_down_rf) +
geom_hline(aes(yintercept = 0.5), col = "black", linetype = "twodash") +
geom_hline(aes(yintercept = 0.95), col = "red", linetype = "dashed") +
geom_line(aes(x = Periodo,
y = Mediana_nivel/100, col = "% Nível"),
alpha = 0.4, size = 1) +
geom_point(aes(x = Periodo,
y = Prob,
fill = Acertos),
shape = 21, stroke = 0.2,
alpha = 0.7, size = 3) +
scale_y_continuous(limits = c(0, 1),
sec.axis = sec_axis(~ . * 100,
name = "% Mediana Nível")) +
scale_fill_manual(values = pals::tol(n = 2),
labels = c("Predição\n Correta", "Predição\n Incorreta")) +
scale_color_manual(values = "black", labels = "% Nível") +
guides(fill = guide_legend(override.aes = list(size = 2.3))) +
labs(fill = "Legenda:", col = "") +
ylab("Probalidade Predita") + xlab("Período em Análise") +
ggtitle("") +
tema + theme(legend.position = "bottom")
pprob_save <- pprob
pprob
final2_down_rf <- data.frame(Observado = modelo2$StatusC9,
predict(model2_down_rf, newdata = modelo2, type = "prob"))
final2_down_rf$predict <- ifelse(final2_down_rf$Extravasando > 0.5, "Extravasando", "Normal")
final2_down_rf <- final2_down_rf %>%
dplyr::mutate(Prob = ifelse(predict == "Extravasando", Extravasando,
ifelse(predict == "Normal", Normal, NA)),
Acertos = as.factor(ifelse(predict == Observado, "Correta", "Incorreta")),
Periodo = as.Date(df_modelo$Periodo),
Mediana_nivel = df_modelo$MEDIANA_CET09)
pprob <- ggplot2::ggplot(final2_down_rf) +
geom_hline(aes(yintercept = 0.5), col = "red") +
geom_point(aes(x = Periodo,
y = Prob,
fill = Acertos),
shape = 21, stroke = 0.2,
alpha = 0.7, size = 3) +
scale_y_continuous(limits = c(0, 1)) +
scale_fill_manual(values = pals::tol(n = 2),
name = "Legenda - Predição") +
guides(fill = guide_legend(override.aes = list(size = 2))) +
ylab("Probalidade") + xlab("Período em Análise") +
ggtitle("Aplicação do Modelo") +
tema + theme(legend.position = "bottom")
plotly::ggplotly(pprob)
# final3_down_rf
# Model with the best hit rate
melhor_modelo <- final3_down_rf %>%
dplyr::mutate(id = 1:length(final3_down_rf$Observado),
Predicao = ifelse(predict == Observado, "Acerto", "Erro")) %>%
dplyr::select(-predict, -Normal) %>%
reshape2::melt(id.vars = c("id", "Observado", "Predicao"))
ggplot2::ggplot(melhor_modelo) +
geom_point(aes(x = id,
y = value,
col = Predicao)) +
facet_grid(Predicao~Observado) +
ylab("Probabilidade") + xlab("Observação") +
ggtitle(" Predição\n Apenas informação da Odds. Extravasamento") +
tema + theme(legend.position = "bottom")
melhor_modelo <- melhor_modelo %>%
dplyr::filter(Observado == "Extravasando" & Predicao == "Acerto")
binz <- 1.32*log10(nrow(melhor_modelo)) # bin-count heuristic (computed but not used by the density plot below)
ggplot2::ggplot(melhor_modelo) +
geom_density(aes(x = value),
alpha = .5,
fill = "#FF6666") +
xlab("Valor da Probabilidade\n de Extravasamento") +
ylab("Densidade") +
tema
p_save_pump <- p_save_pump + theme(strip.text = element_text(size = 16, face = "bold"),
axis.text.x = element_text(angle = 90))
p_save_pump
chuva_mensal
p_save_pump <- cowplot::plot_grid(p_save_pump, chuva_mensal,
align = "hv", labels = c('(A)', '(B)'))
# Chunk for saving the images
# psave1
ggsave(file = "distr_densi.png",
psave1,
width = 17,
height = 7,
dpi = 700)
# psave_3
ggsave(file = "chuv_op_niv.png",
psave_3,
width = 15,
height = 8,
dpi = 700)
# psave_lim
ggsave(file = "limiar.png",
psave_lim,
width = 19,
height = 7,
dpi = 700)
# p_save_modeloc
ggsave(file = "p_save_modeloc.png",
p_save_modeloc,
width = 15,
height = 7,
dpi = 700)
# p_save_pump
ggsave(file = "p_save_pump.png",
p_save_pump,
width = 15,
height = 7,
dpi = 700)
# gini
ggsave(file = "p_save_gini.png",
gini,
width = 17,
height = 8,
dpi = 700)
ggsave(file = "plot_oob.png",
plot_oob,
width = 13,
height = 9,
dpi = 700)
ggsave(file = "plot_fre_ovfl.png",
plot_fre_ovfl,
width = 13,
height = 7,
dpi = 700)
ggsave(file = "pprob_save.png",
pprob_save,
width = 13,
height = 7,
dpi = 700)
The authors thank the Coordenação de Aperfeiçoamento de Pessoal de Nível Superior (CAPES) and the Conselho Nacional de Desenvolvimento Científico e Tecnológico (CNPq) for funding this research, and the GAMMA group for the fruitful discussions about this work.