Introducción

Este documento hace parte del trabajo del curso de Analítica Predictiva de la Universidad Nacional de Colombia para la Maestría en Ingeniería y Especialización en Analítica. El alcance de este es la creación de modelos predictivos para la estimación de cantidad de accidentes en el valle del aburra a nivel diario.

Carga de librerias

# Manipulación de datos
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Modelado de datos
library(recipes)

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(Metrics)
library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.0-2

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:Metrics':
## 
##     precision, recall

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Visualización
library(ggplot2)

# Formato tablas
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

Carga de datos

df_train <- read.csv('../../data/processed/train_data.csv')
df_test <- read.csv('../../data/processed/test_data.csv')
special_dates_train <- read.csv('../../data/processed/special_date_monthly_2017.csv')
special_dates_test <- read.csv('../../data/processed/special_date_monthly_2018.csv')

kable(head(df_train)) %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  scroll_box(width = "100%")

X	DIA	PERIODO	CLASE	DIRECCION	DIRECCION_ENC	CBML	TIPO_GEOCOD	GRAVEDAD	BARRIO	COMUNA	DISENO	DIA_NOMBRE	MES	LONGITUD	LATITUD	FECHA
1	1	2014	choque	cr 63 cl 94	cr 063 094 000 00000		no ubicada	solo danos			tramo de via	miercoles	1	-75.70382	6.221806	2014-01-01 19:00:00
2	1	2014	choque	cl 30 cr 66 b	cl 030 066 b 000 00000	1602	malla vial	solo danos	rosales	belen	interseccion	miercoles	1	-75.58727	6.231716	2014-01-01 07:40:00
3	1	2014	choque	cr 52 cl 97	cr 052 097 000 00000	0402	malla vial	solo danos	san isidro	aranjuez	interseccion	miercoles	1	-75.56253	6.289907	2014-01-01 05:30:00
4	1	2014	choque	tv 78 cl 65	tv 078 065 000 00000	0519	malla vial	solo danos	el progreso	castilla	tramo de via	miercoles	1	-75.57365	6.275473	2014-01-01 13:50:00
5	1	2014	otro	cr 63 cl 50	cr 063 050 000 00000	1101	malla vial	solo danos	carlos e. restrepo	laureles estadio	tramo de via	miercoles	1	-75.57697	6.255457	2014-01-01 07:25:00
6	1	2014	choque	cr 57 cl 51	cr 057 051 000 00000	1006	malla vial	solo danos	san benito	la candelaria	tramo de via	miercoles	1	-75.57481	6.254322	2014-01-01 04:15:00

Armado del conjunto de datos

Debido a que se deben armar ciertas agregaciones a nivel de conjunto de datos antes de crear los datasets de entrenamiento, se incluye una etapa de armado del conjunto de variables, no está asociado directamente con la limpieza de los datos. La base que se construye a continuación serán los cimientos para el modelo el modelo diario.

df <- rbind(
  df_train,
  df_test
)
special_dates_train <- special_dates_train %>% 
  dplyr::select(PERIODO, MES, DIA, DIA_FESTIVO, FECHA_ESPECIAL, FESTIVO_FECHA_ESPECIAL)
special_dates_test <- special_dates_test %>% 
  dplyr::select(PERIODO, MES, DIA, DIA_FESTIVO, FECHA_ESPECIAL, FESTIVO_FECHA_ESPECIAL)

special_dates <- rbind(
  special_dates_train,
  special_dates_test
) %>% distinct()

df <- merge(df, special_dates, by = c("PERIODO", "MES", "DIA"))

kable(head(df)) %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  scroll_box(width = "100%")

PERIODO	MES	DIA	X	CLASE	DIRECCION	DIRECCION_ENC	CBML	TIPO_GEOCOD	GRAVEDAD	BARRIO	COMUNA	DISENO	DIA_NOMBRE	LONGITUD	LATITUD	FECHA	DIA_FESTIVO	FECHA_ESPECIAL	FESTIVO_FECHA_ESPECIAL
2014	1	1	1	choque	cr 63 cl 94	cr 063 094 000 00000		no ubicada	solo danos			tramo de via	miercoles	-75.70382	6.221806	2014-01-01 19:00:00	1	1	1
2014	1	1	2	choque	cl 30 cr 66 b	cl 030 066 b 000 00000	1602	malla vial	solo danos	rosales	belen	interseccion	miercoles	-75.58727	6.231716	2014-01-01 07:40:00	1	1	1
2014	1	1	3	choque	cr 52 cl 97	cr 052 097 000 00000	0402	malla vial	solo danos	san isidro	aranjuez	interseccion	miercoles	-75.56253	6.289907	2014-01-01 05:30:00	1	1	1
2014	1	1	4	choque	tv 78 cl 65	tv 078 065 000 00000	0519	malla vial	solo danos	el progreso	castilla	tramo de via	miercoles	-75.57365	6.275473	2014-01-01 13:50:00	1	1	1
2014	1	1	5	otro	cr 63 cl 50	cr 063 050 000 00000	1101	malla vial	solo danos	carlos e. restrepo	laureles estadio	tramo de via	miercoles	-75.57697	6.255457	2014-01-01 07:25:00	1	1	1
2014	1	1	6	choque	cr 57 cl 51	cr 057 051 000 00000	1006	malla vial	solo danos	san benito	la candelaria	tramo de via	miercoles	-75.57481	6.254322	2014-01-01 04:15:00	1	1	1

Modelado diario

Clase choque

Preprocesamiento de datos

armado_dataset_diario <- function(df, objective_var){
  # conjunto de datos diario por clase
  accidentes_por_clase <- df %>% 
    group_by(PERIODO, MES, DIA, CLASE) %>% 
    summarise(n_accidentes=n(), .groups="drop") %>% 
    spread(CLASE, n_accidentes) %>%
    mutate(PERIODO_LEAD=lead(PERIODO), MES_LEAD=lead(MES), DIA_LEAD=lead(DIA), rank = 1:length(PERIODO)) %>%
    filter(rank < max(rank)) %>%
    dplyr::select(-c(PERIODO, MES, DIA, rank)) 
  
  # conjunto de datos dia por gravedad
  accidentes_por_gravedad <- df %>% 
    group_by(PERIODO, MES, DIA, GRAVEDAD) %>%
    summarise(n_accidentes=n(), .groups="drop") %>% 
    spread(GRAVEDAD, n_accidentes) %>%
    mutate(PERIODO_LEAD=lead(PERIODO), MES_LEAD=lead(MES), DIA_LEAD=lead(DIA), rank = 1:length(PERIODO)) %>%
    filter(rank < max(rank)) %>%
    dplyr::select(-c(PERIODO, MES, DIA, rank))
  
  accidentes_por_hora <- df %>% mutate(HORA=hour(FECHA)) %>%
    mutate(
      temprano=ifelse(HORA %in% c(1,2,3,4,5,6), 1, 0),
      temprano_trabajo=ifelse(HORA %in% c(7,8,9,10,11,12), 1,0),
      almuerzo=ifelse(HORA == 13, 1, 0),
      tarde=ifelse(HORA %in% c(14,15,16,17,18), 1, 0),
      noche=ifelse(HORA %in% c(19, 20, 21, 22, 23, 24), 1,0)
      ) %>% 
    group_by(PERIODO, MES, DIA) %>% 
    summarise(
      accidentes_temprano=sum(temprano),
      accidentes_temprano_trabajo=sum(temprano_trabajo),
      accidentes_almuerzo=sum(almuerzo),
      accidentes_tarde=sum(tarde),
      accidentes_noche=sum(noche),
      .groups="drop"
    ) %>% 
  mutate(PERIODO_LEAD=lead(PERIODO), MES_LEAD=lead(MES), DIA_LEAD=lead(DIA), rank = 1:length(PERIODO)) %>%
    filter(rank < max(rank)) %>%
    dplyr::select(-c(PERIODO, MES, DIA, rank))
  
  # rezagos de la clase
  rezagos_accidentes <- df %>% 
    filter(CLASE == objective_var) %>% 
    group_by(PERIODO, MES, DIA) %>%
    summarise(n_accidentes=n(), .groups="drop") %>%
    mutate(
      FECHA=as.Date(paste(PERIODO, formatC(MES, width = 2, flag = "0"), formatC(DIA, width = 2, flag = "0"), sep='-')),
      t_minus_1=lag(n_accidentes, n = 1),
      t_minus_2=lag(n_accidentes, n = 2),
      t_minus_3=lag(n_accidentes, n = 3),
      t_minus_4=lag(n_accidentes, n = 4),
      t_minus_5=lag(n_accidentes, n = 5),
      t_minus_6=lag(n_accidentes, n = 6),
    ) %>% 
    dplyr::select(PERIODO, MES, DIA, t_minus_2, t_minus_3, t_minus_4, t_minus_5, t_minus_6) 
  
  # Accidentes en los domingos
  domingos <- df %>% 
    mutate(domingo = ifelse(DIA_NOMBRE == "domingo  ", 1, 0)) %>% 
    group_by(PERIODO, MES, DIA) %>% 
    summarise(accidentes_domingo=sum(domingo), .groups="drop") %>% 
    mutate(PERIODO_LEAD=lead(PERIODO), MES_LEAD=lead(MES), DIA_LEAD=lead(DIA), rank = 1:length(PERIODO)) %>%
    filter(rank < max(rank)) %>% 
    dplyr::select(-c(PERIODO, MES, DIA, rank)) 
  
  # Union del df
  df_processed <- df %>%
    filter(CLASE == objective_var) %>%
    group_by(PERIODO, MES, DIA) %>% 
    summarise(
      numero_accidentes = n(),
      .groups="drop"
    ) %>% 
    mutate(
      DIA=factor(DIA),
      MES=factor(MES)
      ) %>%
    merge(accidentes_por_clase,
          by.x = c("PERIODO", "MES", "DIA"), by.y = c("PERIODO_LEAD", "MES_LEAD", "DIA_LEAD"), all.x = T) %>%
    merge(accidentes_por_gravedad, 
          by.x = c("PERIODO", "MES", "DIA"), by.y = c("PERIODO_LEAD", "MES_LEAD", "DIA_LEAD"), all.x = T) %>%
    merge(rezagos_accidentes,
          by.x = c("PERIODO", "MES", "DIA"), by.y = c("PERIODO", "MES", "DIA"), all.x = T) %>% 
    merge(domingos, 
          by.x = c("PERIODO", "MES", "DIA"), by.y = c("PERIODO_LEAD", "MES_LEAD", "DIA_LEAD"), all.x = T) %>%
    merge(special_dates,
          by = c("PERIODO", "MES", "DIA")) %>% 
    merge(accidentes_por_hora,
          by.x = c("PERIODO", "MES", "DIA"), by.y = c("PERIODO_LEAD", "MES_LEAD", "DIA_LEAD"), all.x = T) %>%
    mutate(
      DIA_FESTIVO=factor(DIA_FESTIVO),
      FECHA_ESPECIAL=factor(FECHA_ESPECIAL),
      FESTIVO_FECHA_ESPECIAL=factor(FESTIVO_FECHA_ESPECIAL)
      ) %>% 
    arrange(PERIODO, MES, DIA, .by_group = T) %>% 
    filter(!is.na(t_minus_6)) %>% 
    replace_na(list(incendio=0, muerto=0, volcamiento=0)) %>% 
    mutate(
      DIA_SEMANA = factor(wday(
        as_date(
          paste(
            PERIODO, formatC(MES, width = 2, flag = 0), formatC(DIA, width = 2, flag = 0), sep='-')
        )
      )
      )
    )
  
  return(df_processed)
}

Se procede a realizar la construcción del conjunto de datos para la clase choque y hacer todo el pipeline de preprocesamiento para implementar el escalado y estandarización al conjunto de entrenamiento y validación.

df_preprocessed <- armado_dataset_diario(df=df, objective_var = "choque")
df_choque_train <- df_preprocessed %>% filter(PERIODO!=2018) %>% dplyr::select(-c(PERIODO, DIA))
df_choque_test <- df_preprocessed %>% filter(PERIODO==2018) %>% dplyr::select(-c(PERIODO, DIA))

model.choque.recipe <- recipe(numero_accidentes ~ ., 
                              data = df_choque_train)
model.choque.steps <- model.choque.recipe %>% 
  step_log(all_outcomes()) %>%
  step_center(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL) %>% 
  step_scale(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL)

model.choque.prepared <- prep(model.choque.steps, training = df_choque_train)
df_choque_train <- bake(model.choque.prepared, df_choque_train)
df_choque_test <- bake(model.choque.prepared, df_choque_test)

Modelamiento

Regresión lineal simple

Como primer intento, vale la pena evaluar un modelo de regresión lineal simple con stepwise variable selection para descartar variables que no entreguen valor.

modelo.choque <- lm(numero_accidentes ~ ., data=df_choque_train)
step.model.choque <- stepAIC(modelo.choque, direction = "both", trace = F, steps = 2000)

model.choque.summary <- summary(step.model.choque)
model.choque.summary

## 
## Call:
## lm(formula = numero_accidentes ~ MES + `caida ocupante` + choque + 
##     otro + t_minus_2 + t_minus_3 + t_minus_4 + t_minus_5 + t_minus_6 + 
##     DIA_FESTIVO + FESTIVO_FECHA_ESPECIAL + DIA_SEMANA, data = df_choque_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03751 -0.08680  0.00437  0.10475  0.82881 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.817669   0.020152 189.440  < 2e-16 ***
## MES2                     0.092665   0.024103   3.845 0.000126 ***
## MES3                     0.100110   0.023348   4.288 1.93e-05 ***
## MES4                     0.087314   0.023050   3.788 0.000158 ***
## MES5                     0.098930   0.023629   4.187 3.00e-05 ***
## MES6                     0.084458   0.022909   3.687 0.000236 ***
## MES7                     0.108122   0.022808   4.741 2.34e-06 ***
## MES8                     0.100754   0.023992   4.200 2.84e-05 ***
## MES9                     0.097070   0.024088   4.030 5.88e-05 ***
## MES10                    0.096701   0.023222   4.164 3.31e-05 ***
## MES11                    0.098616   0.023223   4.246 2.31e-05 ***
## MES12                    0.082832   0.023468   3.530 0.000430 ***
## `caida ocupante`         0.007982   0.004684   1.704 0.088549 .  
## choque                   0.035429   0.006490   5.459 5.63e-08 ***
## otro                     0.007042   0.004704   1.497 0.134572    
## t_minus_2                0.028963   0.006363   4.552 5.76e-06 ***
## t_minus_3                0.019719   0.006395   3.084 0.002085 ** 
## t_minus_4                0.017344   0.006410   2.706 0.006892 ** 
## t_minus_5                0.012954   0.006398   2.025 0.043071 *  
## t_minus_6                0.010533   0.006367   1.654 0.098301 .  
## DIA_FESTIVO1            -0.705518   0.032015 -22.037  < 2e-16 ***
## FESTIVO_FECHA_ESPECIAL1  0.113555   0.041489   2.737 0.006277 ** 
## DIA_SEMANA2              0.642780   0.020598  31.205  < 2e-16 ***
## DIA_SEMANA3              0.652417   0.021727  30.028  < 2e-16 ***
## DIA_SEMANA4              0.598663   0.020491  29.216  < 2e-16 ***
## DIA_SEMANA5              0.568465   0.021173  26.849  < 2e-16 ***
## DIA_SEMANA6              0.654000   0.021215  30.827  < 2e-16 ***
## DIA_SEMANA7              0.490743   0.019805  24.779  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1684 on 1427 degrees of freedom
## Multiple R-squared:  0.7105, Adjusted R-squared:  0.705 
## F-statistic: 129.7 on 27 and 1427 DF,  p-value: < 2.2e-16

plot_estimation_errors <- function(y_true, y_pred, title){
  min_value <- min(y_true, y_pred) - min(y_true, y_pred) * 0.1
  max_value <- max(y_true, y_pred) + max(y_true, y_pred) * 0.1
  
  plot(x=y_true,y=y_pred,
       ylab="Predicciones",xlab="Observados",
       xlim=c(min_value, max_value),ylim=c(min_value, max_value),
       las=1, cex=1, pch=16,
       main=title)
  abline(a=0,b=1,lwd=2,col="black", lty=2)
  R_vl<-cor(y_pred, y_true)
  R_vl<-format(R_vl, digits = 3, nsmall = 3)
  rmse_vl<- rmse(actual = y_true, predicted = y_pred)
  rmse_vl<-format(rmse_vl,digits = 3,nsmall = 2)
  grid()
  legend("topleft",legend=paste0(c("Correlación: ","RMSE: "),c(R_vl,rmse_vl)), bty="n")
}

y_pred <- exp(predict(step.model.choque, df_choque_train))
y_true <- exp(df_choque_train$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal choque ~ Entrenamiento"
  )

De primera instancia, se observa un muy buen ajuste de los datos de entrenamiento respecto al modelo de regresión lineal, sin embargo, para evaluar su capacidad de generalizar es necesario evaluarlo en el conjunto de validación.

y_pred <- exp(predict(step.model.choque, df_choque_test))
y_true <- exp(df_choque_test$numero_accidentes)

test_rmse <- rmse(y_true, y_pred)

test_train_ratio <- round(((test_rmse / train_rmse) - 1)*100, 3)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo mensual semanal ~ Validación"
  )

plot_errors_density <- function(model, train, y_train, test, y_test, title){
  # data extraction
  y_pred <- exp(predict(model, train))
  y_true <- exp(y_train)
  train_error <- y_true - y_pred
  y_pred <- exp(predict(model, test))
  y_true <- exp(y_test)
  test_error <- y_true - y_pred
  #errors dataframe
  errors_df <- data.frame(
    errors = c(train_error, test_error),
    error_type = c(
      rep("train", length(train_error)),
      rep("test", length(test_error))
    )
  )
  # plotting
  ggplot(errors_df, aes(x=errors, color=error_type)) +
    geom_density() + 
    theme_classic() + 
    ggtitle(title) + 
    theme(plot.title = element_text(hjust = 0.5))
}

plot_errors_density(
  model = step.model.choque, 
  train = df_choque_train, 
  y_train = df_choque_train$numero_accidentes,
  test = df_choque_test, 
  y_test = df_choque_test$numero_accidentes,
  title="Errores de entrenamiento y validación clase choque"
  )

Del análisis de errores, es claro que el modelo tiene problemas de subestimación, ya que las estimaciones no superan los valores de aproximadamente 120 choques por día. Sin embargo, viendo justamente los datos, es una proporción pequeña respecto a todo el conjunto de datos, por lo que podrían denotarse como observaciones más atípicas.

Clase atropello

Regresión lineal

df_preprocessed <- armado_dataset_diario(df=df, objective_var = "atropello")
df_atropello_train <- df_preprocessed %>% filter(PERIODO!=2018) %>% dplyr::select(-c(PERIODO, DIA))
df_atropello_test <- df_preprocessed %>% filter(PERIODO==2018) %>% dplyr::select(-c(PERIODO, DIA))

model.atropello.recipe <- recipe(numero_accidentes ~ ., 
                              data = df_atropello_train)
model.atropello.steps <- model.atropello.recipe %>% 
  step_log(all_outcomes()) %>%
  step_center(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL) %>% 
  step_scale(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL)

model.atropello.prepared <- prep(model.atropello.steps, training = df_atropello_train)
df_atropello_train <- bake(model.atropello.prepared, df_atropello_train)
df_atropello_test <- bake(model.atropello.prepared, df_atropello_test)

modelo.atropello <- lm(numero_accidentes ~ ., data=df_atropello_train)
step.model.atropello <- stepAIC(modelo.atropello, direction = "both", trace = F, steps = 2000)

model.atropello.summary <- summary(step.model.atropello)
model.atropello.summary

## 
## Call:
## lm(formula = numero_accidentes ~ `caida ocupante` + otro + volcamiento + 
##     herido + muerto + t_minus_2 + t_minus_3 + t_minus_4 + t_minus_5 + 
##     t_minus_6 + accidentes_domingo + DIA_FESTIVO + accidentes_temprano + 
##     accidentes_almuerzo + DIA_SEMANA, data = df_atropello_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.26655 -0.20056  0.03529  0.23670  1.09328 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.434403   0.034623  70.313  < 2e-16 ***
## `caida ocupante`     0.024239   0.011426   2.121  0.03406 *  
## otro                -0.036975   0.011835  -3.124  0.00182 ** 
## volcamiento         -0.029113   0.010150  -2.868  0.00419 ** 
## herido               0.031515   0.016813   1.874  0.06107 .  
## muerto               0.023293   0.009307   2.503  0.01243 *  
## t_minus_2            0.028537   0.009671   2.951  0.00322 ** 
## t_minus_3            0.014402   0.009681   1.488  0.13707    
## t_minus_4            0.024836   0.009653   2.573  0.01018 *  
## t_minus_5            0.016426   0.009644   1.703  0.08874 .  
## t_minus_6            0.029651   0.009642   3.075  0.00214 ** 
## accidentes_domingo   0.127255   0.059204   2.149  0.03177 *  
## DIA_FESTIVO1        -0.210127   0.043343  -4.848 1.38e-06 ***
## accidentes_temprano  0.034444   0.013456   2.560  0.01058 *  
## accidentes_almuerzo  0.030982   0.013601   2.278  0.02288 *  
## DIA_SEMANA2         -0.293228   0.174862  -1.677  0.09378 .  
## DIA_SEMANA3          0.093196   0.035739   2.608  0.00921 ** 
## DIA_SEMANA4          0.064353   0.035471   1.814  0.06985 .  
## DIA_SEMANA5          0.072045   0.035375   2.037  0.04187 *  
## DIA_SEMANA6          0.160591   0.035983   4.463 8.72e-06 ***
## DIA_SEMANA7          0.230033   0.034915   6.588 6.23e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3488 on 1434 degrees of freedom
## Multiple R-squared:  0.1371, Adjusted R-squared:  0.1251 
## F-statistic: 11.39 on 20 and 1434 DF,  p-value: < 2.2e-16

y_pred <- exp(predict(step.model.atropello, df_atropello_train))
y_true <- exp(df_atropello_train$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal atropello ~ Entrenamiento"
  )

y_pred <- exp(predict(step.model.atropello, df_atropello_test))
y_true <- exp(df_atropello_test$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal atropello ~ Validación"
  )

plot_errors_density(
  model = step.model.atropello, 
  train = df_atropello_train, 
  y_train = df_atropello_train$numero_accidentes,
  test = df_atropello_test, 
  y_test = df_atropello_test$numero_accidentes,
  title="Errores de entrenamiento y validación clase atropello"
  )

El modelo de regresión lineal claramente no se ajusta correctamente a los datos, una de las soluciones puede ser incorporar nuevas variables que logren explicar mejor la variabilidad del fenomeno a estudiar o intentar con modelos diferentes que logren generalizar mejor la variable respuesta.

Clase caida ocupante

Regresión lineal

df_preprocessed <- armado_dataset_diario(df=df, objective_var = "caida ocupante")
df_caida_ocupante_train <- df_preprocessed %>% filter(PERIODO!=2018) %>% dplyr::select(-c(PERIODO, DIA))
df_caida_ocupante_test <- df_preprocessed %>% filter(PERIODO==2018) %>% dplyr::select(-c(PERIODO, DIA))

model.caida_ocupante.recipe <- recipe(numero_accidentes ~ ., 
                              data = df_caida_ocupante_train)
model.caida_ocupante.steps <- model.caida_ocupante.recipe %>% 
  step_log(all_outcomes()) %>%
  step_center(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL) %>% 
  step_scale(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL)

model.caida_ocupante.prepared <- prep(model.caida_ocupante.steps, training = df_caida_ocupante_train)
df_caida_ocupante_train <- bake(model.caida_ocupante.prepared, df_caida_ocupante_train)
df_caida_ocupante_test <- bake(model.caida_ocupante.prepared, df_caida_ocupante_test)

modelo.caida_ocupante <- lm(numero_accidentes ~ ., data=df_caida_ocupante_train)
step.model.caida_ocupante <- stepAIC(modelo.caida_ocupante, direction = "both", trace = F, steps = 2000)

model.caida_ocupante.summary <- summary(step.model.caida_ocupante)
model.caida_ocupante.summary

## 
## Call:
## lm(formula = numero_accidentes ~ MES + choque + otro + volcamiento + 
##     t_minus_2 + t_minus_4 + t_minus_5 + DIA_FESTIVO + FECHA_ESPECIAL + 
##     FESTIVO_FECHA_ESPECIAL + accidentes_temprano + accidentes_temprano_trabajo + 
##     accidentes_almuerzo + accidentes_tarde + accidentes_noche + 
##     DIA_SEMANA, data = df_caida_ocupante_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.25264 -0.23946  0.04606  0.27381  1.00763 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.12682    0.04774  44.549  < 2e-16 ***
## MES2                         0.05325    0.05490   0.970 0.332242    
## MES3                         0.17976    0.05454   3.296 0.001004 ** 
## MES4                         0.07701    0.05377   1.432 0.152255    
## MES5                         0.13129    0.05437   2.415 0.015880 *  
## MES6                         0.03267    0.05386   0.606 0.544293    
## MES7                         0.03543    0.05319   0.666 0.505469    
## MES8                         0.20563    0.05521   3.725 0.000203 ***
## MES9                         0.16311    0.05548   2.940 0.003335 ** 
## MES10                        0.08554    0.05410   1.581 0.114067    
## MES11                        0.12010    0.05445   2.205 0.027579 *  
## MES12                        0.05317    0.05391   0.986 0.324128    
## choque                      -0.19823    0.03960  -5.006 6.26e-07 ***
## otro                        -0.05720    0.01475  -3.877 0.000110 ***
## volcamiento                 -0.04745    0.01198  -3.961 7.85e-05 ***
## t_minus_2                    0.03424    0.01138   3.009 0.002666 ** 
## t_minus_4                    0.02001    0.01130   1.770 0.076908 .  
## t_minus_5                    0.01717    0.01129   1.521 0.128455    
## DIA_FESTIVO1                -0.42343    0.07693  -5.504 4.39e-08 ***
## FECHA_ESPECIAL1             -0.13485    0.07092  -1.901 0.057462 .  
## FESTIVO_FECHA_ESPECIAL1      0.31839    0.12180   2.614 0.009043 ** 
## accidentes_temprano          0.18045    0.04019   4.490 7.70e-06 ***
## accidentes_temprano_trabajo  0.14124    0.02849   4.957 8.01e-07 ***
## accidentes_almuerzo          0.04502    0.01697   2.652 0.008082 ** 
## accidentes_tarde             0.19280    0.03736   5.161 2.80e-07 ***
## accidentes_noche             0.07773    0.02409   3.227 0.001278 ** 
## DIA_SEMANA2                  0.33584    0.04841   6.937 6.07e-12 ***
## DIA_SEMANA3                  0.19725    0.04235   4.658 3.50e-06 ***
## DIA_SEMANA4                  0.18375    0.04258   4.316 1.70e-05 ***
## DIA_SEMANA5                  0.23637    0.04231   5.587 2.77e-08 ***
## DIA_SEMANA6                  0.19669    0.04217   4.665 3.38e-06 ***
## DIA_SEMANA7                  0.07328    0.04164   1.760 0.078620 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4047 on 1423 degrees of freedom
## Multiple R-squared:  0.148,  Adjusted R-squared:  0.1295 
## F-statistic: 7.976 on 31 and 1423 DF,  p-value: < 2.2e-16

y_pred <- exp(predict(step.model.caida_ocupante, df_caida_ocupante_train))
y_true <- exp(df_caida_ocupante_train$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal caida_ocupante ~ Entrenamiento"
  )

y_pred <- exp(predict(step.model.caida_ocupante, df_caida_ocupante_test))
y_true <- exp(df_caida_ocupante_test$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal caida_ocupante ~ Validación"
  )

plot_errors_density(
  model = step.model.caida_ocupante, 
  train = df_caida_ocupante_train, 
  y_train = df_caida_ocupante_train$numero_accidentes,
  test = df_caida_ocupante_test, 
  y_test = df_caida_ocupante_test$numero_accidentes,
  title="Errores de entrenamiento y validación clase caida_ocupante"
  )

Nuevamente, observamos una baja calidad de modelos. Vale la pena explorar nuevas variables para capturar la variabilidad de la variable respuesta.
### Clase volcamiento #### Regresión lineal

df_preprocessed <- armado_dataset_diario(df=df, objective_var = "volcamiento")
df_volcamiento_train <- df_preprocessed %>% filter(PERIODO!=2018) %>% dplyr::select(-c(PERIODO, DIA))
df_volcamiento_test <- df_preprocessed %>% filter(PERIODO==2018) %>% dplyr::select(-c(PERIODO, DIA))

model.volcamiento.recipe <- recipe(numero_accidentes ~ ., 
                              data = df_volcamiento_train)
model.volcamiento.steps <- model.volcamiento.recipe %>% 
  step_log(all_outcomes()) %>%
  step_center(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL) %>% 
  step_scale(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL)

model.volcamiento.prepared <- prep(model.volcamiento.steps, training = df_volcamiento_train)
df_volcamiento_train <- bake(model.volcamiento.prepared, df_volcamiento_train)
df_volcamiento_test <- bake(model.volcamiento.prepared, df_volcamiento_test)

modelo.volcamiento <- lm(numero_accidentes ~ ., data=df_volcamiento_train)
step.model.volcamiento <- stepAIC(modelo.volcamiento, direction = "both", trace = F, steps = 2000)

model.volcamiento.summary <- summary(step.model.volcamiento)
model.volcamiento.summary

## 
## Call:
## lm(formula = numero_accidentes ~ atropello + `caida ocupante` + 
##     choque + volcamiento + t_minus_2 + t_minus_4 + t_minus_5 + 
##     t_minus_6 + accidentes_tarde + DIA_SEMANA, data = df_volcamiento_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81718 -0.37373  0.05275  0.41719  1.45850 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.09706    0.04323  25.376  < 2e-16 ***
## atropello        -0.02578    0.01671  -1.543 0.123003    
## `caida ocupante` -0.02592    0.01670  -1.552 0.120915    
## choque            0.07664    0.02308   3.321 0.000921 ***
## volcamiento       0.04422    0.01659   2.666 0.007774 ** 
## t_minus_2         0.06551    0.01647   3.977 7.34e-05 ***
## t_minus_4         0.03391    0.01654   2.050 0.040511 *  
## t_minus_5         0.03176    0.01633   1.944 0.052052 .  
## t_minus_6         0.03290    0.01648   1.996 0.046151 *  
## accidentes_tarde -0.03335    0.01776  -1.878 0.060623 .  
## DIA_SEMANA2       0.39940    0.06813   5.863 5.69e-09 ***
## DIA_SEMANA3       0.28482    0.06135   4.642 3.77e-06 ***
## DIA_SEMANA4       0.26446    0.06089   4.343 1.51e-05 ***
## DIA_SEMANA5       0.30045    0.06138   4.895 1.10e-06 ***
## DIA_SEMANA6       0.22403    0.06101   3.672 0.000250 ***
## DIA_SEMANA7       0.21282    0.06132   3.470 0.000536 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5945 on 1390 degrees of freedom
## Multiple R-squared:  0.08012,    Adjusted R-squared:  0.0702 
## F-statistic: 8.072 on 15 and 1390 DF,  p-value: < 2.2e-16

y_pred <- exp(predict(step.model.volcamiento, df_volcamiento_train))
y_true <- exp(df_volcamiento_train$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal volcamiento ~ Entrenamiento"
  )

y_pred <- exp(predict(step.model.volcamiento, df_volcamiento_test))
y_true <- exp(df_volcamiento_test$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal volcamiento ~ Validación"
  )

plot_errors_density(
  model = step.model.volcamiento, 
  train = df_volcamiento_train, 
  y_train = df_volcamiento_train$numero_accidentes,
  test = df_volcamiento_test, 
  y_test = df_volcamiento_test$numero_accidentes,
  title="Errores de entrenamiento y validación clase volcamiento"
  )

Clase otros accidentes

Regresión lineal

df_preprocessed <- armado_dataset_diario(df=df, objective_var = "otro")
df_otro_train <- df_preprocessed %>% filter(PERIODO!=2018) %>% dplyr::select(-c(PERIODO, DIA))
df_otro_test <- df_preprocessed %>% filter(PERIODO==2018) %>% dplyr::select(-c(PERIODO, DIA))

model.otro.recipe <- recipe(numero_accidentes ~ ., 
                              data = df_otro_train)
model.otro.steps <- model.otro.recipe %>% 
  step_log(all_outcomes()) %>%
  step_center(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL) %>% 
  step_scale(all_predictors(), -MES, -DIA_SEMANA, -DIA_FESTIVO, -FECHA_ESPECIAL, -FESTIVO_FECHA_ESPECIAL)

model.otro.prepared <- prep(model.otro.steps, training = df_otro_train)
df_otro_train <- bake(model.otro.prepared, df_otro_train)
df_otro_test <- bake(model.otro.prepared, df_otro_test)

modelo.otro <- lm(numero_accidentes ~ ., data=df_otro_train)
step.model.otro <- stepAIC(modelo.otro, direction = "both", trace = F, steps = 2000)

model.otro.summary <- summary(step.model.otro)
model.otro.summary

## 
## Call:
## lm(formula = numero_accidentes ~ MES + choque + otro + t_minus_2 + 
##     t_minus_3 + t_minus_4 + DIA_FESTIVO + accidentes_tarde + 
##     DIA_SEMANA, data = df_otro_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.82171 -0.21817  0.03722  0.24910  1.04217 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.28306    0.04387  52.040  < 2e-16 ***
## MES2              0.16437    0.05169   3.180 0.001505 ** 
## MES3              0.15434    0.05058   3.052 0.002318 ** 
## MES4              0.07430    0.05016   1.481 0.138773    
## MES5              0.21359    0.05087   4.199 2.85e-05 ***
## MES6              0.12814    0.05030   2.547 0.010957 *  
## MES7              0.15588    0.05004   3.115 0.001876 ** 
## MES8              0.17843    0.05124   3.483 0.000511 ***
## MES9              0.15268    0.05136   2.973 0.002998 ** 
## MES10             0.14958    0.05065   2.953 0.003199 ** 
## MES11             0.08308    0.05057   1.643 0.100653    
## MES12             0.08288    0.05033   1.647 0.099802 .  
## choque           -0.02636    0.01469  -1.794 0.072964 .  
## otro              0.03575    0.01070   3.340 0.000859 ***
## t_minus_2         0.02047    0.01053   1.944 0.052050 .  
## t_minus_3         0.02508    0.01052   2.383 0.017291 *  
## t_minus_4         0.02056    0.01049   1.960 0.050140 .  
## DIA_FESTIVO1     -0.22394    0.04730  -4.734 2.42e-06 ***
## accidentes_tarde  0.02591    0.01174   2.207 0.027504 *  
## DIA_SEMANA2       0.23515    0.04335   5.424 6.83e-08 ***
## DIA_SEMANA3       0.27365    0.03836   7.134 1.55e-12 ***
## DIA_SEMANA4       0.26011    0.03898   6.673 3.57e-11 ***
## DIA_SEMANA5       0.27949    0.03840   7.278 5.56e-13 ***
## DIA_SEMANA6       0.22559    0.03771   5.983 2.77e-09 ***
## DIA_SEMANA7       0.05995    0.03818   1.570 0.116571    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3799 on 1430 degrees of freedom
## Multiple R-squared:  0.1287, Adjusted R-squared:  0.1141 
## F-statistic: 8.803 on 24 and 1430 DF,  p-value: < 2.2e-16

y_pred <- exp(predict(step.model.otro, df_otro_train))
y_true <- exp(df_otro_train$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal otro ~ Entrenamiento"
  )

y_pred <- exp(predict(step.model.otro, df_otro_test))
y_true <- exp(df_otro_test$numero_accidentes)

train_rmse <- rmse(y_true, y_pred)

plot_estimation_errors(
  y_true = y_true, 
  y_pred = y_pred, 
  title="Rendimiento de modelo semanal otro ~ Validación"
  )

plot_errors_density(
  model = step.model.otro, 
  train = df_otro_train, 
  y_train = df_otro_train$numero_accidentes,
  test = df_otro_test, 
  y_test = df_otro_test$numero_accidentes,
  title="Errores de entrenamiento y validación clase otro"
  )

daily-predictive-modelling

Alejandro Cadavid Romero

Santiago Vasquez Rodriguez

Introducción

Carga de librerias

Carga de datos

Armado del conjunto de datos

Modelado diario

Clase choque

Preprocesamiento de datos

Modelamiento

Regresión lineal simple

Clase atropello

Regresión lineal

Clase caida ocupante

Regresión lineal

Clase otros accidentes

Regresión lineal