pacotes necessários

#install.packages()
library(readr)
library(dplyr)
## 
## Anexando pacote: 'dplyr'
## Os seguintes objetos são mascarados por 'package:stats':
## 
##     filter, lag
## Os seguintes objetos são mascarados por 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(stringr)
library(lubridate)
## Warning: pacote 'lubridate' foi compilado no R versão 4.5.2
## 
## Anexando pacote: 'lubridate'
## Os seguintes objetos são mascarados por 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyr)
library(ggplot2)
files <- list.files("dados_PRF/", pattern = "\\.csv$", full.names = TRUE)


cat("Arquivos encontrados:", length(files), "\n")
## Arquivos encontrados: 17
cols_all <- map(files, ~ names(read_csv2(.x, n_max = 0)))
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (25): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## 
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unique_cols <- unique(unlist(cols_all))

cat("Número total de colunas identificadas:", length(unique_cols), "\n")
## Número total de colunas identificadas: 31
read_prf <- function(file, all_cols) {
  
  df <- read_csv2(
    file,
    col_types = cols(.default = col_character()),  # tudo como texto para evitar erros
    locale = locale(encoding = "latin1", decimal_mark = ",")
  )
  
  # adicionar colunas faltantes
  missing_cols <- setdiff(all_cols, names(df))
  for (col in missing_cols) {
    df[[col]] <- NA_character_
  }
  
  # garantir a mesma ordem em todos os arquivos
  df <- df[, all_cols]
  
  return(df)
}
df_list <- map(files, read_prf, all_cols = unique_cols)

# unir tudo sem erros
df_raw <- bind_rows(df_list)

cat("Dataset unificado com sucesso!\n")
## Dataset unificado com sucesso!
cat("Total de linhas:", nrow(df_raw), "\n")
## Total de linhas: 2013757
# conversão de datas
if ("data_inversa" %in% names(df_raw)) {
  df_raw$data_inversa <- dmy(df_raw$data_inversa)
}
## Warning: 1114235 failed to parse.
# conversão de horário
if ("horario" %in% names(df_raw)) {
  df_raw$horario <- parse_time(df_raw$horario)
}

# colunas que desejamos tornar numéricas
numeric_cols <- intersect(
  c("id","br","ano","pessoas","mortos","feridos_leves","feridos_graves",
    "ilesos","ignorados","feridos","veiculos","latitude","longitude"),
  names(df_raw)
)

df_raw <- df_raw %>%
  mutate(across(all_of(numeric_cols), ~ parse_number(.x)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(all_of(numeric_cols), ~parse_number(.x))`.
## Caused by warning:
## ! 12 parsing failures.
##     row col expected actual
##  398104  -- a number (null)
##  406616  -- a number (null)
##  446990  -- a number (null)
##  484778  -- a number (null)
## 1621693  -- a number (null)
## ....... ... ........ ......
## See problems(...) for more details.
if ("km" %in% names(df_raw)) {
  df_raw <- df_raw %>%
    mutate(
      km = str_replace(km, ",", "."),
      km = parse_number(km)
    )
}
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `km = parse_number(km)`.
## Caused by warning:
## ! 12 parsing failures.
##     row col expected actual
##  398104  -- a number (null)
##  406616  -- a number (null)
##  446990  -- a number (null)
##  484778  -- a number (null)
## 1621693  -- a number (null)
## ....... ... ........ ......
## See problems(...) for more details.
df_raw <- df_raw %>%
  mutate(across(where(is.character), ~ str_squish(.x)))
df <- df_raw

cat("\nFinalizado! Dataset pronto para análise.\n")
## 
## Finalizado! Dataset pronto para análise.
cat("Total de linhas:", nrow(df), "\n")
## Total de linhas: 2013757
cat("Total de colunas:", ncol(df), "\n")
## Total de colunas: 31
# acidentes por ano
if ("ano" %in% names(df)) {
  acidentes_ano <- df %>%
    group_by(ano) %>%
    summarise(total = n(), mortos = sum(mortos, na.rm = TRUE))
  print(acidentes_ano)
}
## # A tibble: 10 × 3
##      ano  total mortos
##    <dbl>  <int>  <dbl>
##  1  2007 127675   6742
##  2  2008 141043   6914
##  3  2009 158646   7341
##  4  2010 183469   8623
##  5  2011 192326   8675
##  6  2012 184568   8663
##  7  2013 186748   8426
##  8  2014 169201   8234
##  9  2015 122161   6867
## 10    NA 547920  42043
# top 10 municípios mais perigosos
if ("municipio" %in% names(df)) {
  top_mun <- df %>%
    count(municipio, sort = TRUE) %>%
    slice_head(n = 10)
  print(top_mun)
}
## # A tibble: 10 × 2
##    municipio           n
##    <chr>           <int>
##  1 CURITIBA        27608
##  2 SAO JOSE        23081
##  3 GUARULHOS       21110
##  4 DUQUE DE CAXIAS 19979
##  5 BETIM           19873
##  6 SERRA           19484
##  7 BRASILIA        19167
##  8 PALHOCA         15556
##  9 PORTO VELHO     14906
## 10 CARIACICA       14808
# distribuição temporal mensal
if ("data_inversa" %in% names(df)) {
  mensal <- df %>%
    mutate(mes = floor_date(data_inversa, "month")) %>%
    count(mes)
  print(mensal)
}
## # A tibble: 73 × 2
##    mes            n
##    <date>     <int>
##  1 2007-01-01 10402
##  2 2007-02-01  9609
##  3 2007-03-01  9827
##  4 2007-04-01 10368
##  5 2007-05-01 10121
##  6 2007-06-01 10329
##  7 2007-07-01 11007
##  8 2007-08-01 10196
##  9 2007-09-01 10451
## 10 2007-10-01 11037
## # ℹ 63 more rows
df <- df %>%
  mutate(
    total_vitimas = mortos + feridos_leves + feridos_graves,
    grave = if_else(mortos > 0 | feridos_graves > 0, TRUE, FALSE)
  )
df %>%
  count(ano, name = "n_acidentes") %>%
  ggplot(aes(x = ano, y = n_acidentes)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Número de acidentes por ano",
    x = "Ano",
    y = "Acidentes"
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

df <- df %>%
  mutate(
    ano = year(data_inversa),
    mes = month(data_inversa),
    dia = day(data_inversa)
  )
df %>%
  count(ano, mes, name = "n") %>%
  ggplot(aes(x = mes, y = n, group = ano, color = as.factor(ano))) +
  geom_line(alpha = 0.4) +
  labs(
    title = "Acidentes por mês (sazonalidade por ano)",
    x = "Mês",
    y = "Acidentes",
    color = "Ano"
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

acidente por dia da semana

Padronizar os dias da semana

df <- df %>%
  mutate(
    dia_semana = str_to_lower(dia_semana),
    dia_semana = str_replace_all(dia_semana, "[^a-zà-ú ]", " "),
    dia_semana = str_squish(dia_semana),

    dia_semana = case_when(
      str_detect(dia_semana, "domingo") ~ "Domingo",
      str_detect(dia_semana, "segunda") ~ "Segunda",
      str_detect(dia_semana, "terca") | str_detect(dia_semana, "terça") ~ "Terça",
      str_detect(dia_semana, "quarta") ~ "Quarta",
      str_detect(dia_semana, "quinta") ~ "Quinta",
      str_detect(dia_semana, "sexta") ~ "Sexta",
      str_detect(dia_semana, "sabado") | str_detect(dia_semana, "sábado") ~ "Sábado",
      TRUE ~ NA_character_
    ),

    dia_semana = factor(
      dia_semana,
      levels = c("Domingo", "Segunda", "Terça", "Quarta", "Quinta", "Sexta", "Sábado"),
      ordered = TRUE
    )
  )

Gráfico

df %>%
  count(dia_semana, name = "n") %>%
  ggplot(aes(x = dia_semana, y = n)) +
  geom_col() +
  labs(
    title = "Acidentes por dia da semana",
    x = "Dia da semana",
    y = "Quantidade de acidentes"
  )

df %>%
  group_by(causa_acidente) %>%
  summarise(
    n = n(),
    n_graves = sum(grave, na.rm = TRUE),
    proporcao_grave = n_graves / n
  ) %>%
  filter(n > 100) %>%                       
  slice_max(proporcao_grave, n = 10) %>%    
  ggplot(aes(x = reorder(causa_acidente, proporcao_grave), 
             y = proporcao_grave)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 causas com maior proporção de acidentes graves",
    x = "Causa do acidente",
    y = "Proporção de acidentes graves"
  )

top_br <- df %>%
  group_by(br) %>%
  summarise(
    n = n(),
    n_graves = sum(grave)
  ) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
ggplot(top_br, aes(x = reorder(as.factor(br), n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 BRs com mais acidentes",
    x = "BR",
    y = "Número de acidentes"
  )

top_uf <- df %>%
  group_by(uf) %>%
  summarise(
    n = n(),
    n_graves = sum(grave)
  ) %>%
  arrange(desc(n))

ggplot(top_uf, aes(x = reorder(uf, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Acidentes por Unidade Federativa",
    x = "UF",
    y = "Número de acidentes"
  )

df <- df %>%
  mutate(
    hora = hour(horario),
    periodo_dia = case_when(
      hora >= 0 & hora < 6  ~ "Madrugada",
      hora >= 6 & hora < 12 ~ "Manhã",
      hora >= 12 & hora < 18 ~ "Tarde",
      hora >= 18 & hora <= 23 ~ "Noite",
      TRUE ~ NA_character_
    ),
    periodo_dia = factor(periodo_dia,
                         levels = c("Madrugada", "Manhã", "Tarde", "Noite"),
                         ordered = TRUE)
  )
df %>%
  ggplot(aes(x = hora, fill = periodo_dia)) +
  geom_density(alpha = 0.4) +
  scale_x_continuous(breaks = seq(0, 23, 3)) +
  labs(
    title = "Densidade de acidentes por período do dia",
    x = "Hora do dia",
    y = "Densidade",
    fill = "Período"
  ) +
  theme_minimal()