pacotes necessários
#install.packages()
library(readr)
library(dplyr)
##
## Anexando pacote: 'dplyr'
## Os seguintes objetos são mascarados por 'package:stats':
##
## filter, lag
## Os seguintes objetos são mascarados por 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(stringr)
library(lubridate)
## Warning: pacote 'lubridate' foi compilado no R versão 4.5.2
##
## Anexando pacote: 'lubridate'
## Os seguintes objetos são mascarados por 'package:base':
##
## date, intersect, setdiff, union
library(tidyr)
library(ggplot2)
files <- list.files("dados_PRF/", pattern = "\\.csv$", full.names = TRUE)
cat("Arquivos encontrados:", length(files), "\n")
## Arquivos encontrados: 17
cols_all <- map(files, ~ names(read_csv2(.x, n_max = 0)))
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (25): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (30): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## Rows: 0 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (26): id, data_inversa, dia_semana, horario, uf, br, km, municipio, caus...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unique_cols <- unique(unlist(cols_all))
cat("Número total de colunas identificadas:", length(unique_cols), "\n")
## Número total de colunas identificadas: 31
read_prf <- function(file, all_cols) {
df <- read_csv2(
file,
col_types = cols(.default = col_character()), # tudo como texto para evitar erros
locale = locale(encoding = "latin1", decimal_mark = ",")
)
# adicionar colunas faltantes
missing_cols <- setdiff(all_cols, names(df))
for (col in missing_cols) {
df[[col]] <- NA_character_
}
# garantir a mesma ordem em todos os arquivos
df <- df[, all_cols]
return(df)
}
df_list <- map(files, read_prf, all_cols = unique_cols)
# unir tudo sem erros
df_raw <- bind_rows(df_list)
cat("Dataset unificado com sucesso!\n")
## Dataset unificado com sucesso!
cat("Total de linhas:", nrow(df_raw), "\n")
## Total de linhas: 2013757
# conversão de datas
if ("data_inversa" %in% names(df_raw)) {
df_raw$data_inversa <- dmy(df_raw$data_inversa)
}
## Warning: 1114235 failed to parse.
# conversão de horário
if ("horario" %in% names(df_raw)) {
df_raw$horario <- parse_time(df_raw$horario)
}
# colunas que desejamos tornar numéricas
numeric_cols <- intersect(
c("id","br","ano","pessoas","mortos","feridos_leves","feridos_graves",
"ilesos","ignorados","feridos","veiculos","latitude","longitude"),
names(df_raw)
)
df_raw <- df_raw %>%
mutate(across(all_of(numeric_cols), ~ parse_number(.x)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(all_of(numeric_cols), ~parse_number(.x))`.
## Caused by warning:
## ! 12 parsing failures.
## row col expected actual
## 398104 -- a number (null)
## 406616 -- a number (null)
## 446990 -- a number (null)
## 484778 -- a number (null)
## 1621693 -- a number (null)
## ....... ... ........ ......
## See problems(...) for more details.
if ("km" %in% names(df_raw)) {
df_raw <- df_raw %>%
mutate(
km = str_replace(km, ",", "."),
km = parse_number(km)
)
}
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `km = parse_number(km)`.
## Caused by warning:
## ! 12 parsing failures.
## row col expected actual
## 398104 -- a number (null)
## 406616 -- a number (null)
## 446990 -- a number (null)
## 484778 -- a number (null)
## 1621693 -- a number (null)
## ....... ... ........ ......
## See problems(...) for more details.
df_raw <- df_raw %>%
mutate(across(where(is.character), ~ str_squish(.x)))
df <- df_raw
cat("\nFinalizado! Dataset pronto para análise.\n")
##
## Finalizado! Dataset pronto para análise.
cat("Total de linhas:", nrow(df), "\n")
## Total de linhas: 2013757
cat("Total de colunas:", ncol(df), "\n")
## Total de colunas: 31
# acidentes por ano
if ("ano" %in% names(df)) {
acidentes_ano <- df %>%
group_by(ano) %>%
summarise(total = n(), mortos = sum(mortos, na.rm = TRUE))
print(acidentes_ano)
}
## # A tibble: 10 × 3
## ano total mortos
## <dbl> <int> <dbl>
## 1 2007 127675 6742
## 2 2008 141043 6914
## 3 2009 158646 7341
## 4 2010 183469 8623
## 5 2011 192326 8675
## 6 2012 184568 8663
## 7 2013 186748 8426
## 8 2014 169201 8234
## 9 2015 122161 6867
## 10 NA 547920 42043
# top 10 municípios mais perigosos
if ("municipio" %in% names(df)) {
top_mun <- df %>%
count(municipio, sort = TRUE) %>%
slice_head(n = 10)
print(top_mun)
}
## # A tibble: 10 × 2
## municipio n
## <chr> <int>
## 1 CURITIBA 27608
## 2 SAO JOSE 23081
## 3 GUARULHOS 21110
## 4 DUQUE DE CAXIAS 19979
## 5 BETIM 19873
## 6 SERRA 19484
## 7 BRASILIA 19167
## 8 PALHOCA 15556
## 9 PORTO VELHO 14906
## 10 CARIACICA 14808
# distribuição temporal mensal
if ("data_inversa" %in% names(df)) {
mensal <- df %>%
mutate(mes = floor_date(data_inversa, "month")) %>%
count(mes)
print(mensal)
}
## # A tibble: 73 × 2
## mes n
## <date> <int>
## 1 2007-01-01 10402
## 2 2007-02-01 9609
## 3 2007-03-01 9827
## 4 2007-04-01 10368
## 5 2007-05-01 10121
## 6 2007-06-01 10329
## 7 2007-07-01 11007
## 8 2007-08-01 10196
## 9 2007-09-01 10451
## 10 2007-10-01 11037
## # ℹ 63 more rows
df <- df %>%
mutate(
total_vitimas = mortos + feridos_leves + feridos_graves,
grave = if_else(mortos > 0 | feridos_graves > 0, TRUE, FALSE)
)
df %>%
count(ano, name = "n_acidentes") %>%
ggplot(aes(x = ano, y = n_acidentes)) +
geom_line() +
geom_point() +
labs(
title = "Número de acidentes por ano",
x = "Ano",
y = "Acidentes"
)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

df <- df %>%
mutate(
ano = year(data_inversa),
mes = month(data_inversa),
dia = day(data_inversa)
)
df %>%
count(ano, mes, name = "n") %>%
ggplot(aes(x = mes, y = n, group = ano, color = as.factor(ano))) +
geom_line(alpha = 0.4) +
labs(
title = "Acidentes por mês (sazonalidade por ano)",
x = "Mês",
y = "Acidentes",
color = "Ano"
)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

acidente por dia da semana
Padronizar os dias da semana
df <- df %>%
mutate(
dia_semana = str_to_lower(dia_semana),
dia_semana = str_replace_all(dia_semana, "[^a-zà-ú ]", " "),
dia_semana = str_squish(dia_semana),
dia_semana = case_when(
str_detect(dia_semana, "domingo") ~ "Domingo",
str_detect(dia_semana, "segunda") ~ "Segunda",
str_detect(dia_semana, "terca") | str_detect(dia_semana, "terça") ~ "Terça",
str_detect(dia_semana, "quarta") ~ "Quarta",
str_detect(dia_semana, "quinta") ~ "Quinta",
str_detect(dia_semana, "sexta") ~ "Sexta",
str_detect(dia_semana, "sabado") | str_detect(dia_semana, "sábado") ~ "Sábado",
TRUE ~ NA_character_
),
dia_semana = factor(
dia_semana,
levels = c("Domingo", "Segunda", "Terça", "Quarta", "Quinta", "Sexta", "Sábado"),
ordered = TRUE
)
)
Gráfico
df %>%
count(dia_semana, name = "n") %>%
ggplot(aes(x = dia_semana, y = n)) +
geom_col() +
labs(
title = "Acidentes por dia da semana",
x = "Dia da semana",
y = "Quantidade de acidentes"
)

df %>%
group_by(causa_acidente) %>%
summarise(
n = n(),
n_graves = sum(grave, na.rm = TRUE),
proporcao_grave = n_graves / n
) %>%
filter(n > 100) %>%
slice_max(proporcao_grave, n = 10) %>%
ggplot(aes(x = reorder(causa_acidente, proporcao_grave),
y = proporcao_grave)) +
geom_col() +
coord_flip() +
labs(
title = "Top 10 causas com maior proporção de acidentes graves",
x = "Causa do acidente",
y = "Proporção de acidentes graves"
)

top_br <- df %>%
group_by(br) %>%
summarise(
n = n(),
n_graves = sum(grave)
) %>%
arrange(desc(n)) %>%
slice_head(n = 10)
ggplot(top_br, aes(x = reorder(as.factor(br), n), y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Top 10 BRs com mais acidentes",
x = "BR",
y = "Número de acidentes"
)

top_uf <- df %>%
group_by(uf) %>%
summarise(
n = n(),
n_graves = sum(grave)
) %>%
arrange(desc(n))
ggplot(top_uf, aes(x = reorder(uf, n), y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Acidentes por Unidade Federativa",
x = "UF",
y = "Número de acidentes"
)

df <- df %>%
mutate(
hora = hour(horario),
periodo_dia = case_when(
hora >= 0 & hora < 6 ~ "Madrugada",
hora >= 6 & hora < 12 ~ "Manhã",
hora >= 12 & hora < 18 ~ "Tarde",
hora >= 18 & hora <= 23 ~ "Noite",
TRUE ~ NA_character_
),
periodo_dia = factor(periodo_dia,
levels = c("Madrugada", "Manhã", "Tarde", "Noite"),
ordered = TRUE)
)
df %>%
ggplot(aes(x = hora, fill = periodo_dia)) +
geom_density(alpha = 0.4) +
scale_x_continuous(breaks = seq(0, 23, 3)) +
labs(
title = "Densidade de acidentes por período do dia",
x = "Hora do dia",
y = "Densidade",
fill = "Período"
) +
theme_minimal()
