library(readr)
library(dplyr)
library(ggplot2)
df <- read_delim(
"casosgravescovid.csv",
delim = ";",
quote = "",
escape_backslash = FALSE,
escape_double = FALSE,
show_col_types = FALSE
)
# # Estrutura inicial
str(df)
spc_tbl_ [36,435 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ data_notificacao : Date[1:36435], format: "2020-05-30" "2020-06-02" "2020-06-02" "2020-06-01" ...
$ sexo : chr [1:36435] "Masculino" "Feminino" "Feminino" "Masculino" ...
$ raca : chr [1:36435] "Branca" "Parda" "Parda" "Branca" ...
$ etnia : logi [1:36435] NA NA NA NA NA NA ...
$ idade : chr [1:36435] "53" "0" "86" "66" ...
$ municipio_residencia : chr [1:36435] "Recife" "Recife" "Recife" "Recife" ...
$ bairro : chr [1:36435] "SAN MARTIN" "ALTO SANTA TEREZINHA" "ESPINHEIRO" "CASA FORTE" ...
$ distrito_sanitario : chr [1:36435] "V" "II" "III" "III" ...
$ data_inicio_sintomas : Date[1:36435], format: "2020-05-25" "2020-05-30" "2020-06-01" "2020-05-25" ...
$ sintomas : chr [1:36435] "Febre, Tosse, Mialgia, Alteração/perda de olfato e/ou paladar" "Febre, Tosse, Dispneia, Saturação O2 < 95" "Dispneia, Saturação O2 < 95" NA ...
$ outros_sintomas : chr [1:36435] NA NA NA NA ...
$ doencas_preexistentes : chr [1:36435] NA NA NA NA ...
$ outras_doencas_preexistentes : chr [1:36435] NA NA NA NA ...
$ profissional_saude : chr [1:36435] NA NA NA NA ...
$ categoria_profissional : chr [1:36435] NA NA NA NA ...
$ classificacao_final : chr [1:36435] "DESCARTADO" "INCONCLUSIVO" "DESCARTADO" "CONFIRMADO" ...
$ evolucao : chr [1:36435] "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" "INTERNADO LEITO DE ISOLAMENTO" "RECUPERADO" ...
$ data_obito : chr [1:36435] NA NA NA NA ...
- attr(*, "spec")=
.. cols(
.. data_notificacao = col_date(format = ""),
.. sexo = col_character(),
.. raca = col_character(),
.. etnia = col_logical(),
.. idade = col_character(),
.. municipio_residencia = col_character(),
.. bairro = col_character(),
.. distrito_sanitario = col_character(),
.. data_inicio_sintomas = col_date(format = ""),
.. sintomas = col_character(),
.. outros_sintomas = col_character(),
.. doencas_preexistentes = col_character(),
.. `outras_doencas_preexistentes ` = col_character(),
.. profissional_saude = col_character(),
.. categoria_profissional = col_character(),
.. classificacao_final = col_character(),
.. evolucao = col_character(),
.. data_obito = col_character()
.. )
- attr(*, "problems")=<externalptr>
# Estatísticas gerais
summary(df)
data_notificacao sexo raca etnia idade municipio_residencia
Min. :2020-02-26 Length:36435 Length:36435 Mode:logical Length:36435 Length:36435
1st Qu.:2020-05-15 Class :character Class :character NA's:36435 Class :character Class :character
Median :2020-11-14 Mode :character Mode :character Mode :character Mode :character
Mean :2020-12-22
3rd Qu.:2021-06-01
Max. :2022-06-13
bairro distrito_sanitario data_inicio_sintomas sintomas outros_sintomas
Length:36435 Length:36435 Min. :2020-01-22 Length:36435 Length:36435
Class :character Class :character 1st Qu.:2020-05-07 Class :character Class :character
Mode :character Mode :character Median :2020-11-01 Mode :character Mode :character
Mean :2020-12-07
3rd Qu.:2021-05-11
Max. :2022-06-11
doencas_preexistentes outras_doencas_preexistentes profissional_saude categoria_profissional classificacao_final
Length:36435 Length:36435 Length:36435 Length:36435 Length:36435
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
evolucao data_obito
Length:36435 Length:36435
Class :character Class :character
Mode :character Mode :character
# Verificar valores nulos
colSums(is.na(df))
data_notificacao sexo raca
0 0 6329
etnia idade municipio_residencia
36435 0 0
bairro distrito_sanitario data_inicio_sintomas
0 0 0
sintomas outros_sintomas doencas_preexistentes
1958 30461 22737
outras_doencas_preexistentes profissional_saude categoria_profissional
28627 9098 33516
classificacao_final evolucao data_obito
4 978 27086
Limpeza e Engenharia de Dados
# Verificar duplicados
total_duplicados <- sum(duplicated(df))
total_duplicados
[1] 6
# Visualizar duplicados
duplicados <- df[duplicated(df), ]
head(duplicados)
# Remover duplicados
df <- df %>% distinct()
# Tratar datas
df <- df %>%
mutate(
data_notificacao = as.Date(data_notificacao, format = "%Y-%m-%d"),
data_inicio_sintomas = as.Date(data_inicio_sintomas, format = "%Y-%m-%d"),
data_obito = as.Date(data_obito, format = "%Y-%m-%d")
)
Separar colunas multivaloradas
df <- df %>%
mutate(id = row_number())
df_long <- df %>%
mutate(id_paciente = row_number()) %>%
mutate(
sintomas = str_split(sintomas, ","),
outros_sintomas = str_split(outros_sintomas, ","),
doencas_preexistentes = str_split(doencas_preexistentes, ",")
) %>%
unnest(sintomas) %>%
unnest(outros_sintomas) %>%
unnest(doencas_preexistentes) %>%
mutate(across(c(sintomas, outros_sintomas, doencas_preexistentes), trimws))
str(df_long)
tibble [170,553 × 20] (S3: tbl_df/tbl/data.frame)
$ data_notificacao : Date[1:170553], format: "2020-05-30" "2020-05-30" "2020-05-30" "2020-05-30" ...
$ sexo : chr [1:170553] "Masculino" "Masculino" "Masculino" "Masculino" ...
$ raca : chr [1:170553] "Branca" "Branca" "Branca" "Branca" ...
$ etnia : logi [1:170553] NA NA NA NA NA NA ...
$ idade : chr [1:170553] "53" "53" "53" "53" ...
$ municipio_residencia : chr [1:170553] "Recife" "Recife" "Recife" "Recife" ...
$ bairro : chr [1:170553] "SAN MARTIN" "SAN MARTIN" "SAN MARTIN" "SAN MARTIN" ...
$ distrito_sanitario : chr [1:170553] "V" "V" "V" "V" ...
$ data_inicio_sintomas : Date[1:170553], format: "2020-05-25" "2020-05-25" "2020-05-25" "2020-05-25" ...
$ sintomas : chr [1:170553] "Febre" "Tosse" "Mialgia" "Alteração/perda de olfato e/ou paladar" ...
$ outros_sintomas : chr [1:170553] NA NA NA NA ...
$ doencas_preexistentes : chr [1:170553] NA NA NA NA ...
$ outras_doencas_preexistentes : chr [1:170553] NA NA NA NA ...
$ profissional_saude : chr [1:170553] NA NA NA NA ...
$ categoria_profissional : chr [1:170553] NA NA NA NA ...
$ classificacao_final : chr [1:170553] "DESCARTADO" "DESCARTADO" "DESCARTADO" "DESCARTADO" ...
$ evolucao : chr [1:170553] "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" ...
$ data_obito : Date[1:170553], format: NA NA NA NA ...
$ id : int [1:170553] 1 1 1 1 2 2 2 2 3 3 ...
$ id_paciente : int [1:170553] 1 1 1 1 2 2 2 2 3 3 ...
Visualizações iniciais
library(dplyr)
library(ggplot2)
library(purrr)
library(forcats)
# Identificar colunas categóricas (exceto ID)
colunas_categoricas <- df %>%
select(where(is.character), where(is.factor)) %>%
colnames()
# Função: gráfico por ID
plot_por_id <- function(col) {
df_long %>%
filter(!is.na(.data[[col]]), .data[[col]] != "") %>%
count(id_paciente, .data[[col]]) %>%
count(.data[[col]]) %>% # total de IDs por categoria
mutate(
categoria = fct_reorder(.data[[col]], n)
) %>%
ggplot(aes(x = categoria, y = n)) +
geom_bar(stat = "identity", fill = "#2C7FB8") +
coord_flip() +
labs(
title = paste("Distribuição por ID da coluna:", col),
x = "Categoria",
y = "Quantidade de IDs"
) +
theme_minimal(base_size = 12)
}
# Gerar gráficos para cada coluna categórica
plots <- map(colunas_categoricas, plot_por_id)
plots
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
[[10]]
[[11]]
[[12]]
[[13]]
[[14]]














DQotLS0NCnRpdGxlOiAiQW7DoWxpc2UgaW5pY2lhbCBkb3MgZGFkb3MgZGUgQ09WSUQiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkocmVhZHIpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KDQpkZiA8LSByZWFkX2RlbGltKA0KICAiY2Fzb3NncmF2ZXNjb3ZpZC5jc3YiLA0KICBkZWxpbSA9ICI7IiwNCiAgcXVvdGUgPSAiIiwNCiAgZXNjYXBlX2JhY2tzbGFzaCA9IEZBTFNFLA0KICBlc2NhcGVfZG91YmxlID0gRkFMU0UsDQogIHNob3dfY29sX3R5cGVzID0gRkFMU0UNCikNCg0KYGBgDQpgYGB7cn0NCiMgIyBFc3RydXR1cmEgaW5pY2lhbA0Kc3RyKGRmKQ0KYGBgDQoNCmBgYHtyfQ0KIyBFc3RhdMOtc3RpY2FzIGdlcmFpcw0Kc3VtbWFyeShkZikNCmBgYA0KYGBge3J9DQojIFZlcmlmaWNhciB2YWxvcmVzIG51bG9zDQpjb2xTdW1zKGlzLm5hKGRmKSkNCmBgYA0KDQojIyBMaW1wZXphIGUgRW5nZW5oYXJpYSBkZSBEYWRvcw0KDQpgYGB7cn0NCiMgVmVyaWZpY2FyIGR1cGxpY2Fkb3MNCnRvdGFsX2R1cGxpY2Fkb3MgPC0gc3VtKGR1cGxpY2F0ZWQoZGYpKQ0KdG90YWxfZHVwbGljYWRvcw0KDQojIFZpc3VhbGl6YXIgZHVwbGljYWRvcw0KZHVwbGljYWRvcyA8LSBkZltkdXBsaWNhdGVkKGRmKSwgXQ0KaGVhZChkdXBsaWNhZG9zKQ0KDQojIFJlbW92ZXIgZHVwbGljYWRvcw0KZGYgPC0gZGYgJT4lIGRpc3RpbmN0KCkNCg0KYGBgDQpgYGB7cn0NCiMgVHJhdGFyIGRhdGFzDQpkZiA8LSBkZiAlPiUNCiAgbXV0YXRlKA0KICAgIGRhdGFfbm90aWZpY2FjYW8gPSBhcy5EYXRlKGRhdGFfbm90aWZpY2FjYW8sIGZvcm1hdCA9ICIlWS0lbS0lZCIpLA0KICAgIGRhdGFfaW5pY2lvX3NpbnRvbWFzID0gYXMuRGF0ZShkYXRhX2luaWNpb19zaW50b21hcywgZm9ybWF0ID0gIiVZLSVtLSVkIiksDQogICAgZGF0YV9vYml0byA9IGFzLkRhdGUoZGF0YV9vYml0bywgZm9ybWF0ID0gIiVZLSVtLSVkIikNCiAgKQ0KYGBgDQoNCiMjIyBTZXBhcmFyIGNvbHVuYXMgbXVsdGl2YWxvcmFkYXMNCg0KYGBge3J9DQpkZiA8LSBkZiAlPiUNCiAgbXV0YXRlKGlkID0gcm93X251bWJlcigpKQ0KYGBgDQoNCmBgYHtyfSANCmRmX2xvbmcgPC0gZGYgJT4lDQogIG11dGF0ZShpZF9wYWNpZW50ZSA9IHJvd19udW1iZXIoKSkgJT4lDQogIG11dGF0ZSgNCiAgICBzaW50b21hcyA9IHN0cl9zcGxpdChzaW50b21hcywgIiwiKSwNCiAgICBvdXRyb3Nfc2ludG9tYXMgPSBzdHJfc3BsaXQob3V0cm9zX3NpbnRvbWFzLCAiLCIpLA0KICAgIGRvZW5jYXNfcHJlZXhpc3RlbnRlcyA9IHN0cl9zcGxpdChkb2VuY2FzX3ByZWV4aXN0ZW50ZXMsICIsIikNCiAgKSAlPiUNCiAgdW5uZXN0KHNpbnRvbWFzKSAlPiUNCiAgdW5uZXN0KG91dHJvc19zaW50b21hcykgJT4lDQogIHVubmVzdChkb2VuY2FzX3ByZWV4aXN0ZW50ZXMpICU+JQ0KICBtdXRhdGUoYWNyb3NzKGMoc2ludG9tYXMsIG91dHJvc19zaW50b21hcywgZG9lbmNhc19wcmVleGlzdGVudGVzKSwgdHJpbXdzKSkNCg0KDQpgYGANCg0KYGBge3J9DQpzdHIoZGZfbG9uZykNCg0KYGBgDQoNCiMjIFZpc3VhbGl6YcOnw7VlcyBpbmljaWFpcw0KDQpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkocHVycnIpDQpsaWJyYXJ5KGZvcmNhdHMpDQoNCiMgSWRlbnRpZmljYXIgY29sdW5hcyBjYXRlZ8OzcmljYXMgKGV4Y2V0byBJRCkNCmNvbHVuYXNfY2F0ZWdvcmljYXMgPC0gZGYgJT4lDQogIHNlbGVjdCh3aGVyZShpcy5jaGFyYWN0ZXIpLCB3aGVyZShpcy5mYWN0b3IpKSAlPiUNCiAgY29sbmFtZXMoKQ0KDQoNCiMgRnVuw6fDo286IGdyw6FmaWNvIHBvciBJRA0KcGxvdF9wb3JfaWQgPC0gZnVuY3Rpb24oY29sKSB7DQogIA0KICBkZl9sb25nICU+JQ0KICAgIGZpbHRlcighaXMubmEoLmRhdGFbW2NvbF1dKSwgLmRhdGFbW2NvbF1dICE9ICIiKSAlPiUNCiAgICBjb3VudChpZF9wYWNpZW50ZSwgLmRhdGFbW2NvbF1dKSAlPiUNCiAgICBjb3VudCguZGF0YVtbY29sXV0pICU+JSAgIyB0b3RhbCBkZSBJRHMgcG9yIGNhdGVnb3JpYQ0KICAgIG11dGF0ZSgNCiAgICAgIGNhdGVnb3JpYSA9IGZjdF9yZW9yZGVyKC5kYXRhW1tjb2xdXSwgbikNCiAgICApICU+JQ0KICAgIGdncGxvdChhZXMoeCA9IGNhdGVnb3JpYSwgeSA9IG4pKSArDQogICAgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIsIGZpbGwgPSAiIzJDN0ZCOCIpICsNCiAgICBjb29yZF9mbGlwKCkgKw0KICAgIGxhYnMoDQogICAgICB0aXRsZSA9IHBhc3RlKCJEaXN0cmlidWnDp8OjbyBwb3IgSUQgZGEgY29sdW5hOiIsIGNvbCksDQogICAgICB4ID0gIkNhdGVnb3JpYSIsDQogICAgICB5ID0gIlF1YW50aWRhZGUgZGUgSURzIg0KICAgICkgKw0KICAgIHRoZW1lX21pbmltYWwoYmFzZV9zaXplID0gMTIpDQp9DQoNCiMgR2VyYXIgZ3LDoWZpY29zIHBhcmEgY2FkYSBjb2x1bmEgY2F0ZWfDs3JpY2ENCnBsb3RzIDwtIG1hcChjb2x1bmFzX2NhdGVnb3JpY2FzLCBwbG90X3Bvcl9pZCkNCg0KcGxvdHMNCg0KYGBgDQoNCiMjIyBWaXN1YWxpemFuZG8gY29sdW5hcyBxdWUgZmljYXJhbSBjb20gZMOtZmljaWwgdmlzdWFsaXphw6fDo28gcG9yIGdyw6FmaWNvDQoNCmBgYHtyfSANCg0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc3RyaW5ncikNCmxpYnJhcnkocHVycnIpDQpsaWJyYXJ5KHRpZHlyKQ0KDQpjb2x1bmFzIDwtIGMoDQogICJzaW50b21hcyIsDQogICJvdXRyb3Nfc2ludG9tYXMiLA0KICAiZG9lbmNhc19wcmVleGlzdGVudGVzIiwNCiAgIm91dHJhc19kb2VuY2FzX3ByZWV4aXN0ZW50ZXMgIg0KKQ0KDQpsaXN0YXJfdW5pY29zIDwtIGZ1bmN0aW9uKGRmLCBjb2x1bmEpIHsNCiAgZGYgJT4lDQogICAgc2VsZWN0KGFsbF9vZihjb2x1bmEpKSAlPiUNCiAgICBmaWx0ZXIoIWlzLm5hKC5kYXRhW1tjb2x1bmFdXSkpICU+JQ0KICAgICMgZXhwbG9kZSB2YWxvcmVzIHNlcGFyYWRvcyBwb3IgdsOtcmd1bGENCiAgICBtdXRhdGUodmFsb3IgPSBzdHJfc3BsaXQoLmRhdGFbW2NvbHVuYV1dLCAiLCIpKSAlPiUNCiAgICB1bm5lc3QodmFsb3IpICU+JQ0KICAgIG11dGF0ZSh2YWxvciA9IHN0cl90cmltKHZhbG9yKSkgJT4lDQogICAgZGlzdGluY3QodmFsb3IpICU+JQ0KICAgIGFycmFuZ2UodmFsb3IpDQp9DQoNCnZhbG9yZXNfdW5pY29zIDwtIG1hcChjb2x1bmFzLCB+IGxpc3Rhcl91bmljb3MoZGYsIC54KSkNCg0KbmFtZXModmFsb3Jlc191bmljb3MpIDwtIGNvbHVuYXMNCg0KdmFsb3Jlc191bmljb3MNCg0KYGBg