library(readr)
library(dplyr)
library(ggplot2)

df <- read_delim(
  "casosgravescovid.csv",
  delim = ";",
  quote = "",
  escape_backslash = FALSE,
  escape_double = FALSE,
  show_col_types = FALSE
)
# # Estrutura inicial
str(df)
spc_tbl_ [36,435 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ data_notificacao             : Date[1:36435], format: "2020-05-30" "2020-06-02" "2020-06-02" "2020-06-01" ...
 $ sexo                         : chr [1:36435] "Masculino" "Feminino" "Feminino" "Masculino" ...
 $ raca                         : chr [1:36435] "Branca" "Parda" "Parda" "Branca" ...
 $ etnia                        : logi [1:36435] NA NA NA NA NA NA ...
 $ idade                        : chr [1:36435] "53" "0" "86" "66" ...
 $ municipio_residencia         : chr [1:36435] "Recife" "Recife" "Recife" "Recife" ...
 $ bairro                       : chr [1:36435] "SAN MARTIN" "ALTO SANTA TEREZINHA" "ESPINHEIRO" "CASA FORTE" ...
 $ distrito_sanitario           : chr [1:36435] "V" "II" "III" "III" ...
 $ data_inicio_sintomas         : Date[1:36435], format: "2020-05-25" "2020-05-30" "2020-06-01" "2020-05-25" ...
 $ sintomas                     : chr [1:36435] "Febre, Tosse, Mialgia, Alteração/perda de olfato e/ou paladar" "Febre, Tosse, Dispneia, Saturação O2 < 95" "Dispneia, Saturação O2 < 95" NA ...
 $ outros_sintomas              : chr [1:36435] NA NA NA NA ...
 $ doencas_preexistentes        : chr [1:36435] NA NA NA NA ...
 $ outras_doencas_preexistentes : chr [1:36435] NA NA NA NA ...
 $ profissional_saude           : chr [1:36435] NA NA NA NA ...
 $ categoria_profissional       : chr [1:36435] NA NA NA NA ...
 $ classificacao_final          : chr [1:36435] "DESCARTADO" "INCONCLUSIVO" "DESCARTADO" "CONFIRMADO" ...
 $ evolucao                     : chr [1:36435] "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" "INTERNADO LEITO DE ISOLAMENTO" "RECUPERADO" ...
 $ data_obito                   : chr [1:36435] NA NA NA NA ...
 - attr(*, "spec")=
  .. cols(
  ..   data_notificacao = col_date(format = ""),
  ..   sexo = col_character(),
  ..   raca = col_character(),
  ..   etnia = col_logical(),
  ..   idade = col_character(),
  ..   municipio_residencia = col_character(),
  ..   bairro = col_character(),
  ..   distrito_sanitario = col_character(),
  ..   data_inicio_sintomas = col_date(format = ""),
  ..   sintomas = col_character(),
  ..   outros_sintomas = col_character(),
  ..   doencas_preexistentes = col_character(),
  ..   `outras_doencas_preexistentes ` = col_character(),
  ..   profissional_saude = col_character(),
  ..   categoria_profissional = col_character(),
  ..   classificacao_final = col_character(),
  ..   evolucao = col_character(),
  ..   data_obito = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
# Estatísticas gerais
summary(df)
 data_notificacao         sexo               raca            etnia            idade           municipio_residencia
 Min.   :2020-02-26   Length:36435       Length:36435       Mode:logical   Length:36435       Length:36435        
 1st Qu.:2020-05-15   Class :character   Class :character   NA's:36435     Class :character   Class :character    
 Median :2020-11-14   Mode  :character   Mode  :character                  Mode  :character   Mode  :character    
 Mean   :2020-12-22                                                                                               
 3rd Qu.:2021-06-01                                                                                               
 Max.   :2022-06-13                                                                                               
    bairro          distrito_sanitario data_inicio_sintomas   sintomas         outros_sintomas   
 Length:36435       Length:36435       Min.   :2020-01-22   Length:36435       Length:36435      
 Class :character   Class :character   1st Qu.:2020-05-07   Class :character   Class :character  
 Mode  :character   Mode  :character   Median :2020-11-01   Mode  :character   Mode  :character  
                                       Mean   :2020-12-07                                        
                                       3rd Qu.:2021-05-11                                        
                                       Max.   :2022-06-11                                        
 doencas_preexistentes outras_doencas_preexistentes  profissional_saude categoria_profissional classificacao_final
 Length:36435          Length:36435                  Length:36435       Length:36435           Length:36435       
 Class :character      Class :character              Class :character   Class :character       Class :character   
 Mode  :character      Mode  :character              Mode  :character   Mode  :character       Mode  :character   
                                                                                                                  
                                                                                                                  
                                                                                                                  
   evolucao          data_obito       
 Length:36435       Length:36435      
 Class :character   Class :character  
 Mode  :character   Mode  :character  
                                      
                                      
                                      
# Verificar valores nulos
colSums(is.na(df))
             data_notificacao                          sexo                          raca 
                            0                             0                          6329 
                        etnia                         idade          municipio_residencia 
                        36435                             0                             0 
                       bairro            distrito_sanitario          data_inicio_sintomas 
                            0                             0                             0 
                     sintomas               outros_sintomas         doencas_preexistentes 
                         1958                         30461                         22737 
outras_doencas_preexistentes             profissional_saude        categoria_profissional 
                        28627                          9098                         33516 
          classificacao_final                      evolucao                    data_obito 
                            4                           978                         27086 

Limpeza e Engenharia de Dados

# Verificar duplicados
total_duplicados <- sum(duplicated(df))
total_duplicados
[1] 6
# Visualizar duplicados
duplicados <- df[duplicated(df), ]
head(duplicados)

# Remover duplicados
df <- df %>% distinct()
# Tratar datas
df <- df %>%
  mutate(
    data_notificacao = as.Date(data_notificacao, format = "%Y-%m-%d"),
    data_inicio_sintomas = as.Date(data_inicio_sintomas, format = "%Y-%m-%d"),
    data_obito = as.Date(data_obito, format = "%Y-%m-%d")
  )

Separar colunas multivaloradas

df <- df %>%
  mutate(id = row_number())
df_long <- df %>%
  mutate(id_paciente = row_number()) %>%
  mutate(
    sintomas = str_split(sintomas, ","),
    outros_sintomas = str_split(outros_sintomas, ","),
    doencas_preexistentes = str_split(doencas_preexistentes, ",")
  ) %>%
  unnest(sintomas) %>%
  unnest(outros_sintomas) %>%
  unnest(doencas_preexistentes) %>%
  mutate(across(c(sintomas, outros_sintomas, doencas_preexistentes), trimws))
str(df_long)
tibble [170,553 × 20] (S3: tbl_df/tbl/data.frame)
 $ data_notificacao             : Date[1:170553], format: "2020-05-30" "2020-05-30" "2020-05-30" "2020-05-30" ...
 $ sexo                         : chr [1:170553] "Masculino" "Masculino" "Masculino" "Masculino" ...
 $ raca                         : chr [1:170553] "Branca" "Branca" "Branca" "Branca" ...
 $ etnia                        : logi [1:170553] NA NA NA NA NA NA ...
 $ idade                        : chr [1:170553] "53" "53" "53" "53" ...
 $ municipio_residencia         : chr [1:170553] "Recife" "Recife" "Recife" "Recife" ...
 $ bairro                       : chr [1:170553] "SAN MARTIN" "SAN MARTIN" "SAN MARTIN" "SAN MARTIN" ...
 $ distrito_sanitario           : chr [1:170553] "V" "V" "V" "V" ...
 $ data_inicio_sintomas         : Date[1:170553], format: "2020-05-25" "2020-05-25" "2020-05-25" "2020-05-25" ...
 $ sintomas                     : chr [1:170553] "Febre" "Tosse" "Mialgia" "Alteração/perda de olfato e/ou paladar" ...
 $ outros_sintomas              : chr [1:170553] NA NA NA NA ...
 $ doencas_preexistentes        : chr [1:170553] NA NA NA NA ...
 $ outras_doencas_preexistentes : chr [1:170553] NA NA NA NA ...
 $ profissional_saude           : chr [1:170553] NA NA NA NA ...
 $ categoria_profissional       : chr [1:170553] NA NA NA NA ...
 $ classificacao_final          : chr [1:170553] "DESCARTADO" "DESCARTADO" "DESCARTADO" "DESCARTADO" ...
 $ evolucao                     : chr [1:170553] "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" "ISOLAMENTO DOMICILIAR" ...
 $ data_obito                   : Date[1:170553], format: NA NA NA NA ...
 $ id                           : int [1:170553] 1 1 1 1 2 2 2 2 3 3 ...
 $ id_paciente                  : int [1:170553] 1 1 1 1 2 2 2 2 3 3 ...

Visualizações iniciais

library(dplyr)
library(ggplot2)
library(purrr)
library(forcats)

# Identificar colunas categóricas (exceto ID)
colunas_categoricas <- df %>%
  select(where(is.character), where(is.factor)) %>%
  colnames()


# Função: gráfico por ID
plot_por_id <- function(col) {
  
  df_long %>%
    filter(!is.na(.data[[col]]), .data[[col]] != "") %>%
    count(id_paciente, .data[[col]]) %>%
    count(.data[[col]]) %>%  # total de IDs por categoria
    mutate(
      categoria = fct_reorder(.data[[col]], n)
    ) %>%
    ggplot(aes(x = categoria, y = n)) +
    geom_bar(stat = "identity", fill = "#2C7FB8") +
    coord_flip() +
    labs(
      title = paste("Distribuição por ID da coluna:", col),
      x = "Categoria",
      y = "Quantidade de IDs"
    ) +
    theme_minimal(base_size = 12)
}

# Gerar gráficos para cada coluna categórica
plots <- map(colunas_categoricas, plot_por_id)

plots
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]

[[10]]

[[11]]

[[12]]

[[13]]

[[14]]

Visualizando colunas que ficaram com díficil visualização por gráfico


library(dplyr)
library(stringr)
library(purrr)
library(tidyr)

colunas <- c(
  "sintomas",
  "outros_sintomas",
  "doencas_preexistentes",
  "outras_doencas_preexistentes "
)

listar_unicos <- function(df, coluna) {
  df %>%
    select(all_of(coluna)) %>%
    filter(!is.na(.data[[coluna]])) %>%
    # explode valores separados por vírgula
    mutate(valor = str_split(.data[[coluna]], ",")) %>%
    unnest(valor) %>%
    mutate(valor = str_trim(valor)) %>%
    distinct(valor) %>%
    arrange(valor)
}

valores_unicos <- map(colunas, ~ listar_unicos(df, .x))

names(valores_unicos) <- colunas

valores_unicos
DQotLS0NCnRpdGxlOiAiQW7DoWxpc2UgaW5pY2lhbCBkb3MgZGFkb3MgZGUgQ09WSUQiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkocmVhZHIpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KDQpkZiA8LSByZWFkX2RlbGltKA0KICAiY2Fzb3NncmF2ZXNjb3ZpZC5jc3YiLA0KICBkZWxpbSA9ICI7IiwNCiAgcXVvdGUgPSAiIiwNCiAgZXNjYXBlX2JhY2tzbGFzaCA9IEZBTFNFLA0KICBlc2NhcGVfZG91YmxlID0gRkFMU0UsDQogIHNob3dfY29sX3R5cGVzID0gRkFMU0UNCikNCg0KYGBgDQpgYGB7cn0NCiMgIyBFc3RydXR1cmEgaW5pY2lhbA0Kc3RyKGRmKQ0KYGBgDQoNCmBgYHtyfQ0KIyBFc3RhdMOtc3RpY2FzIGdlcmFpcw0Kc3VtbWFyeShkZikNCmBgYA0KYGBge3J9DQojIFZlcmlmaWNhciB2YWxvcmVzIG51bG9zDQpjb2xTdW1zKGlzLm5hKGRmKSkNCmBgYA0KDQojIyBMaW1wZXphIGUgRW5nZW5oYXJpYSBkZSBEYWRvcw0KDQpgYGB7cn0NCiMgVmVyaWZpY2FyIGR1cGxpY2Fkb3MNCnRvdGFsX2R1cGxpY2Fkb3MgPC0gc3VtKGR1cGxpY2F0ZWQoZGYpKQ0KdG90YWxfZHVwbGljYWRvcw0KDQojIFZpc3VhbGl6YXIgZHVwbGljYWRvcw0KZHVwbGljYWRvcyA8LSBkZltkdXBsaWNhdGVkKGRmKSwgXQ0KaGVhZChkdXBsaWNhZG9zKQ0KDQojIFJlbW92ZXIgZHVwbGljYWRvcw0KZGYgPC0gZGYgJT4lIGRpc3RpbmN0KCkNCg0KYGBgDQpgYGB7cn0NCiMgVHJhdGFyIGRhdGFzDQpkZiA8LSBkZiAlPiUNCiAgbXV0YXRlKA0KICAgIGRhdGFfbm90aWZpY2FjYW8gPSBhcy5EYXRlKGRhdGFfbm90aWZpY2FjYW8sIGZvcm1hdCA9ICIlWS0lbS0lZCIpLA0KICAgIGRhdGFfaW5pY2lvX3NpbnRvbWFzID0gYXMuRGF0ZShkYXRhX2luaWNpb19zaW50b21hcywgZm9ybWF0ID0gIiVZLSVtLSVkIiksDQogICAgZGF0YV9vYml0byA9IGFzLkRhdGUoZGF0YV9vYml0bywgZm9ybWF0ID0gIiVZLSVtLSVkIikNCiAgKQ0KYGBgDQoNCiMjIyBTZXBhcmFyIGNvbHVuYXMgbXVsdGl2YWxvcmFkYXMNCg0KYGBge3J9DQpkZiA8LSBkZiAlPiUNCiAgbXV0YXRlKGlkID0gcm93X251bWJlcigpKQ0KYGBgDQoNCmBgYHtyfSANCmRmX2xvbmcgPC0gZGYgJT4lDQogIG11dGF0ZShpZF9wYWNpZW50ZSA9IHJvd19udW1iZXIoKSkgJT4lDQogIG11dGF0ZSgNCiAgICBzaW50b21hcyA9IHN0cl9zcGxpdChzaW50b21hcywgIiwiKSwNCiAgICBvdXRyb3Nfc2ludG9tYXMgPSBzdHJfc3BsaXQob3V0cm9zX3NpbnRvbWFzLCAiLCIpLA0KICAgIGRvZW5jYXNfcHJlZXhpc3RlbnRlcyA9IHN0cl9zcGxpdChkb2VuY2FzX3ByZWV4aXN0ZW50ZXMsICIsIikNCiAgKSAlPiUNCiAgdW5uZXN0KHNpbnRvbWFzKSAlPiUNCiAgdW5uZXN0KG91dHJvc19zaW50b21hcykgJT4lDQogIHVubmVzdChkb2VuY2FzX3ByZWV4aXN0ZW50ZXMpICU+JQ0KICBtdXRhdGUoYWNyb3NzKGMoc2ludG9tYXMsIG91dHJvc19zaW50b21hcywgZG9lbmNhc19wcmVleGlzdGVudGVzKSwgdHJpbXdzKSkNCg0KDQpgYGANCg0KYGBge3J9DQpzdHIoZGZfbG9uZykNCg0KYGBgDQoNCiMjIFZpc3VhbGl6YcOnw7VlcyBpbmljaWFpcw0KDQpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkocHVycnIpDQpsaWJyYXJ5KGZvcmNhdHMpDQoNCiMgSWRlbnRpZmljYXIgY29sdW5hcyBjYXRlZ8OzcmljYXMgKGV4Y2V0byBJRCkNCmNvbHVuYXNfY2F0ZWdvcmljYXMgPC0gZGYgJT4lDQogIHNlbGVjdCh3aGVyZShpcy5jaGFyYWN0ZXIpLCB3aGVyZShpcy5mYWN0b3IpKSAlPiUNCiAgY29sbmFtZXMoKQ0KDQoNCiMgRnVuw6fDo286IGdyw6FmaWNvIHBvciBJRA0KcGxvdF9wb3JfaWQgPC0gZnVuY3Rpb24oY29sKSB7DQogIA0KICBkZl9sb25nICU+JQ0KICAgIGZpbHRlcighaXMubmEoLmRhdGFbW2NvbF1dKSwgLmRhdGFbW2NvbF1dICE9ICIiKSAlPiUNCiAgICBjb3VudChpZF9wYWNpZW50ZSwgLmRhdGFbW2NvbF1dKSAlPiUNCiAgICBjb3VudCguZGF0YVtbY29sXV0pICU+JSAgIyB0b3RhbCBkZSBJRHMgcG9yIGNhdGVnb3JpYQ0KICAgIG11dGF0ZSgNCiAgICAgIGNhdGVnb3JpYSA9IGZjdF9yZW9yZGVyKC5kYXRhW1tjb2xdXSwgbikNCiAgICApICU+JQ0KICAgIGdncGxvdChhZXMoeCA9IGNhdGVnb3JpYSwgeSA9IG4pKSArDQogICAgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIsIGZpbGwgPSAiIzJDN0ZCOCIpICsNCiAgICBjb29yZF9mbGlwKCkgKw0KICAgIGxhYnMoDQogICAgICB0aXRsZSA9IHBhc3RlKCJEaXN0cmlidWnDp8OjbyBwb3IgSUQgZGEgY29sdW5hOiIsIGNvbCksDQogICAgICB4ID0gIkNhdGVnb3JpYSIsDQogICAgICB5ID0gIlF1YW50aWRhZGUgZGUgSURzIg0KICAgICkgKw0KICAgIHRoZW1lX21pbmltYWwoYmFzZV9zaXplID0gMTIpDQp9DQoNCiMgR2VyYXIgZ3LDoWZpY29zIHBhcmEgY2FkYSBjb2x1bmEgY2F0ZWfDs3JpY2ENCnBsb3RzIDwtIG1hcChjb2x1bmFzX2NhdGVnb3JpY2FzLCBwbG90X3Bvcl9pZCkNCg0KcGxvdHMNCg0KYGBgDQoNCiMjIyBWaXN1YWxpemFuZG8gY29sdW5hcyBxdWUgZmljYXJhbSBjb20gZMOtZmljaWwgdmlzdWFsaXphw6fDo28gcG9yIGdyw6FmaWNvDQoNCmBgYHtyfSANCg0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc3RyaW5ncikNCmxpYnJhcnkocHVycnIpDQpsaWJyYXJ5KHRpZHlyKQ0KDQpjb2x1bmFzIDwtIGMoDQogICJzaW50b21hcyIsDQogICJvdXRyb3Nfc2ludG9tYXMiLA0KICAiZG9lbmNhc19wcmVleGlzdGVudGVzIiwNCiAgIm91dHJhc19kb2VuY2FzX3ByZWV4aXN0ZW50ZXMgIg0KKQ0KDQpsaXN0YXJfdW5pY29zIDwtIGZ1bmN0aW9uKGRmLCBjb2x1bmEpIHsNCiAgZGYgJT4lDQogICAgc2VsZWN0KGFsbF9vZihjb2x1bmEpKSAlPiUNCiAgICBmaWx0ZXIoIWlzLm5hKC5kYXRhW1tjb2x1bmFdXSkpICU+JQ0KICAgICMgZXhwbG9kZSB2YWxvcmVzIHNlcGFyYWRvcyBwb3IgdsOtcmd1bGENCiAgICBtdXRhdGUodmFsb3IgPSBzdHJfc3BsaXQoLmRhdGFbW2NvbHVuYV1dLCAiLCIpKSAlPiUNCiAgICB1bm5lc3QodmFsb3IpICU+JQ0KICAgIG11dGF0ZSh2YWxvciA9IHN0cl90cmltKHZhbG9yKSkgJT4lDQogICAgZGlzdGluY3QodmFsb3IpICU+JQ0KICAgIGFycmFuZ2UodmFsb3IpDQp9DQoNCnZhbG9yZXNfdW5pY29zIDwtIG1hcChjb2x1bmFzLCB+IGxpc3Rhcl91bmljb3MoZGYsIC54KSkNCg0KbmFtZXModmFsb3Jlc191bmljb3MpIDwtIGNvbHVuYXMNCg0KdmFsb3Jlc191bmljb3MNCg0KYGBg