Análisis exploratorio

Se analizará la base de datos del nuevo sistema de ingreso de tickets.

15 de julio , diccionario.

# summary(BD)
# names(BD)

NA’s en la data

Se reemplazan los strings “NULL” por NA y se gráfica para visualizar los campos con missing values de la data.

# BD <- read_csv("~/Ramos/2021-2/Instacrops/OTRS/otrs_state_july2021.csv",col_types = cols(tn = col_character()))
# BD = BD %>%
#   mutate(create_time = as.character(create_time),
#          change_time = as.character(change_time))
# 
# for (i in 1:ncol(BD)) {
#   for (j in 1:nrow(BD)) {
#     BD[j,i] = ifelse(BD[j,i]=="NULL", NA ,BD[j,i])
#   }
# }
# save(BD,file = "~/Ramos/2021-2/Instacrops/OTRS/BD.RData")
# 
load("~/Ramos/2021-2/Instacrops/OTRS/BD.RData")


BD_clean=BD
vis_miss(BD)

Se revisará cada columna para explorar la información contenida y evaluar si es necesario cambiar el formato.

ID

# Revisar si el id es único
sum(unique(BD$id)/unique(BD$id))
## [1] 154
nrow(BD)
## [1] 154

El id es único y completo.

tn

# table(BD$tn)

BD %>%
  select(tn) %>%
  mutate(tn= as.numeric(tn)) %>%
  filter(!is.na(tn)) %>%
  ggplot(aes(tn)) + geom_bar(stat="bin", bins=60)

BD_clean = BD_clean %>%
  mutate(tn= as.numeric(tn))

sum(unique(BD_clean$tn)/unique(BD_clean$tn))
## [1] 154

No hay NA’s, también es único.

title

table(BD$title)[1:8]
## 
##                                 1014, falla de sensor atmosférico 
##                                                                 1 
##                     1020, error en el registro de precipitaciones 
##                                                                 1 
##                      1028, datos de sensor atmosférico con ruidos 
##                                                                 1 
##             1049-1051 - Falla eléctrica en activación de sectores 
##                                                                 1 
## 1052, intermitencia en la señal/caídas en las lecturas de humedad 
##                                                                 1 
##                1062/1063 - Cambio de altura sensores atmosféricos 
##                                                                 1 
##                          1069, dudas en registro de precipitación 
##                                                                 1 
##                        1069, sin registro de velocidad del viento 
##                                                                 1

Se muestran algunos titulos.

queue_id

table(BD$queue_id)
## 
##  6  9 15 16 19 44 45 46 
## 10 20 56 53  1  1  3 10
BD %>%
  select(queue_id) %>%
  count(queue_id, sort=TRUE) %>%
  filter(!is.na(queue_id)) %>%
  mutate(queue_id = as.factor(queue_id)) %>%
  ggplot(aes(n,queue_id)) + geom_col()

BD %>%
  filter(is.na(queue_id))
## # A tibble: 0 x 25
## # ... with 25 variables: id <dbl>, tn <chr>, title <chr>, queue_id <dbl>,
## #   ticket_lock_id <dbl>, type_id <dbl>, service_id <chr>, sla_id <chr>,
## #   user_id <dbl>, responsible_user_id <dbl>, ticket_priority_id <dbl>,
## #   ticket_state_id <dbl>, customer_id <chr>, customer_user_id <chr>,
## #   timeout <dbl>, until_time <dbl>, escalation_time <dbl>,
## #   escalation_update_time <dbl>, escalation_response_time <dbl>,
## #   escalation_solution_time <dbl>, archive_flag <dbl>, create_time <chr>, ...
# BD_clean = BD_clean %>%
#   filter(!is.na(queue_id))

No hay NA’s.

ticket_lock_id

# table(BD$ticket_lock_id)

BD %>%
  select(ticket_lock_id) %>%
  count(ticket_lock_id, sort=TRUE) %>%
  filter(!is.na(ticket_lock_id)) %>%
  mutate(ticket_lock_id = as.factor(ticket_lock_id)) %>%
  ggplot(aes(n,ticket_lock_id)) + geom_col(stat="identity")
## Warning: Ignoring unknown parameters: stat

BD %>%
  filter(is.na(ticket_lock_id))
## # A tibble: 0 x 25
## # ... with 25 variables: id <dbl>, tn <chr>, title <chr>, queue_id <dbl>,
## #   ticket_lock_id <dbl>, type_id <dbl>, service_id <chr>, sla_id <chr>,
## #   user_id <dbl>, responsible_user_id <dbl>, ticket_priority_id <dbl>,
## #   ticket_state_id <dbl>, customer_id <chr>, customer_user_id <chr>,
## #   timeout <dbl>, until_time <dbl>, escalation_time <dbl>,
## #   escalation_update_time <dbl>, escalation_response_time <dbl>,
## #   escalation_solution_time <dbl>, archive_flag <dbl>, create_time <chr>, ...
# BD_clean = BD_clean %>%
#   filter(!is.na(ticket_lock_id))

No hay NA’s.

type_id

# table(BD$type_id)

BD %>%
  select(type_id) %>%
  count(type_id, sort=TRUE) %>%
  filter(!is.na(type_id)) %>%
  mutate(type_id = as.factor(type_id)) %>%
  ggplot(aes(n,type_id)) + geom_col()

BD %>%
  filter(is.na(type_id))
## # A tibble: 0 x 25
## # ... with 25 variables: id <dbl>, tn <chr>, title <chr>, queue_id <dbl>,
## #   ticket_lock_id <dbl>, type_id <dbl>, service_id <chr>, sla_id <chr>,
## #   user_id <dbl>, responsible_user_id <dbl>, ticket_priority_id <dbl>,
## #   ticket_state_id <dbl>, customer_id <chr>, customer_user_id <chr>,
## #   timeout <dbl>, until_time <dbl>, escalation_time <dbl>,
## #   escalation_update_time <dbl>, escalation_response_time <dbl>,
## #   escalation_solution_time <dbl>, archive_flag <dbl>, create_time <chr>, ...
# BD_clean = BD_clean %>%
#   filter(!is.na(type_id))

No hay NA’s.

service_id

# table(BD$service_id)

BD %>%
  select(service_id) %>%
  count(service_id, sort=TRUE) %>%
  filter(!is.na(service_id)) %>%
  mutate(service_id = as.factor(service_id)) %>%
  ggplot(aes(n,service_id)) + geom_col(stat="identity")
## Warning: Ignoring unknown parameters: stat

BD %>%
  filter(is.na(service_id))
## # A tibble: 28 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2435 20210~ 884, ~       46              1       2 <NA>       <NA>         7
##  2  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  3  2442 20210~ 143, ~        9              2       2 <NA>       <NA>        50
##  4  2444 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  5  2445 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  6  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  7  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
##  8  2454 20210~ 1098_~       15              1       2 <NA>       <NA>        42
##  9  2455 20210~ 1097_~       15              1       2 <NA>       <NA>        42
## 10  2456 20210~ IWP_F~       45              1       8 <NA>       <NA>        50
## # ... with 18 more rows, and 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>
# BD_clean = BD_clean %>%
#   filter(!is.na(service_id))

Hay 28 filas con este valor NA.

sla_id

# table(BD$sla_id)

BD %>%
  select(sla_id) %>%
  count(sla_id, sort=TRUE) %>%
  filter(!is.na(sla_id)) %>%
  mutate(sla_id = as.factor(sla_id)) %>%
  ggplot(aes(n,sla_id)) + geom_col()

BD %>%
  filter(is.na(sla_id))
## # A tibble: 48 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2434 20210~ 366, ~       46              1       2 44         <NA>         7
##  2  2435 20210~ 884, ~       46              1       2 <NA>       <NA>         7
##  3  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  4  2442 20210~ 143, ~        9              2       2 <NA>       <NA>        50
##  5  2443 20210~ 805, ~       16              2       2 50         <NA>        46
##  6  2444 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  7  2445 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  8  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  9  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
## 10  2452 20210~ 704, ~       46              1       2 44         <NA>        46
## # ... with 38 more rows, and 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>
# BD_clean = BD_clean %>%
#   filter(!is.na(sla_id))

Hay 48 filas con NA’s.

user_id

# table(BD$user_id)

BD %>%
  select(user_id) %>%
  count(user_id, sort=TRUE) %>%
  #filter(n>20) %>%
  mutate(user_id = as.factor(user_id)) %>%
  ggplot(aes(n,user_id)) + geom_col()

BD %>%
  filter(is.na(user_id))
## # A tibble: 0 x 25
## # ... with 25 variables: id <dbl>, tn <chr>, title <chr>, queue_id <dbl>,
## #   ticket_lock_id <dbl>, type_id <dbl>, service_id <chr>, sla_id <chr>,
## #   user_id <dbl>, responsible_user_id <dbl>, ticket_priority_id <dbl>,
## #   ticket_state_id <dbl>, customer_id <chr>, customer_user_id <chr>,
## #   timeout <dbl>, until_time <dbl>, escalation_time <dbl>,
## #   escalation_update_time <dbl>, escalation_response_time <dbl>,
## #   escalation_solution_time <dbl>, archive_flag <dbl>, create_time <chr>, ...
# BD_clean = BD_clean %>%
#   filter(!is.na(user_id))

No hay NA’s.

responsible_user_id

table(BD$responsible_user_id)
## 
##   1 
## 154

El valor es único.

ticket_priority_id

table(BD$ticket_priority_id)
## 
##   4 
## 154

El valor es único.

ticket_state_id

table(BD$ticket_state_id)
## 
##   2   4 
##  23 131
BD %>%
  select(ticket_state_id) %>%
  count(ticket_state_id, sort=TRUE) %>%
  #filter(n>20) %>%
  mutate(ticket_state_id = as.factor(ticket_state_id)) %>%
  ggplot(aes(n,ticket_state_id)) + geom_col(stat="identity")
## Warning: Ignoring unknown parameters: stat

customer_id

table(BD$customer_id)
## 
##    1   10  100 1005 1006 1008 1010 1012  102  103  104  108  120  121  129  130 
##    1    2    1    1    4    1    2    2    1    1    1    2    1    1    4    1 
##  131  132  134  145  146  148  149   15  154  155   16   17  171  172  175  176 
##    8    5    1    2    1    1    3    1    2    1    2    2    2    2    1    1 
##  179  186  190  192  193  198  201  202  212  215  226   23  230  232  243  245 
##    1    5    2    1    1    2    2    1    1    1    1    2    9    2    1    1 
##  247  248   25  251  256  257  258   26  262  268  269  271  272   33   35   39 
##    3    1    1    1    1    1    2    4    1    2    1    1    2    1    1    3 
##   45   46   47   49   51   59   63   64   89  999 
##    1    1    1    1    2    3    1    3    1    5
BD %>%
  select(customer_id) %>%
  count(customer_id, sort=TRUE) %>%
  filter(n>2) %>%
  mutate(customer_id = as.factor(customer_id)) %>%
  ggplot(aes(n,customer_id)) + geom_col()

BD %>%
  filter(is.na(customer_id))
## # A tibble: 15 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  2  2444 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  3  2445 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  4  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  5  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
##  6  2454 20210~ 1098_~       15              1       2 <NA>       <NA>        42
##  7  2455 20210~ 1097_~       15              1       2 <NA>       <NA>        42
##  8  2456 20210~ IWP_F~       45              1       8 <NA>       <NA>        50
##  9  2457 20210~ id_10~       15              1       8 <NA>       <NA>        42
## 10  2468 20210~ id_11~       44              1       2 <NA>       <NA>        42
## 11  2478 20210~ ID: 1~       15              1       8 <NA>       <NA>        38
## 12  2573 20210~ id_46~       15              1       2 <NA>       <NA>        42
## 13  2574 20210~ ID_39~       15              1       2 <NA>       <NA>        42
## 14  2576 20210~ ID_47~       15              1       2 <NA>       <NA>        42
## 15  2577 20210~ ID_83~       15              1       2 <NA>       <NA>        42
## # ... with 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>

Hay 15 filas con este valor vacio.

customer_user_id

table(BD$customer_user_id)
## 
##                           Agrícola Correa Cox 
##                                             1 
##                           Agricola Del Carmen 
##                                             2 
##                            Agricola del Solar 
##                                             2 
##                         Agrícola Ditzler ltda 
##                                             4 
##                           Agrícola Don Jochen 
##                                             1 
##                           Agrícola El Arrollo 
##                                             1 
##                             Agrícola El Huapi 
##                                             2 
##                      AGRICOLA KAIKEN LIMITADA 
##                                             1 
##                 Agricola Las Vertientes Ltda. 
##                                             5 
##                          AGRÍCOLA LOS ESPINOS 
##                                             1 
##                          Agrícola Los Molinos 
##                                             1 
##                           Agrícola Los Robles 
##                                             1 
##                        Agricola los Zorrillos 
##                                             1 
##                               Agricola Malaga 
##                                             2 
##                     Agricola Mercedes Briceño 
##                                             3 
##                            Agricola Ocoa Ltda 
##                                             1 
##                              Agrícola Panagro 
##                                             2 
##                         Agricola Pimpihue S.A 
##                                             2 
##                             Agricola Pinochet 
##                                             1 
##                            Agricola Ponderosa 
##                                             1 
##                           Agricola Portezuelo 
##                                             1 
##  Agrícola Rafael Tomas Eduardo Undurraga EIRL 
##                                             2 
##                  Agrícola San Carlos Limitada 
##                                             2 
##                      Agrícola San Carlos Ltda 
##                                             1 
##                          AGRICOLA SAN RICARDO 
##                                             1 
##                            Agricola Santa Ana 
##                                             1 
##                       Agrícola Santa Graciela 
##                                             1 
##                 Agrícola Santa Marta de Liray 
##                                             2 
##                             Agricola Valvalle 
##                                             2 
##                         Agrícola Verdani Ltda 
##                                             2 
##                Agrícola y Forestal El Durazno 
##                                             1 
##                      Agrofrutícola Mallarauco 
##                                             1 
##                 Agropecuaria Ferias Araucanía 
##                                             1 
##                                     AGROREYES 
##                                             3 
##                                      bnegroni 
##                                             3 
##                            CER - Felipe Labbé 
##                                             2 
##                  CHARLES WINDELSCHMIDT COLLAR 
##                                             1 
##                         Cliente no encontrado 
##                                             5 
##                                    El Encanto 
##                                             2 
##                                       elhuapi 
##                                             1 
##                           EXPORTADORA POMPEIA 
##                                             2 
##                                      Frutasol 
##                                             4 
##            Frutícola Denisse Salinas Martínez 
##                                             1 
##                            Fruticola el Aromo 
##                                             1 
##                                  Fundo Catemu 
##                                             1 
##                                Jhonson Fruits 
##                                             2 
##                         Juan Henriquez Marich 
##                                             1 
##                                     Las Cebra 
##                                             3 
##                                       Llancay 
##                                             1 
##                                      lmiranda 
##                                             1 
##                             Luis Correa Ponce 
##                                             1 
##                         Luis Miranda Oyanadel 
##                                             1 
##                MARIA CRISTINA BUSTOS GALDAMES 
##                                             1 
##                                        McLean 
##                                             1 
##                                       Monfrut 
##                                             2 
##                         New Farm Forestry Spa 
##                                             2 
##                                   Noble Fruit 
##                                             1 
##                                Pablo Basualto 
##                                             1 
##                                 Pablo Massoud 
##                                             1 
##                    Paulina Gajardo Valenzuela 
##                                             1 
##                                   Providencia 
##                                             3 
##                               SAMUEL BUDINICH 
##                                             1 
##                               Santa Francisca 
##                                             1 
##                                   Santa Lucia 
##                                             1 
## Soc. de Inversiones y servicios pro Ingenieri 
##                                             2 
##               Sociedad Agricola El Radal Ltda 
##                                             1 
##                  SOCIEDAD AGRICOLA FISTUR SPA 
##                                             9 
##                     Sociedad agrícola Jg Ltda 
##                                             8 
##             Sociedad Agricola La Hornilla SPA 
##                                             2 
##                   Sociedad Agricola Mataquito 
##                                             1 
##           Sociedad Agricola Munilque Limitada 
##                                             3 
##             Sociedad Agricola Santa Laura SpA 
##                                             1 
##                                  Tierra Buena 
##                                             1 
##                   Transportes La Martina LTDA 
##                                             4 
##                                           UCM 
##                                             1 
##                          Viña Los Boldos Ltda 
##                                             1
BD %>%
  select(customer_user_id) %>%
  count(customer_user_id, sort=TRUE) %>%
  filter(n>20) %>%
  mutate(customer_user_id = as.factor(customer_user_id)) %>%
  ggplot(aes(n,customer_user_id)) + geom_col(stat="identity")
## Warning: Ignoring unknown parameters: stat

BD %>%
  filter(is.na(customer_user_id))
## # A tibble: 15 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  2  2444 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  3  2445 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  4  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  5  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
##  6  2454 20210~ 1098_~       15              1       2 <NA>       <NA>        42
##  7  2455 20210~ 1097_~       15              1       2 <NA>       <NA>        42
##  8  2456 20210~ IWP_F~       45              1       8 <NA>       <NA>        50
##  9  2457 20210~ id_10~       15              1       8 <NA>       <NA>        42
## 10  2468 20210~ id_11~       44              1       2 <NA>       <NA>        42
## 11  2478 20210~ ID: 1~       15              1       8 <NA>       <NA>        38
## 12  2573 20210~ id_46~       15              1       2 <NA>       <NA>        42
## 13  2574 20210~ ID_39~       15              1       2 <NA>       <NA>        42
## 14  2576 20210~ ID_47~       15              1       2 <NA>       <NA>        42
## 15  2577 20210~ ID_83~       15              1       2 <NA>       <NA>        42
## # ... with 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>
# BD_clean = BD_clean %>%
#   filter(!is.na(customer_user_id))

Hay 913 filas con este valor vacio.

customer_user_id

table(BD$customer_user_id)
## 
##                           Agrícola Correa Cox 
##                                             1 
##                           Agricola Del Carmen 
##                                             2 
##                            Agricola del Solar 
##                                             2 
##                         Agrícola Ditzler ltda 
##                                             4 
##                           Agrícola Don Jochen 
##                                             1 
##                           Agrícola El Arrollo 
##                                             1 
##                             Agrícola El Huapi 
##                                             2 
##                      AGRICOLA KAIKEN LIMITADA 
##                                             1 
##                 Agricola Las Vertientes Ltda. 
##                                             5 
##                          AGRÍCOLA LOS ESPINOS 
##                                             1 
##                          Agrícola Los Molinos 
##                                             1 
##                           Agrícola Los Robles 
##                                             1 
##                        Agricola los Zorrillos 
##                                             1 
##                               Agricola Malaga 
##                                             2 
##                     Agricola Mercedes Briceño 
##                                             3 
##                            Agricola Ocoa Ltda 
##                                             1 
##                              Agrícola Panagro 
##                                             2 
##                         Agricola Pimpihue S.A 
##                                             2 
##                             Agricola Pinochet 
##                                             1 
##                            Agricola Ponderosa 
##                                             1 
##                           Agricola Portezuelo 
##                                             1 
##  Agrícola Rafael Tomas Eduardo Undurraga EIRL 
##                                             2 
##                  Agrícola San Carlos Limitada 
##                                             2 
##                      Agrícola San Carlos Ltda 
##                                             1 
##                          AGRICOLA SAN RICARDO 
##                                             1 
##                            Agricola Santa Ana 
##                                             1 
##                       Agrícola Santa Graciela 
##                                             1 
##                 Agrícola Santa Marta de Liray 
##                                             2 
##                             Agricola Valvalle 
##                                             2 
##                         Agrícola Verdani Ltda 
##                                             2 
##                Agrícola y Forestal El Durazno 
##                                             1 
##                      Agrofrutícola Mallarauco 
##                                             1 
##                 Agropecuaria Ferias Araucanía 
##                                             1 
##                                     AGROREYES 
##                                             3 
##                                      bnegroni 
##                                             3 
##                            CER - Felipe Labbé 
##                                             2 
##                  CHARLES WINDELSCHMIDT COLLAR 
##                                             1 
##                         Cliente no encontrado 
##                                             5 
##                                    El Encanto 
##                                             2 
##                                       elhuapi 
##                                             1 
##                           EXPORTADORA POMPEIA 
##                                             2 
##                                      Frutasol 
##                                             4 
##            Frutícola Denisse Salinas Martínez 
##                                             1 
##                            Fruticola el Aromo 
##                                             1 
##                                  Fundo Catemu 
##                                             1 
##                                Jhonson Fruits 
##                                             2 
##                         Juan Henriquez Marich 
##                                             1 
##                                     Las Cebra 
##                                             3 
##                                       Llancay 
##                                             1 
##                                      lmiranda 
##                                             1 
##                             Luis Correa Ponce 
##                                             1 
##                         Luis Miranda Oyanadel 
##                                             1 
##                MARIA CRISTINA BUSTOS GALDAMES 
##                                             1 
##                                        McLean 
##                                             1 
##                                       Monfrut 
##                                             2 
##                         New Farm Forestry Spa 
##                                             2 
##                                   Noble Fruit 
##                                             1 
##                                Pablo Basualto 
##                                             1 
##                                 Pablo Massoud 
##                                             1 
##                    Paulina Gajardo Valenzuela 
##                                             1 
##                                   Providencia 
##                                             3 
##                               SAMUEL BUDINICH 
##                                             1 
##                               Santa Francisca 
##                                             1 
##                                   Santa Lucia 
##                                             1 
## Soc. de Inversiones y servicios pro Ingenieri 
##                                             2 
##               Sociedad Agricola El Radal Ltda 
##                                             1 
##                  SOCIEDAD AGRICOLA FISTUR SPA 
##                                             9 
##                     Sociedad agrícola Jg Ltda 
##                                             8 
##             Sociedad Agricola La Hornilla SPA 
##                                             2 
##                   Sociedad Agricola Mataquito 
##                                             1 
##           Sociedad Agricola Munilque Limitada 
##                                             3 
##             Sociedad Agricola Santa Laura SpA 
##                                             1 
##                                  Tierra Buena 
##                                             1 
##                   Transportes La Martina LTDA 
##                                             4 
##                                           UCM 
##                                             1 
##                          Viña Los Boldos Ltda 
##                                             1
BD %>%
  select(customer_user_id) %>%
  count(customer_user_id, sort=TRUE) %>%
  filter(n>2) %>%
  mutate(customer_user_id = as.factor(customer_user_id)) %>%
  ggplot(aes(n,customer_user_id)) + geom_col()

BD %>%
  filter(is.na(customer_user_id))
## # A tibble: 15 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  2  2444 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  3  2445 20210~ ID: 1~       15              1       8 <NA>       <NA>        48
##  4  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  5  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
##  6  2454 20210~ 1098_~       15              1       2 <NA>       <NA>        42
##  7  2455 20210~ 1097_~       15              1       2 <NA>       <NA>        42
##  8  2456 20210~ IWP_F~       45              1       8 <NA>       <NA>        50
##  9  2457 20210~ id_10~       15              1       8 <NA>       <NA>        42
## 10  2468 20210~ id_11~       44              1       2 <NA>       <NA>        42
## 11  2478 20210~ ID: 1~       15              1       8 <NA>       <NA>        38
## 12  2573 20210~ id_46~       15              1       2 <NA>       <NA>        42
## 13  2574 20210~ ID_39~       15              1       2 <NA>       <NA>        42
## 14  2576 20210~ ID_47~       15              1       2 <NA>       <NA>        42
## 15  2577 20210~ ID_83~       15              1       2 <NA>       <NA>        42
## # ... with 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>
# BD_clean = BD_clean %>%
#   filter(!is.na(customer_user_id))

Hay 15 filas con NA.

timeout

#table(BD$timeout)

BD %>%
  select(timeout) %>%
  mutate(timeout= as.numeric(timeout)) %>%
  filter(!is.na(timeout)) %>%
  ggplot(aes(timeout)) + geom_bar(stat="bin", bins=60)

Hay 40 filas con valor igual a 0. ¿En qué formato está?

until_time

table(BD$until_time)
## 
##   0 
## 154
BD %>%
  filter(until_time!=0)
## # A tibble: 0 x 25
## # ... with 25 variables: id <dbl>, tn <chr>, title <chr>, queue_id <dbl>,
## #   ticket_lock_id <dbl>, type_id <dbl>, service_id <chr>, sla_id <chr>,
## #   user_id <dbl>, responsible_user_id <dbl>, ticket_priority_id <dbl>,
## #   ticket_state_id <dbl>, customer_id <chr>, customer_user_id <chr>,
## #   timeout <dbl>, until_time <dbl>, escalation_time <dbl>,
## #   escalation_update_time <dbl>, escalation_response_time <dbl>,
## #   escalation_solution_time <dbl>, archive_flag <dbl>, create_time <chr>, ...

Hay solo 5 filas con valores distintos de 0.

escalation_time

#table(BD$escalation_time)

BD %>%
  filter(escalation_time!=0)
## # A tibble: 35 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  2  2442 20210~ 143, ~        9              2       2 <NA>       <NA>        50
##  3  2443 20210~ 805, ~       16              2       2 50         <NA>        46
##  4  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  5  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
##  6  2453 20210~ 914, ~       16              2       2 50         <NA>        46
##  7  2457 20210~ id_10~       15              1       8 <NA>       <NA>        42
##  8  2466 20210~ 254, ~       15              2       8 50         <NA>        50
##  9  2467 20210~ 857, ~        9              2       2 50         <NA>        50
## 10  2470 20210~ 1052,~       16              2       2 50         <NA>        46
## # ... with 25 more rows, and 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>

Hay 487 filas con valores distintos de 0.

escalation_update_time

#table(BD$escalation_update_time)

BD %>%
  filter(escalation_update_time!=0)
## # A tibble: 35 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  2  2442 20210~ 143, ~        9              2       2 <NA>       <NA>        50
##  3  2443 20210~ 805, ~       16              2       2 50         <NA>        46
##  4  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  5  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
##  6  2453 20210~ 914, ~       16              2       2 50         <NA>        46
##  7  2457 20210~ id_10~       15              1       8 <NA>       <NA>        42
##  8  2466 20210~ 254, ~       15              2       8 50         <NA>        50
##  9  2467 20210~ 857, ~        9              2       2 50         <NA>        50
## 10  2470 20210~ 1052,~       16              2       2 50         <NA>        46
## # ... with 25 more rows, and 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>

Hay 486 variables con valores distintos a 0.

escalation_response_time

#table(BD$escalation_response_time)

BD %>%
  filter(escalation_response_time!=0)
## # A tibble: 1 x 25
##      id tn      title  queue_id ticket_lock_id type_id service_id sla_id user_id
##   <dbl> <chr>   <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
## 1  2453 202107~ 914, ~       16              2       2 50         <NA>        46
## # ... with 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>

Hay 66 filas con valores distintos de 0.

escalation_solution_time

#table(BD$escalation_solution_time)

BD %>%
  filter(escalation_solution_time!=0)
## # A tibble: 35 x 25
##       id tn     title  queue_id ticket_lock_id type_id service_id sla_id user_id
##    <dbl> <chr>  <chr>     <dbl>          <dbl>   <dbl> <chr>      <chr>    <dbl>
##  1  2441 20210~ ID: 1~       15              1       8 <NA>       <NA>        46
##  2  2442 20210~ 143, ~        9              2       2 <NA>       <NA>        50
##  3  2443 20210~ 805, ~       16              2       2 50         <NA>        46
##  4  2446 20210~ id_62~       16              2       2 <NA>       <NA>        46
##  5  2447 20210~ ID_50~       19              2       2 <NA>       <NA>        42
##  6  2453 20210~ 914, ~       16              2       2 50         <NA>        46
##  7  2457 20210~ id_10~       15              1       8 <NA>       <NA>        42
##  8  2466 20210~ 254, ~       15              2       8 50         <NA>        50
##  9  2467 20210~ 857, ~        9              2       2 50         <NA>        50
## 10  2470 20210~ 1052,~       16              2       2 50         <NA>        46
## # ... with 25 more rows, and 16 more variables: responsible_user_id <dbl>,
## #   ticket_priority_id <dbl>, ticket_state_id <dbl>, customer_id <chr>,
## #   customer_user_id <chr>, timeout <dbl>, until_time <dbl>,
## #   escalation_time <dbl>, escalation_update_time <dbl>,
## #   escalation_response_time <dbl>, escalation_solution_time <dbl>,
## #   archive_flag <dbl>, create_time <chr>, create_by <dbl>, change_time <chr>,
## #   change_by <dbl>

Hay que descubrir en que unidad esta el tiempo en esta columna.

archive_flag

table(BD$archive_flag)
## 
##   0 
## 154
BD %>% 
  filter(archive_flag!=0)
## # A tibble: 0 x 25
## # ... with 25 variables: id <dbl>, tn <chr>, title <chr>, queue_id <dbl>,
## #   ticket_lock_id <dbl>, type_id <dbl>, service_id <chr>, sla_id <chr>,
## #   user_id <dbl>, responsible_user_id <dbl>, ticket_priority_id <dbl>,
## #   ticket_state_id <dbl>, customer_id <chr>, customer_user_id <chr>,
## #   timeout <dbl>, until_time <dbl>, escalation_time <dbl>,
## #   escalation_update_time <dbl>, escalation_response_time <dbl>,
## #   escalation_solution_time <dbl>, archive_flag <dbl>, create_time <chr>, ...

Existe solo un valor distinto de 0 que tiene id 2281.

create_time

#table(BD$create_time)

BD %>%
  select(create_time) %>%
  mutate(create_time =  as.Date(create_time, format="%d-%m-%Y %H:%M")) %>%
  #mutate(create_time = round_date(create_time,unit="week"))  %>%
  ggplot(aes(x=create_time)) + geom_bar(stat="bin")+scale_x_date(date_labels = "%m-%Y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 154 rows containing non-finite values (stat_bin).

BD_clean = BD_clean %>%
  mutate(create_time = as.Date(create_time, format="%d-%m-%Y %H:%M"))

create_by

table(BD$create_by)
## 
##  5  7  8 42 45 46 48 
##  8 65 48 11 17  4  1
BD %>%
  select(create_by) %>%
  mutate(create_by = as.factor(create_by)) %>%
  ggplot(aes(x=create_by)) + geom_bar(stat="count")

BD_clean = BD_clean %>%
  filter(!is.na(create_by))

change_time

#table(BD$change_time)

BD %>%
  select(change_time) %>%
  mutate(change_time =  as.Date(change_time, format="%d-%m-%Y %H:%M")) %>%
  #mutate(change_time = round_date(change_time,unit="week"))  %>%
  ggplot(aes(x=change_time)) + geom_bar(stat="bin")+scale_x_date(date_labels = "%m-%Y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 154 rows containing non-finite values (stat_bin).

BD_clean = BD_clean %>%
  mutate(change_time = as.Date(change_time, format="%d-%m-%Y %H:%M"))

change_by

table(BD$change_by)
## 
##  5  7  8 38 42 45 46 48 50 
##  5 23 28  5  7  9 10 65  2
BD %>%
  select(change_by) %>%
  mutate(change_by = as.factor(change_by)) %>%
  ggplot(aes(x=change_by)) + geom_bar(stat="count")

BD_clean = BD_clean %>%
  filter(!is.na(change_by))

Se ve que la columna toma valores del 1 al 50, existiendo algunos más frecuentes.

X26

BD_clean$X26 = NULL