EDA with tables

Cargamos la base de datos en R

library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2
# Import the study spreadsheet with readxl.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact OneDrive folder exists.
DBmodR <- read_excel(
  path = "C:/Users/fidel/OneDrive - CINVESTAV/NLPR, LPR, NPR ratio in COVID-19 and DENGUE PAPER/PAPER/3erround/DBmodR.xlsx"
)

# Keep the raw import untouched and do all further work on a copy.
DB <- DBmodR

# Show the column names and types of the working copy.
str(DB)
## tibble [288 x 10] (S3: tbl_df/tbl/data.frame)
##  $ SEXO       : chr [1:288] "F" "M" "F" "F" ...
##  $ EDAD       : num [1:288] 29 33 46 43 90 24 56 29 33 59 ...
##  $ DX         : chr [1:288] "FD" "FD" "FD" "FD" ...
##  $ SEVERITY   : chr [1:288] "Non-severe" "Non-severe" "Non-severe" "Non-severe" ...
##  $ PLAQUETAS  : num [1:288] 44 80 110 205 229 233 189 293 152 280 ...
##  $ LINFOCITOS : num [1:288] 2.26 0.83 0.63 1.33 1.04 0.36 3.55 1.22 1.54 1.03 ...
##  $ NEUTROFILOS: num [1:288] 3.17 2.08 2.11 7.6 3.97 ...
##  $ NPLR       : num [1:288] 3.19 3.13 3.04 2.79 1.67 ...
##  $ NLR        : num [1:288] 1.4 2.51 3.35 5.71 3.82 ...
##  $ LPR        : num [1:288] 19.5 96.4 174.6 154.1 220.2 ...

Hacemos limpieza de la base de datos con tidyverse

Recodificamos las variables DX y SEVERITY

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.8
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Recode the two grouping variables in a single pass:
#   * DX: the codes "FD" and "FHD" are both collapsed into one "Dengue" label;
#   * SEVERITY: the code "NO CRIT" is relabelled "Non-critical".
# Values not listed here are left unchanged by recode().
DB <- DB %>%
  mutate(
    DX = recode(DX, FD = "Dengue", FHD = "Dengue"),
    SEVERITY = recode(SEVERITY, `NO CRIT` = "Non-critical")
  )

EDA

Haremos una tabla con el paquete gtsummary

library(gtsummary)
## #BlackLivesMatter
# Give the columns English, publication-ready names.
#
# Columns are renamed BY NAME rather than by position, so a reordered or
# extended spreadsheet raises an error instead of silently mislabelling
# variables. DX, NPLR, NLR and LPR already carry their final names.
# (SEVERITY was already recoded earlier in the script, so the recode is not
# repeated here.)
DB <- DB %>%
  rename(
    Sex = SEXO,
    `Age (years)` = EDAD,
    Severity = SEVERITY,
    Platelets = PLAQUETAS,
    Lymphocytes = LINFOCITOS,
    Neutrophils = NEUTROFILOS
  )

# Character vector holding the final column names of DB.
DBCLINICR <- colnames(DB)
str(DBCLINICR)
##  chr [1:10] "Sex" "Age (years)" "DX" "Severity" "Platelets" "Lymphocytes" ...
# Descriptive table stratified by diagnosis (DX).
# The age column is not selected, so it does not appear in the table.
DB %>%
  select(Sex, Severity, Platelets, Lymphocytes, Neutrophils, NPLR, NLR, LPR, DX) %>%
  tbl_summary(
    by = DX, # one column per DX group
    statistic = list(
      all_continuous() ~ "{median} ({IQR})", # continuous columns: median (IQR)
      all_categorical() ~ "{n} ({p}%)" # categorical columns: count (percent)
    ),
    digits = all_continuous() ~ 1, # one decimal for continuous statistics
    type = all_categorical() ~ "categorical" # list every level of each categorical column
  ) %>%
  add_p() %>% # between-group p-values
  add_overall() # add the pooled "Overall" column
Characteristic Overall, N = 2881 COVID-19, N = 1051 Dengue, N = 1831 p-value2
Sex <0.001
F 163 (57%) 41 (39%) 122 (67%)
M 125 (43%) 64 (61%) 61 (33%)
Severity <0.001
Non-critical 105 (36%) 105 (100%) 0 (0%)
Non-severe 183 (64%) 0 (0%) 183 (100%)
Platelets 168.5 (159.5) 286.0 (179.0) 125.0 (109.0) <0.001
Lymphocytes 0.9 (0.9) 1.0 (0.8) 0.9 (0.9) 0.7
Neutrophils 3.2 (6.1) 9.0 (6.5) 2.0 (1.9) <0.001
NPLR 2.5 (3.6) 3.2 (4.5) 2.1 (2.9) <0.001
NLR 3.7 (6.7) 8.8 (11.8) 2.1 (2.7) <0.001
LPR 183.6 (228.8) 323.0 (276.4) 155.2 (157.0) <0.001
1 n (%); Median (IQR)
2 Pearson's Chi-squared test; Wilcoxon rank sum test

##Boxplot

# Wide table for plotting: diagnosis plus the six blood-count parameters.
# Columns keep the names they have in DB, so the ratio is labelled "NPLR"
# here exactly as in the summary table (it was previously relabelled "NLPR"
# in this step only).
dbgatc <- DB %>%
  select(DX, Platelets, Lymphocytes, Neutrophils, NPLR, NLR, LPR)

# Long format: one row per (patient, parameter) pair.
#   blodcountpar - parameter name (former column name)
#   level        - measured value
# Rows with a missing value are dropped. pivot_longer() replaces the
# superseded gather(); the set of rows is the same, only their order differs.
DBgat <- dbgatc %>%
  pivot_longer(
    cols = -DX,
    names_to = "blodcountpar",
    values_to = "level",
    values_drop_na = TRUE
  )

## Creamos el boxplot una vez limpia la base de datos

Usaremos el paquete ggplot2 para graficar; además, usaremos ggpubr para agregar los valores de significancia estadística.

#boxplot only 

library(ggplot2)
library(ggpubr)
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
# Text size of the significance labels drawn by stat_compare_means().
your_font_size <- 5

# Intended left-to-right order of the panels. ggplot() silently ignores an
# `order` argument, so the order is imposed through factor levels instead.
param_order <- c("Neutrophils", "Lymphocytes", "Platelets", "NLR", "LPR", "NPLR")

# One free-scale panel per blood-count parameter, Dengue vs COVID-19, with
# the raw observations overlaid and a Wilcoxon significance label per panel.
DBgat %>%
  mutate(
    DX = factor(DX, levels = c("Dengue", "COVID-19")),
    # Known parameters first, in the requested order; any parameter name not
    # listed in param_order is appended rather than turned into NA.
    blodcountpar = factor(
      blodcountpar,
      levels = c(
        intersect(param_order, blodcountpar),
        setdiff(blodcountpar, param_order)
      )
    )
  ) %>%
  ggplot(aes(x = blodcountpar, y = level, fill = DX)) +
  geom_boxplot() +
  # NOTE(review): points are jittered around the panel centre, not dodged by
  # group, and boxplot outliers are also drawn by geom_boxplot() - confirm
  # this is the intended look.
  geom_jitter(shape = 16, position = position_jitter(), alpha = .2) +
  stat_compare_means(
    aes(group = DX),
    method = "wilcox.test",
    label = "p.signif",
    hide.ns = FALSE,
    size = your_font_size
  ) +
  facet_wrap(blodcountpar ~ ., scales = "free") +
  scale_fill_grey(start = 0.9, end = .5) +
  labs(x = "", y = "", fill = "") +
  # The complete theme must come first: placed after theme(), it would reset
  # every tweak below (this is why the x-axis rotation was previously lost).
  theme_classic2() +
  theme(
    text = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.text = element_text(size = 20),
    # Hide the facet strips; the x-axis label already names each panel.
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )