Author
Affiliation

Eric

Ultragenyx

Published

November 20, 2024

Keywords

EDA, Explore

Code
# Session setup: attach the rendering and plotting packages used below.
# The workspace is intentionally not cleared with rm(list = ls()): that call
# deletes the caller's objects when the code is run interactively, yet it does
# not detach packages or reset options, so it never yields a clean session.
library(tinytex)
library(ggplot2)
library(plotly)

library(haven)
library(labelled)
library(bslib)
library(DT)
library(r2rtf)
library(metalite)
library(metalite.table1)
library(haven)
library(tidyr)
library(tools)
library(gtsummary)
library(stringr)
library(RColorBrewer)
library(flextable)
library(stringr)
library(tern)
library(tidyverse)
library(vistime)
library(readxl)
library(hrbrthemes)
library(rlang)
library(lubridate)
library(fuzzyjoin)
library(report)
library(nnet)
library(MASS)
library(dplyr)

library(SmartEDA)
library(summarytools)
library(skimr)
library(ggstatsplot)
library(PerformanceAnalytics)
library(dlookr)
Code
# Working copy of the ggplot2 example data set in the global environment.
diamonds <- ggplot2::diamonds

# SmartEDA plots for the categorical columns, laid out on a 1 x 3 page.
ExpCatViz(data = diamonds, Page = c(1, 3))
$`0`

Code
# Enlarge base-graphics text: main title, axis labels, axis tick labels.
# NOTE(review): par() only governs base graphics; if ExpNumViz() draws with
# ggplot2 these sizes will not reach its output -- confirm.
par(list(cex.main = 1.5, cex.lab = 1.3, cex.axis = 1.2))

# Numeric columns of diamonds split by cut, laid out on a 2 x 4 page.
ExpNumViz(data = diamonds, target = "cut", Page = c(2, 4))
$`0`

Code
# Per-variable numeric profile: sign counts, missingness, location, spread,
# skewness and kurtosis for every numeric column of diamonds.
ExpNumStat(data = diamonds)
  Vname Group    TN nNeg nZero  nPos NegInf PosInf NA_Value Per_of_Missing
1 carat   All 53940    0     0 53940      0      0        0              0
2 depth   All 53940    0     0 53940      0      0        0              0
4 price   All 53940    0     0 53940      0      0        0              0
3 table   All 53940    0     0 53940      0      0        0              0
5     x   All 53940    0     8 53932      0      0        0              0
6     y   All 53940    0     7 53933      0      0        0              0
7     z   All 53940    0    20 53920      0      0        0              0
           sum   min      max     mean  median       SD    CV     IQR Skewness
1     43040.87   0.2     5.01    0.798    0.70    0.474 0.594    0.64    1.117
2   3330762.90  43.0    79.00   61.749   61.80    1.433 0.023    1.50   -0.082
4 212135217.00 326.0 18823.00 3932.800 2401.00 3989.440 1.014 4374.25    1.618
3   3099240.50  43.0    95.00   57.457   57.00    2.234 0.039    3.00    0.797
5    309138.62   0.0    10.74    5.731    5.70    1.122 0.196    1.83    0.379
6    309320.33   0.0    58.90    5.735    5.71    1.142 0.199    1.82    2.434
7    190879.30   0.0    31.80    3.539    3.53    0.706 0.199    1.13    1.522
  Kurtosis
1    1.256
2    5.739
4    2.177
3    2.801
5   -0.618
6   91.206
7   47.082
Code
# Association of each remaining variable with cut (chi-squared, Cramer's V).
# NOTE(review): this call emits "NAs introduced by coercion" and reports
# IV Value 0 / df NA on every row; check whether the information-value
# columns are meaningful for a five-level target such as cut.
ExpCatStat(data = diamonds, Target = "cut")
Warning in FUN(X[[i]], ...): NAs introduced by coercion
  Variable Target Unique Chi-squared p-value df IV Value Cramers V
1    color    cut      7     310.318       0 NA        0      0.04
2  clarity    cut      8    4391.398       0 NA        0      0.14
3    carat    cut     10    4024.470       0 NA        0      0.14
4    depth    cut     10   33641.709       0 NA        0      0.39
5    table    cut      7   28971.571       0 NA        0      0.37
6    price    cut     10    3010.290       0 NA        0      0.12
7        x    cut     10    4851.224       0 NA        0      0.15
8        y    cut     10    4333.643       0 NA        0      0.14
9        z    cut     10    3556.788       0 NA        0      0.13
  Degree of Association Predictive Power
1             Very Weak   Not Predictive
2                  Weak   Not Predictive
3                  Weak   Not Predictive
4                Strong   Not Predictive
5                Strong   Not Predictive
6                  Weak   Not Predictive
7                  Weak   Not Predictive
8                  Weak   Not Predictive
9                  Weak   Not Predictive
Code
# Build the summarytools data-frame summary of diamonds, then print it.
# Stored as diamonds_summary rather than `summary` so the name of the base
# summary() generic is not shadowed in the workspace.
diamonds_summary <- dfSummary(diamonds)
diamonds_summary
Data Frame Summary  
diamonds  
Dimensions: 53940 x 10  
Duplicates: 146  

---------------------------------------------------------------------------------------------------------------------
No   Variable            Stats / Values                Freqs (% of Valid)      Graph             Valid      Missing  
---- ------------------- ----------------------------- ----------------------- ----------------- ---------- ---------
1    carat               Mean (sd) : 0.8 (0.5)         273 distinct values     :                 53940      0        
     [numeric]           min < med < max:                                      : .               (100.0%)   (0.0%)   
                         0.2 < 0.7 < 5                                         : :                                   
                         IQR (CV) : 0.6 (0.6)                                  : : .                                 
                                                                               : : : .                               

2    cut                 1. Fair                        1610 ( 3.0%)                             53940      0        
     [ordered, factor]   2. Good                        4906 ( 9.1%)           I                 (100.0%)   (0.0%)   
                         3. Very Good                  12082 (22.4%)           IIII                                  
                         4. Premium                    13791 (25.6%)           IIIII                                 
                         5. Ideal                      21551 (40.0%)           IIIIIII                               

3    color               1. D                           6775 (12.6%)           II                53940      0        
     [ordered, factor]   2. E                           9797 (18.2%)           III               (100.0%)   (0.0%)   
                         3. F                           9542 (17.7%)           III                                   
                         4. G                          11292 (20.9%)           IIII                                  
                         5. H                           8304 (15.4%)           III                                   
                         6. I                           5422 (10.1%)           II                                    
                         7. J                           2808 ( 5.2%)           I                                     

4    clarity             1. I1                           741 ( 1.4%)                             53940      0        
     [ordered, factor]   2. SI2                         9194 (17.0%)           III               (100.0%)   (0.0%)   
                         3. SI1                        13065 (24.2%)           IIII                                  
                         4. VS2                        12258 (22.7%)           IIII                                  
                         5. VS1                         8171 (15.1%)           III                                   
                         6. VVS2                        5066 ( 9.4%)           I                                     
                         7. VVS1                        3655 ( 6.8%)           I                                     
                         8. IF                          1790 ( 3.3%)                                                 

5    depth               Mean (sd) : 61.7 (1.4)        184 distinct values               :       53940      0        
     [numeric]           min < med < max:                                                :       (100.0%)   (0.0%)   
                         43 < 61.8 < 79                                                  :                           
                         IQR (CV) : 1.5 (0)                                            . :                           
                                                                                       : :                           

6    table               Mean (sd) : 57.5 (2.2)        127 distinct values         :             53940      0        
     [numeric]           min < med < max:                                          :             (100.0%)   (0.0%)   
                         43 < 57 < 95                                              :                                 
                         IQR (CV) : 3 (0)                                          : :                               
                                                                                   : :                               

7    price               Mean (sd) : 3932.8 (3989.4)   11602 distinct values   :                 53940      0        
     [integer]           min < med < max:                                      :                 (100.0%)   (0.0%)   
                         326 < 2401 < 18823                                    :                                     
                         IQR (CV) : 4374.2 (1)                                 : : .                                 
                                                                               : : : : . . .                         

8    x                   Mean (sd) : 5.7 (1.1)         554 distinct values             :         53940      0        
     [numeric]           min < med < max:                                              : .       (100.0%)   (0.0%)   
                         0 < 5.7 < 10.7                                                : : :                         
                         IQR (CV) : 1.8 (0.2)                                          : : :                         
                                                                                     . : : : :                       

9    y                   Mean (sd) : 5.7 (1.1)         552 distinct values     :                 53940      0        
     [numeric]           min < med < max:                                      : :               (100.0%)   (0.0%)   
                         0 < 5.7 < 58.9                                        : :                                   
                         IQR (CV) : 1.8 (0.2)                                  : :                                   
                                                                               : :                                   

10   z                   Mean (sd) : 3.5 (0.7)         375 distinct values       :               53940      0        
     [numeric]           min < med < max:                                        :               (100.0%)   (0.0%)   
                         0 < 3.5 < 31.8                                        : :                                   
                         IQR (CV) : 1.1 (0.2)                                  : :                                   
                                                                               : :                                   
---------------------------------------------------------------------------------------------------------------------
Code
# skimr overview of diamonds: one table per column type (factor, numeric).
# Called directly rather than through a pipe so the data name is reported
# as "diamonds".
skimr::skim(data = diamonds)
Data summary
Name diamonds
Number of rows 53940
Number of columns 10
_______________________
Column type frequency:
factor 3
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
cut 0 1 TRUE 5 Ide: 21551, Pre: 13791, Ver: 12082, Goo: 4906
color 0 1 TRUE 7 G: 11292, E: 9797, F: 9542, H: 8304
clarity 0 1 TRUE 8 SI1: 13065, VS2: 12258, SI2: 9194, VS1: 8171

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
carat 0 1 0.80 0.47 0.2 0.40 0.70 1.04 5.01 ▇▂▁▁▁
depth 0 1 61.75 1.43 43.0 61.00 61.80 62.50 79.00 ▁▁▇▁▁
table 0 1 57.46 2.23 43.0 56.00 57.00 59.00 95.00 ▁▇▁▁▁
price 0 1 3932.80 3989.44 326.0 950.00 2401.00 5324.25 18823.00 ▇▂▁▁▁
x 0 1 5.73 1.12 0.0 4.71 5.70 6.54 10.74 ▁▁▇▃▁
y 0 1 5.73 1.14 0.0 4.72 5.71 6.54 58.90 ▇▁▁▁▁
z 0 1 3.54 0.71 0.0 2.91 3.53 4.04 31.80 ▇▁▁▁▁
Code
library(cardx)
Warning: package 'cardx' was built under R version 4.4.2
Code
# Summarise every mtcars variable by the two levels of am and add p-values.
# The continuous variables contain ties, so wilcox.test() cannot compute exact
# p-values or confidence intervals and warns twice per variable before falling
# back to the normal approximation. Requesting exact = FALSE up front selects
# that same approximation without the warnings.
tbl_summary(mtcars, by = am) %>%
  add_p(test.args = all_tests("wilcox.test") ~ list(exact = FALSE))
The following warnings were returned during `add_p()`:
! For variable `disp` (`am`) and "estimate", "statistic", "p.value",
  "conf.low", and "conf.high" statistics: cannot compute exact p-value with
  ties
! For variable `disp` (`am`) and "estimate", "statistic", "p.value",
  "conf.low", and "conf.high" statistics: cannot compute exact confidence
  intervals with ties
! For variable `drat` (`am`) and "estimate", "statistic", "p.value",
  "conf.low", and "conf.high" statistics: cannot compute exact p-value with
  ties
! For variable `drat` (`am`) and "estimate", "statistic", "p.value",
  "conf.low", and "conf.high" statistics: cannot compute exact confidence
  intervals with ties
! For variable `hp` (`am`) and "estimate", "statistic", "p.value", "conf.low",
  and "conf.high" statistics: cannot compute exact p-value with ties
! For variable `hp` (`am`) and "estimate", "statistic", "p.value", "conf.low",
  and "conf.high" statistics: cannot compute exact confidence intervals with
  ties
! For variable `mpg` (`am`) and "estimate", "statistic", "p.value", "conf.low",
  and "conf.high" statistics: cannot compute exact p-value with ties
! For variable `mpg` (`am`) and "estimate", "statistic", "p.value", "conf.low",
  and "conf.high" statistics: cannot compute exact confidence intervals with
  ties
! For variable `qsec` (`am`) and "estimate", "statistic", "p.value",
  "conf.low", and "conf.high" statistics: cannot compute exact p-value with
  ties
! For variable `qsec` (`am`) and "estimate", "statistic", "p.value",
  "conf.low", and "conf.high" statistics: cannot compute exact confidence
  intervals with ties
! For variable `wt` (`am`) and "estimate", "statistic", "p.value", "conf.low",
  and "conf.high" statistics: cannot compute exact p-value with ties
! For variable `wt` (`am`) and "estimate", "statistic", "p.value", "conf.low",
  and "conf.high" statistics: cannot compute exact confidence intervals with
  ties
Characteristic 0
N = 191
1
N = 131
p-value2
mpg 17.3 (14.7, 19.2) 22.8 (21.0, 30.4) 0.002
cyl

0.009
    4 3 (16%) 8 (62%)
    6 4 (21%) 3 (23%)
    8 12 (63%) 2 (15%)
disp 276 (168, 360) 120 (79, 160) <0.001
hp 175 (110, 205) 109 (66, 113) 0.046
drat 3.15 (3.07, 3.70) 4.08 (3.85, 4.22) <0.001
wt 3.52 (3.44, 3.85) 2.32 (1.94, 2.78) <0.001
qsec 17.82 (17.05, 19.44) 17.02 (16.46, 18.61) 0.3
vs 7 (37%) 7 (54%) 0.3
gear

<0.001
    3 15 (79%) 0 (0%)
    4 4 (21%) 8 (62%)
    5 0 (0%) 5 (38%)
carb

0.3
    1 3 (16%) 4 (31%)
    2 6 (32%) 4 (31%)
    3 3 (16%) 0 (0%)
    4 7 (37%) 3 (23%)
    6 0 (0%) 1 (7.7%)
    8 0 (0%) 1 (7.7%)
1 Median (Q1, Q3); n (%)
2 Wilcoxon rank sum test; Fisher’s exact test; Pearson’s Chi-squared test
Code
# Correlation-matrix plot of iris via ggstatsplot.
ggstatsplot::ggcorrmat(data = iris)

Code
# Kendall correlation chart for iris with the Species column dropped.
iris %>%
  dplyr::select(-Species) %>%
  PerformanceAnalytics::chart.Correlation(method = "kendall")
Warning in par(usr): argument 1 does not name a graphical parameter
Warning in par(usr): argument 1 does not name a graphical parameter
Warning in par(usr): argument 1 does not name a graphical parameter
Warning in par(usr): argument 1 does not name a graphical parameter
Warning in par(usr): argument 1 does not name a graphical parameter
Warning in par(usr): argument 1 does not name a graphical parameter

Code
# Generate the dlookr EDA web report for diamonds with cut as the target.
# `cut` is passed unquoted; presumably it is resolved as the column name
# rather than base::cut() -- confirm.
# NOTE(review): output_format is documented for eda_paged_report(); verify
# that eda_web_report() actually uses it.
diamonds %>%
  eda_web_report(target = cut, output_format = "html")

  |                                                        
  |                                                  |   0%
  |                                                        
  |.                                                 |   2%                    
  |                                                        
  |..                                                |   3% [setup]            
  |                                                        
  |..                                                |   5%                    
  |                                                        
  |...                                               |   6% [load_packages]    
  |                                                        
  |....                                              |   8%                    
  |                                                        
  |.....                                             |  10% [unnamed-chunk-26] 
  |                                                        
  |......                                            |  11%                    
  |                                                        
  |......                                            |  13% [udf]              
  |                                                        
  |.......                                           |  14%                    
  |                                                        
  |........                                          |  16% [check_variables]  
  |                                                        
  |.........                                         |  17%                    
  |                                                        
  |..........                                        |  19% [create-overview]  
  |                                                        
  |..........                                        |  21%                    
  |                                                        
  |...........                                       |  22% [overview]         
  |                                                        
  |............                                      |  24%                    
  |                                                        
  |.............                                     |  25% [overview-pre]     
  |                                                        
  |.............                                     |  27%                    
  |                                                        
  |..............                                    |  29% [unnamed-chunk-27] 
  |                                                        
  |...............                                   |  30%                    
  |                                                        
  |................                                  |  32% [unnamed-chunk-28] 
  |                                                        
  |.................                                 |  33%                    
  |                                                        
  |.................                                 |  35% [variables]        

  |                                                        
  |..................                                |  37%                    
  |                                                        
  |...................                               |  38% [normality]        
  |                                                        
  |....................                              |  40%                    
  |                                                        
  |.....................                             |  41% [normality-list]   

  |                                                        
  |.....................                             |  43%                    
  |                                                        
  |......................                            |  44% [unnamed-chunk-29] 
  |                                                        
  |.......................                           |  46%                    
  |                                                        
  |........................                          |  48% [unnamed-chunk-30] 
  |                                                        
  |.........................                         |  49%                    
  |                                                        
  |.........................                         |  51% [compare_numerical]

  |                                                        
  |..........................                        |  52%                    
  |                                                        
  |...........................                       |  54% [unnamed-chunk-31] 
  |                                                        
  |............................                      |  56%                    
  |                                                        
  |.............................                     |  57% [compare-category] 
  |                                                        
  |.............................                     |  59%                    
  |                                                        
  |..............................                    |  60% [unnamed-chunk-32] 
  |                                                        
  |...............................                   |  62%                    
  |                                                        
  |................................                  |  63% [unnamed-chunk-33] 
  |                                                        
  |.................................                 |  65%                    
  |                                                        
  |.................................                 |  67% [unnamed-chunk-34] 
  |                                                        
  |..................................                |  68%                    
  |                                                        
  |...................................               |  70% [correlation]      
  |                                                        
  |....................................              |  71%                    
  |                                                        
  |.....................................             |  73% [unnamed-chunk-35] 
  |                                                        
  |.....................................             |  75%                    
  |                                                        
  |......................................            |  76% [plot-correlation] 

  |                                                        
  |.......................................           |  78%                    
  |                                                        
  |........................................          |  79% [unnamed-chunk-36] 
  |                                                        
  |........................................          |  81%                    
  |                                                        
  |.........................................         |  83% [unnamed-chunk-37] 
  |                                                        
  |..........................................        |  84%                    
  |                                                        
  |...........................................       |  86% [group-numerical]  

  |                                                        
  |............................................      |  87%                    
  |                                                        
  |............................................      |  89% [unnamed-chunk-38] 
  |                                                        
  |.............................................     |  90%                    
  |                                                        
  |..............................................    |  92% [group-categorical]

  |                                                        
  |...............................................   |  94%                    
  |                                                        
  |................................................  |  95% [unnamed-chunk-39] 
  |                                                        
  |................................................  |  97%                    
  |                                                        
  |................................................. |  98% [group-correlation]

  |                                                        
  |..................................................| 100%                    
                                                                                                              
"C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS eda_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc71cc6994bad.html --lua-filter "C:\PROGRA~1\R\R-44~1.1\library\RMARKD~1\RMARKD~1\lua\PAGEBR~1.LUA" --lua-filter "C:\PROGRA~1\R\R-44~1.1\library\RMARKD~1\RMARKD~1\lua\LATEX-~1.LUA" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\PROGRA~1\R\R-44~1.1\library\RMARKD~1\rmd\h\DEFAUL~1.HTM" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:\PROGRA~1\R\R-44~1.1\library\dlookr\RESOUR~1\DLOOKR~1.CSS" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\echen\AppData\Local\Temp\4\RtmpYfxnrR\rmarkdown-str71cc74bf7714.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\PROGRA~1\R\R-44~1.1\library\dlookr\RESOUR~1\FOOTER~1.HTM" 
Code
# dlookr::diagnose_paged_report(diamonds)
# dlookr::diagnose_web_report(diamonds)
Code
# transformation_web_report(airquality)
Code
# Descriptive statistics and percentiles for the numeric iris columns,
# rendered as a flextable.
iris %>%
  dlookr::describe() %>%
  flextable::flextable()

described_variables

n

na

mean

sd

se_mean

IQR

skewness

kurtosis

p00

p01

p05

p10

p20

p25

p30

p40

p50

p60

p70

p75

p80

p90

p95

p99

p100

Sepal.Length

150

0

5.843333

0.8280661

0.06761132

1.3

0.3149110

-0.552064

4.3

4.400

4.600

4.8

5.0

5.1

5.27

5.60

5.80

6.10

6.3

6.4

6.52

6.90

7.255

7.700

7.9

Sepal.Width

150

0

3.057333

0.4358663

0.03558833

0.5

0.3189657

0.228249

2.0

2.200

2.345

2.5

2.7

2.8

2.80

3.00

3.00

3.10

3.2

3.3

3.40

3.61

3.800

4.151

4.4

Petal.Length

150

0

3.758000

1.7652982

0.14413600

3.5

-0.2748842

-1.402103

1.0

1.149

1.300

1.4

1.5

1.6

1.70

3.90

4.35

4.64

5.0

5.1

5.32

5.80

6.100

6.700

6.9

Petal.Width

150

0

1.199333

0.7622377

0.06223645

1.5

-0.1029667

-1.340604

0.1

0.100

0.200

0.2

0.2

0.3

0.40

1.16

1.30

1.50

1.8

1.8

1.90

2.20

2.300

2.500

2.5

Code
# Pareto chart of missing values in airquality, limited to the variables
# that contain NAs. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
plot_na_pareto(airquality, only_na = TRUE)

Code
# Impute outliers in carat with the "capping" method, then plot the result.
diamonds %>%
  imputate_outlier(carat, method = "capping") %>%
  plot()

Code
#| column: screen
# The `#|` cell option above must come before any code: placed after the
# first statement it is treated as an ordinary comment and the screen-width
# column layout is never applied.
library(dataxray)
diamonds %>%
  make_xray() %>%
  view_xray()

Citation

BibTeX citation:
@online{2024,
  author = {Eric},
  title = {Explore {Data}},
  date = {2024-11-20},
  langid = {en}
}
For attribution, please cite this work as:
Eric. 2024. “Explore Data.” ExploreData. November 20, 2024.