Author
Affiliation

Eric

Ultragenyx

Published

November 20, 2024

Keywords

EDA, Explore

Code
# NOTE: the workspace is intentionally not cleared here. `rm(list = ls())`
# wipes the caller's global environment when this code is run interactively,
# yet leaves attached packages and options untouched, so it gives no real
# isolation. Render or run this document in a fresh R session instead.
library(tinytex)
library(ggplot2)
library(plotly)

library(haven)
library(labelled)
library(bslib)
library(DT)
library(r2rtf)
library(metalite)
library(metalite.table1)
library(haven)
library(tidyr)
library(tools)
library(gtsummary)
library(stringr)
library(RColorBrewer)
library(flextable)
library(stringr)
library(tern)
library(tidyverse)
library(vistime)
library(readxl)
library(hrbrthemes)
library(rlang)
library(lubridate)
library(fuzzyjoin)
library(report)
library(nnet)
library(MASS)
library(dplyr)

library(SmartEDA)
library(summarytools)
library(skimr)
library(ggstatsplot)
library(PerformanceAnalytics)
library(dlookr)
library(ISLR) # for the Wage dataset
Code
# Bar chart (with test summary) of the jobclass-by-education contingency
# table from the ISLR Wage data; labels show both counts and percentages.
ggstatsplot::ggbarstats(
  data = Wage,
  x = jobclass,
  y = education,
  label = "both"
)

Code
# Descriptive statistics for the numeric iris columns, shown as a flextable.
iris %>%
  dlookr::describe() %>%
  flextable::flextable()

described_variables

n

na

mean

sd

se_mean

IQR

skewness

kurtosis

p00

p01

p05

p10

p20

p25

p30

p40

p50

p60

p70

p75

p80

p90

p95

p99

p100

Sepal.Length

150

0

5.843333

0.8280661

0.06761132

1.3

0.3149110

-0.552064

4.3

4.400

4.600

4.8

5.0

5.1

5.27

5.60

5.80

6.10

6.3

6.4

6.52

6.90

7.255

7.700

7.9

Sepal.Width

150

0

3.057333

0.4358663

0.03558833

0.5

0.3189657

0.228249

2.0

2.200

2.345

2.5

2.7

2.8

2.80

3.00

3.00

3.10

3.2

3.3

3.40

3.61

3.800

4.151

4.4

Petal.Length

150

0

3.758000

1.7652982

0.14413600

3.5

-0.2748842

-1.402103

1.0

1.149

1.300

1.4

1.5

1.6

1.70

3.90

4.35

4.64

5.0

5.1

5.32

5.80

6.100

6.700

6.9

Petal.Width

150

0

1.199333

0.7622377

0.06223645

1.5

-0.1029667

-1.340604

0.1

0.100

0.200

0.2

0.2

0.3

0.40

1.16

1.30

1.50

1.8

1.8

1.90

2.20

2.300

2.500

2.5

Code
# Univariate summary statistics of the numeric iris columns, per species.
iris %>%
  dplyr::group_by(Species) %>%
  dlookr::univar_numeric()
Adding missing grouping variables: `Species`
$statistics
# A tibble: 12 x 10
   described_variables Species        n    na  mean    sd se_mean   IQR skewness
   <chr>               <fct>      <int> <int> <dbl> <dbl>   <dbl> <dbl>    <dbl>
 1 Petal.Length        setosa        50     0 1.46  0.174  0.0246 0.175   0.106 
 2 Petal.Length        versicolor    50     0 4.26  0.470  0.0665 0.600  -0.607 
 3 Petal.Length        virginica     50     0 5.55  0.552  0.0780 0.775   0.549 
 4 Petal.Width         setosa        50     0 0.246 0.105  0.0149 0.1     1.25  
 5 Petal.Width         versicolor    50     0 1.33  0.198  0.0280 0.3    -0.0312
 6 Petal.Width         virginica     50     0 2.03  0.275  0.0388 0.5    -0.129 
 7 Sepal.Length        setosa        50     0 5.01  0.352  0.0498 0.400   0.120 
 8 Sepal.Length        versicolor    50     0 5.94  0.516  0.0730 0.7     0.105 
 9 Sepal.Length        virginica     50     0 6.59  0.636  0.0899 0.675   0.118 
10 Sepal.Width         setosa        50     0 3.43  0.379  0.0536 0.475   0.0412
11 Sepal.Width         versicolor    50     0 2.77  0.314  0.0444 0.475  -0.363 
12 Sepal.Width         virginica     50     0 2.97  0.322  0.0456 0.375   0.366 
# i 1 more variable: median <dbl>
Code
# Numeric diagnostics (range, quartiles, zero / negative / outlier counts)
# for the diamonds data, shown as a flextable.
#
# The x / y / z dimension columns are renamed first. Without this, every row
# of the result carries the statistics of column `x` (min 0, max 10.74,
# 8 zeros) instead of its own variable. dlookr appears to resolve an internal
# variable named `x` against the data column of the same name -- TODO confirm
# against the installed dlookr version. y and z are renamed too, so the three
# dimension columns stay consistently named.
diamonds %>%
  dplyr::rename(length_mm = x, width_mm = y, depth_mm = z) %>%
  diagnose_numeric() %>%
  flextable()

variables

min

Q1

mean

median

Q3

max

zero

minus

outlier

carat

0

4.71

5.731157

5.7

6.54

10.74

8

0

32

depth

0

4.71

5.731157

5.7

6.54

10.74

8

0

32

table

0

4.71

5.731157

5.7

6.54

10.74

8

0

32

price

0

4.71

5.731157

5.7

6.54

10.74

8

0

32

x

0

4.71

5.731157

5.7

6.54

10.74

8

0

32

y

0

4.71

5.731157

5.7

6.54

10.74

8

0

32

z

0

4.71

5.731157

5.7

6.54

10.74

8

0

32

1 Normality test

Code
library(moments)

Attaching package: 'moments'
The following objects are masked from 'package:dlookr':

    kurtosis, skewness
The following objects are masked from 'package:PerformanceAnalytics':

    kurtosis, skewness
Code
# D'Agostino skewness test on the Ozone readings of airquality.
# The argument expression is kept as-is because it is echoed in the
# printed `data:` line of the test result.
moments::agostino.test(airquality$Ozone)

    D'Agostino skewness test

data:  airquality$Ozone
skew = 1.2257, z = 4.6564, p-value = 3.219e-06
alternative hypothesis: data have a skewness
Code
library(DataExplorer)
# Q-Q plots of the continuous iris columns, split by species.
DataExplorer::plot_qq(iris, by = "Species")

Code
# Normality diagnostic plots for Petal.Length within each species.
iris %>%
  dplyr::group_by(Species) %>%
  dlookr::plot_normality(Petal.Length)

Code
library(ggpubr)

Attaching package: 'ggpubr'
The following objects are masked from 'package:flextable':

    border, font, rotate
Code
# Q-Q plot of Sepal.Length, one facet per species.
ggpubr::ggqqplot(
  iris,
  x = "Sepal.Length",
  facet.by = "Species"
)

Code
# Per-species normality test for every numeric iris column, with numeric
# results rounded to 3 decimals and shown as a flextable.
# `across(where(is.numeric), ...)` replaces the superseded `mutate_if()`
# and matches the idiom used for the Spearman table in this document.
iris %>%
  dplyr::group_by(Species) %>%
  dlookr::normality() %>%
  dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, 3))) %>%
  flextable()

variable

Species

statistic

p_value

sample

Sepal.Length

setosa

0.978

0.460

50

Sepal.Length

versicolor

0.978

0.465

50

Sepal.Length

virginica

0.971

0.258

50

Sepal.Width

setosa

0.972

0.272

50

Sepal.Width

versicolor

0.974

0.338

50

Sepal.Width

virginica

0.967

0.181

50

Petal.Length

setosa

0.955

0.055

50

Petal.Length

versicolor

0.966

0.158

50

Petal.Length

virginica

0.962

0.110

50

Petal.Width

setosa

0.800

0.000

50

Petal.Width

versicolor

0.948

0.027

50

Petal.Width

virginica

0.960

0.087

50

Code
# Box plots of the continuous iris columns, grouped by species.
DataExplorer::plot_boxplot(iris, by = "Species")

Code
# Between-species comparison of Sepal.Length using the non-parametric
# ("np") test family.
ggstatsplot::ggbetweenstats(
  data = iris,
  x = Species,
  y = Sepal.Length,
  type = "np"
)

Code
# Correlation of Petal.Width with each other numeric iris column,
# shown as a flextable.
iris %>%
  dlookr::correlate(Petal.Width) %>%
  flextable::flextable()

var1

var2

coef_corr

Petal.Width

Sepal.Length

0.8179411

Petal.Width

Sepal.Width

-0.3661259

Petal.Width

Petal.Length

0.9628654

Code
# Kendall correlation matrix plot for the numeric iris columns.
# `plot_correlate()` is deprecated in dlookr; the supported route is to
# compute the correlations with `correlate()` and plot that object, which
# dispatches to `plot.correlate()`.
iris %>%
  dlookr::correlate(method = "kendall") %>%
  plot()
Warning in plot_correlate(iris, method = "kendall"): 'plot_correlate' is deprecated.
Use 'plot.correlate' instead.
See help("Deprecated")

Code
# Correlation matrix plot of iris with the ggstatsplot defaults.
ggstatsplot::ggcorrmat(data = iris)

Code
# Disabled alternative that builds the same kind of table via ggstatsplot:
# ggcorrmat(data=iris, type="np", output="dataframe")%>%mutate_if(is.numeric, ~round(.,2))%>%flextable()

# Spearman correlation matrix of the four measurement columns, rounded to
# 2 decimals, with the row names moved into a "Variable" column and shown
# as a vanilla-themed flextable. Rounding happens on the numeric matrix
# before conversion; the result is the same because every matrix column
# is numeric.
round(cor(iris[, 1:4], method = "spearman"), 2) |>
  as.data.frame() |>
  tibble::rownames_to_column("Variable") |>
  flextable::flextable() |>
  flextable::theme_vanilla()

Variable

Sepal.Length

Sepal.Width

Petal.Length

Petal.Width

Sepal.Length

1.00

-0.17

0.88

0.83

Sepal.Width

-0.17

1.00

-0.31

-0.29

Petal.Length

0.88

-0.31

1.00

0.94

Petal.Width

0.83

-0.29

0.94

1.00

Code
# Scatter plot of Sepal.Width against Sepal.Length with a non-parametric
# ("np") correlation summary.
# NOTE(review): the render log reports stat_xsidebin()/stat_ysidebin(),
# which suggests the installed ggstatsplot draws marginal histograms and
# does not honour `marginal.type` -- confirm before relying on it.
ggstatsplot::ggscatterstats(
  data = iris,
  x = Sepal.Length,
  y = Sepal.Width,
  type = "np",
  marginal.type = "boxplot"
)
Registered S3 method overwritten by 'ggside':
  method from  
  +.gg   GGally
`stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.

Code
# Kendall correlation chart of the iris measurements; the Species factor
# is dropped so only numeric columns reach chart.Correlation().
iris %>%
  dplyr::select(-Species) %>%
  PerformanceAnalytics::chart.Correlation(method = "kendall")

Code
# Sepal.Width against Sepal.Length with a default smoother, one panel
# per species.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(Species))
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Code
# Missing-value pattern across the airquality columns.
dlookr::plot_na_intersect(airquality)

Code
# Impute missing Ozone values with the "knn" method, passing Temp as the
# second variable, then plot the imputation result.
plot(
  dlookr::imputate_na(
    airquality,
    Ozone,
    Temp,
    method = "knn"
  )
)

Code
library(performance)
Warning: package 'performance' was built under R version 4.4.2
Code
# Plot of z-score based outlier detection on the Wind readings.
plot(
  performance::check_outliers(
    airquality$Wind,
    method = "zscore"
  )
)

Code
# IQR based outlier detection on the Wind readings. The argument
# expression is kept as-is because it is echoed in the printed report.
performance::check_outliers(
  airquality$Wind,
  method = "iqr"
)
2 outliers detected: cases 9, 48.
- Based on the following method and threshold: iqr (2).
- For variable: airquality$Wind.

-----------------------------------------------------------------------------
Outliers per variable (iqr): 

$`airquality$Wind`
   Row Distance_IQR
9    9     1.527977
48  48     1.614060
Code
# Outlier diagnostics (count, ratio, means with / without outliers) for the
# numeric diamonds columns, shown as a flextable.
#
# The x / y / z dimension columns are renamed first. Without this, every row
# of the result is identical (32 outliers, the same means), i.e. all
# variables report the figures of one column. dlookr appears to resolve an
# internal variable named `x` against the data column of the same name --
# TODO confirm against the installed dlookr version.
diamonds %>%
  dplyr::rename(length_mm = x, width_mm = y, depth_mm = z) %>%
  dlookr::diagnose_outlier() %>%
  flextable()

variables

outliers_cnt

outliers_ratio

outliers_mean

with_mean

without_mean

carat

32

0.05932518

7.239375

5.731157

5.730262

depth

32

0.05932518

7.239375

5.731157

5.730262

table

32

0.05932518

7.239375

5.731157

5.730262

price

32

0.05932518

7.239375

5.731157

5.730262

x

32

0.05932518

7.239375

5.731157

5.730262

y

32

0.05932518

7.239375

5.731157

5.730262

z

32

0.05932518

7.239375

5.731157

5.730262

Code
# Outlier plots (with vs. without outliers) for Ozone and Wind.
airquality %>%
  dplyr::select(Ozone, Wind) %>%
  dlookr::plot_outlier()

Code
# Outlier plots for the diamonds variables whose outlier ratio exceeds 5
# (dlookr reports `outliers_ratio` in percent).
#
# The x / y / z dimension columns are renamed first. With a column literally
# named `x`, diagnose_outlier() reports the same figures for every variable
# (dlookr appears to resolve an internal `x` against that column -- TODO
# confirm), so the ratio filter below would select nothing.
diamonds_renamed <- diamonds %>%
  dplyr::rename(length_mm = x, width_mm = y, depth_mm = z)

# NOTE(review): if no variable passes the threshold, the selection is empty;
# verify how plot_outlier() treats an empty selection.
plots <- diamonds_renamed %>%
  plot_outlier(
    diamonds_renamed %>%
      diagnose_outlier() %>%
      dplyr::filter(outliers_ratio > 5) %>%
      dplyr::pull(variables)
  )

Citation

BibTeX citation:
@online{eric2024,
  author = {Eric},
  title = {Explore {Data}},
  date = {2024-11-20},
  langid = {en}
}
For attribution, please cite this work as:
Eric. 2024. “Explore Data.” ExploreData. November 20, 2024.