1 Required libraries and source files

library(tidyverse)
library(explore)
library(gt)
library(viridis)
library(afcommon)

2 Creating Sample Dataset

We’ll create a dataset with the following variables:

- salary: normally distributed numeric variable
- sales: right-skewed numeric variable
- gender: binary variable
- student_status: binary variable
- education: ordered categorical variable with 3 levels
- job_sector: categorical variable with 5 categories (stored as character, not as a factor)

set.seed(123) # Fixed seed so every run draws the same sample

# Number of simulated observations
n <- 1000

# Simulated dataset. The columns are generated in the order written, and each
# one consumes random numbers, so reordering them changes the values drawn.
df <- data.frame(
  # Salary: normal draws centred on 50000 with a standard deviation of 10000
  salary = rnorm(n, mean = 50000, sd = 10000),

  # Sales: log-normal draws, which gives a right-skewed distribution
  sales = rlnorm(n, meanlog = 3, sdlog = 0.5),

  # Two-level factors; `levels` pins the level order (labels default to levels)
  gender = factor(
    sample(c("Male", "Female"), size = n, replace = TRUE),
    levels = c("Male", "Female")
  ),
  student_status = factor(
    sample(c("No", "Yes"), size = n, replace = TRUE),
    levels = c("No", "Yes")
  ),

  # Education: ordered factor, High School < Bachelor < Graduate, sampled
  # with unequal probabilities (0.3 / 0.5 / 0.2)
  education = ordered(
    sample(
      c("High School", "Bachelor", "Graduate"),
      size = n, replace = TRUE, prob = c(0.3, 0.5, 0.2)
    ),
    levels = c("High School", "Bachelor", "Graduate")
  ),

  # Job sector: five possible values, left as a plain character column
  job_sector = sample(
    c("Technology", "Healthcare", "Finance", "Education", "Manufacturing"),
    size = n, replace = TRUE
  )
)

3 Exploring the Data Structure

# Compact summary of df: each column's type and its first few values
utils::str(df)
'data.frame':   1000 obs. of  6 variables:
 $ salary        : num  44395 47698 65587 50705 51293 ...
 $ sales         : num  12.21 11.94 19.91 18.8 5.61 ...
 $ gender        : Factor w/ 2 levels "Male","Female": 2 2 2 1 1 1 1 1 2 2 ...
 $ student_status: Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 1 2 1 2 ...
 $ education     : Ord.factor w/ 3 levels "High School"<..: 2 2 2 1 2 1 2 3 3 2 ...
 $ job_sector    : chr  "Finance" "Finance" "Education" "Technology" ...


# Preview the first six rows (the default row count for head())
utils::head(df, n = 6L)
    salary     sales gender student_status   education job_sector
1 44395.24 12.208112 Female             No    Bachelor    Finance
2 47698.23 11.941533 Female            Yes    Bachelor    Finance
3 65587.08 19.905775 Female            Yes    Bachelor  Education
4 50705.08 18.801045   Male            Yes High School Technology
5 51292.88  5.614366   Male            Yes    Bachelor  Education
6 67150.65 33.794117   Male             No High School  Education

4 Data Dictionary

# Build a data dictionary for df. The rendered table that follows has one row
# per column, giving its Name, Label, Type and (for factors) Levels.
# NOTE(review): presumably provided by the afcommon package loaded at the top
# of the document — confirm.
af_create_data_dictionary(df)

Name	Label	Type	Levels
salary	salary	Numeric	NA
sales	sales	Numeric	NA
gender	gender	Factor	Male, Female
student_status	student_status	Factor	No, Yes
education	education	Factor	High School, Bachelor, Graduate
job_sector	job_sector	Character	NA

5 Descriptive Tables

Use the following code:

# Compute the descriptive tables for df once and keep the result; the three
# elements used below are read from it with `$`.
# NOTE(review): af_descriptive is presumably from afcommon — confirm.
descriptive_tbls <- af_descriptive(df)

# Summary table for the numeric variables
descriptive_tbls$numeric

# Summary table for the binary (two-level) variables
descriptive_tbls$binary

# Summary table for the categorical variables
descriptive_tbls$categorical

# Compute the descriptive tables for df and keep the result for the printed
# sections that follow.
# NOTE(review): af_descriptive is presumably from afcommon — confirm.
descriptive_tbls <- af_descriptive(df)

# Numeric variables: the rendered table reports Mean, SD, Median, IQR, Min,
# Max, N_Missing and N for each one.
descriptive_tbls$numeric

Variable	Mean	SD	Median	IQR	Min	Max	N_Missing	N
Numeric Variables Summary
salary	50,161.28	9,916.95	50,092.10	12,929.26	21,902.25	82,410.40	0	1000
sales	23.30	12.55	20.64	14.79	4.38	109.42	0	1000


# Binary variables: the rendered table reports both levels with their
# percentages, plus N_Missing and N.
descriptive_tbls$binary

Variable	First_Level	%	Second_Level	%	N_Missing	N
Binary Variables Summary
gender	Male	47	Female	53	0	1000
student_status	No	48	Yes	52	0	1000


# Categorical variables: the rendered table reports Type, N_Categories,
# Categories, Min, Max, Mode, N_Missing and N for each one.
descriptive_tbls$categorical

Variable	Type	N_Categories	Categories	Min	Max	Mode	N_Missing	N
Categorical Variables Summary
education	ordinal	3	Bachelor, High School, Graduate	High School	Graduate	Bachelor	0	1000
job_sector	nominal	5	Finance, Education, Technology, Manufacturing, Healthcare	NA	NA	Finance	0	1000

5.0.1 Visual Overview

# Plot an overview of every variable in df, coloured with two shades taken
# from the upper end (begin = 0.95) of the viridis palette
df %>%
  explore_all(color = viridis(n = 2, begin = 0.95))

Descriptive Statistics

Creating descriptive statistics for a dataset

Amir Freund

2025-02-06