1 Required libraries and source files

library(tidyverse)
library(explore)
library(gt)
library(viridis)
library(afcommon)

2 Creating Sample Dataset

We’ll create a dataset with the following variables:

- salary: normally distributed numeric variable
- sales: right-skewed numeric variable
- gender: binary variable
- student_status: binary variable
- education: ordered categorical variable with 3 levels
- job_sector: categorical variable with 5 levels

# Fix the RNG seed so the simulated data are identical on every run
set.seed(123)

# Number of observations to simulate
n <- 1000

# Assemble the sample data frame. The columns are simulated in the order
# they are listed, so reordering them changes the values drawn.
df <- data.frame(
  # Salary: normally distributed around 50000 with a spread of 10000
  salary = rnorm(n, mean = 50000, sd = 10000),

  # Sales: log-normal draws give a right-skewed distribution
  sales = rlnorm(n, meanlog = 3, sdlog = 0.5),

  # Two-level factors; levels are listed explicitly to fix their order
  gender = factor(
    sample(c("Male", "Female"), size = n, replace = TRUE),
    levels = c("Male", "Female")
  ),
  student_status = factor(
    sample(c("No", "Yes"), size = n, replace = TRUE),
    levels = c("No", "Yes")
  ),

  # Education: ordered factor, High School < Bachelor < Graduate,
  # sampled with unequal probabilities (30% / 50% / 20%)
  education = ordered(
    sample(
      c("High School", "Bachelor", "Graduate"),
      size = n,
      replace = TRUE,
      prob = c(0.3, 0.5, 0.2)
    ),
    levels = c("High School", "Bachelor", "Graduate")
  ),

  # Job sector: five categories, left as a plain character column
  job_sector = sample(
    c("Technology", "Healthcare", "Finance", "Education", "Manufacturing"),
    size = n,
    replace = TRUE
  )
)

3 Exploring the Data Structure

# View the structure of the dataset: dimensions, each column's type,
# and its first few values
str(df)
'data.frame':   1000 obs. of  6 variables:
 $ salary        : num  44395 47698 65587 50705 51293 ...
 $ sales         : num  12.21 11.94 19.91 18.8 5.61 ...
 $ gender        : Factor w/ 2 levels "Male","Female": 2 2 2 1 1 1 1 1 2 2 ...
 $ student_status: Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 1 2 1 2 ...
 $ education     : Ord.factor w/ 3 levels "High School"<..: 2 2 2 1 2 1 2 3 3 2 ...
 $ job_sector    : chr  "Finance" "Finance" "Education" "Technology" ...

# Display the first few rows (six, as printed below)
head(df)
    salary     sales gender student_status   education job_sector
1 44395.24 12.208112 Female             No    Bachelor    Finance
2 47698.23 11.941533 Female            Yes    Bachelor    Finance
3 65587.08 19.905775 Female            Yes    Bachelor  Education
4 50705.08 18.801045   Male            Yes High School Technology
5 51292.88  5.614366   Male            Yes    Bachelor  Education
6 67150.65 33.794117   Male             No High School  Education

4 Data Dictionary

# Build a data dictionary for df: one row per column giving its name,
# label, type, and (for factors) levels
af_create_data_dictionary(df)
Name Label Type Levels
salary salary Numeric NA
sales sales Numeric NA
gender gender Factor Male, Female
student_status student_status Factor No, Yes
education education Factor High School, Bachelor, Graduate
job_sector job_sector Character NA

5 Descriptive Tables

Use the following code:

# Summarise df; the result holds one summary table per kind of variable
descriptive_tbls <- af_descriptive(df)

# Summary table for the numeric variables
descriptive_tbls$numeric

# Summary table for the binary variables
descriptive_tbls$binary

# Summary table for the categorical variables
descriptive_tbls$categorical

# Summarise df; the result holds one summary table per kind of variable
descriptive_tbls <- af_descriptive(df)

# Numeric variables: mean, SD, median, IQR, min, max, and missing counts
descriptive_tbls$numeric
Numeric Variables Summary
Variable Mean SD Median IQR Min Max N_Missing N
salary 50,161.28 9,916.95 50,092.10 12,929.26 21,902.25 82,410.40 0 1000
sales 23.30 12.55 20.64 14.79 4.38 109.42 0 1000

# Binary variables: percentage in each of the two levels, plus missing counts
descriptive_tbls$binary
Binary Variables Summary
Variable First_Level % Second_Level % N_Missing N
gender Male 47 Female 53 0 1000
student_status No 48 Yes 52 0 1000

# Categorical variables: type (ordinal/nominal), number of categories,
# mode, and missing counts
descriptive_tbls$categorical
Categorical Variables Summary
Variable Type N_Categories Categories Min Max Mode N_Missing N
education ordinal 3 Bachelor, High School, Graduate High School Graduate Bachelor 0 1000
job_sector nominal 5 Finance, Education, Technology, Manufacturing, Healthcare NA NA Finance 0 1000

5.0.1 Visual Overview

# Plot an overview of the variables in df with explore_all(), using two
# viridis colours taken from the 0.95-1 end of the palette
df %>% explore_all(color = viridis(2, begin = 0.95))