This analytic tool provides a series of useful descriptive statistics for evaluating a given data variable including (but not limited to):
For illustration purposes, a data frame with six continuous variables and three categorical variables is created. This data frame will be used to run the descriptive staistics analysis.
# Set seed for reproducibility
set.seed(117)
# Sample size
n <- 100
# Create continuous variables on different scales
JOB.PERF <- runif(n, min = 0, max = 100) # Job performance evaluation
CRT3 <- sample(0:3, n, replace = TRUE) # Cognitive Reflection Test score
BDI <- runif(n, min = 1, max = 40) # Beck Depression Inventory score
PSYSAFE <- runif(n, min = 1, max = 5) # Psychological Safety Scale score
TENURE <- runif(n, min = 1, max = 25) # Number of years employed at organization
P.AGE <- runif(n, min = 18, max = 65) # Participant age (in years)
# Binary demographic variables
P.SEX <- sample(c(0, 1), n, replace = TRUE) # Participant Sex: 0 = Female, 1 = Male
P.RACE <- sample(c(0, 1), n, replace = TRUE) # Participant Race: 1 = White, 0 = Not White
P.MENTOR <- sample(c(0, 1), n, replace = TRUE) # Participant Mentor Status 1 = Has Mentor, 0 = Does Not Have Mentor
# Combine into data frame
df <- data.frame(P.SEX, P.RACE, P.MENTOR, P.AGE, CRT3, BDI, PSYSAFE, TENURE, JOB.PERF)
# Randomly choose 20 unique row indices to set as missing data
na_indices <- sample(1:n, 20)
# Set PSYCHSAFE to NA for those indices
df$PSYSAFE[na_indices] <- NA
# View first few rows
head(df)
## P.SEX P.RACE P.MENTOR P.AGE CRT3 BDI PSYSAFE TENURE JOB.PERF
## 1 1 1 0 45.25423 2 11.708343 4.247106 21.819029 71.78151
## 2 1 1 1 46.34804 2 29.124640 2.270630 1.636009 4.53752
## 3 1 1 1 30.80766 2 19.097521 3.204139 17.665743 53.53098
## 4 1 1 1 27.33312 2 27.746177 2.588245 13.505467 46.50488
## 5 1 1 0 40.80463 3 3.046204 3.008607 17.358275 75.57024
## 6 1 1 1 43.00619 0 31.886807 1.308279 8.430738 13.44329
The code below provides a function to be used to calculate the constellation of descriptive statistics and output it into a neat data frame.
desc.tbl2 <- function (x) {
# Subset only numerical variables
data.desc <- x
# Descriptive stats table for variables in data frame
dat <- pastecs::stat.desc(data.desc, norm = FALSE, p = .95)
# Count unique values by variable and add new row
dat2 <- data.desc %>% dplyr::summarize_at(vars(everything()), n_distinct, na.rm = TRUE)
dat2 <- as.data.frame(dat2)
row.names(dat2) <- "val.unique"
# Combine descriptive stats and unique value counts
dat3 <- round(rbind(dat, dat2), 4)
# Function to calculate D'Agostino-Pearson K^2 Test for Normality ####
K2 <- function(x)
{
# D'Agostino-Pearson K^2 Test for Normality
# American Statistician 1990, vol. 44, No. 4, 316-321
# Check to see if x is numeric and treat it as a vector
if(mode(x) != "numeric") stop("need numeric data")
x <- as.vector(x) #Remove any NA's
x <- x[!is.na(x)]
n <- length(x) # Object for calculating the central moments
centralmoment <- function(x, k)
{
sum((x - mean(x))^k)/length(x)
}
#Compute coefficient of skewness
sqrt.b.1 <- centralmoment(x, 3)/centralmoment(x, 2)^(3/2)
# Compute coefficient of kurtosis
b.2 <- centralmoment(x, 4)/centralmoment(x, 2)^2
# Test of Skewness
y <- sqrt.b.1 * sqrt(((n + 1) * (n + 3))/(6 * (n - 2)))
B.2.sqrt.b.1 <- (3 * (n^2 + 27 * n - 70) * (n + 1) * (n + 3))/((n - 2) * (n + 5) * (n + 7) * (n + 9))
W2 <- -1 + sqrt(2 * (B.2.sqrt.b.1 - 1))
W <- sqrt(W2)
delta <- 1/sqrt(log(W))
alpha <- sqrt(2/(W2 - 1))
Z.sqrt.b.1 <- delta * log(y/alpha + sqrt((y/alpha)^2 + 1))
#Normal approximation
prob.skewness <- (1 - pnorm(abs(Z.sqrt.b.1), 0, 1))
#Test of Kurtosis
#Compute the mean of b_2
exp.b.2 <- (3 * (n - 1))/(n + 1) #Compute the variance of b.2
var.b.2 <- (24 * n * (n - 2) * (n - 3))/((n + 1)^2 * (n + 3) * (n + 5))
# Compute the standardized version of b.2
std.b.2 <- (b.2 - exp.b.2)/sqrt(var.b.2)
# Compute the third standardized moment of b.2
sqrt.B.1.b.2 <- ((6 * (n^2 - 5 * n + 2))/((n + 7) * (n + 9))) * sqrt((6 * (n + 3) * (n + 5))/(n * (n - 2) * (n - 3)))
A <- 6 + (8/sqrt.B.1.b.2) * (2/sqrt.B.1.b.2 + sqrt((1 + 4/(sqrt.B.1.b.2^2))))
Z.b.2 <- ((1 - 2/(9 * A)) - ((1 - 2/A)/(1 + std.b.2 * sqrt(2/(A - 4))))^(1/3))/(sqrt(2/(9 * A))) # Normal approximation
prob.kurtosis <- (1 - pnorm(abs(Z.b.2), 0, 1))
# Omnibus K2 Test of Normality (Skewness/Kurtosis)
K2 <- Z.sqrt.b.1^2 + Z.b.2^2
prob.K2 <- 1 - pchisq(K2, 2)
ret.val <- c("skewness" = sqrt.b.1,
"Normal Approx. for Skewness" = Z.sqrt.b.1,
"Prob(Normal Approx. for Skewness)" = prob.skewness,
"kurtosis" = b.2, "Normal Approx. for Kurtosis" =
Z.b.2, "Prob(Normal Approx. for Kurtosis)" = prob.kurtosis, normtest.K2 = K2,
"prob.K2" = prob.K2)
return(ret.val)
}
# Calculate D'Agostino-Pearson K^2 Test for Normality
dat4 <- as.data.frame(data.desc) %>% sapply(K2)
dat4 <- round(dat4[c("skewness", "kurtosis", "normtest.K2", "prob.K2"),], 4)
# Combine all stats into one table
dat5 <- rbind(dat3, dat4)
dat5 <- as.data.frame(t(as.matrix(dat5))) # Transpose data frame from rows to columns
# Add proportion of 1 values for variables with 1 or 2 unique values
prop_1_values <- sapply(data.desc, function(col) {
unique_vals <- unique(na.omit(col))
if (length(unique_vals) <= 2) {
sum(col == 1, na.rm = TRUE) / sum(!is.na(col))
} else {
NA
}
})
prop_0_values <- 1 - prop_1_values
dat5 <- dat5 %>% tibble::rownames_to_column() %>%
dplyr::mutate(pct.na = (nbr.na) / (nbr.na + nbr.val)) %>%
dplyr::mutate(CI.low = mean - CI.mean.0.95) %>%
dplyr::mutate(CI.high = mean + CI.mean.0.95) %>%
dplyr::mutate(prop_1 = prop_1_values) %>%
dplyr::mutate(prop_0 = prop_0_values)
# Reorder columns
dat5 <- dat5 %>% dplyr::select(rowname, nbr.val, nbr.null, nbr.na, pct.na, val.unique, prop_0, prop_1, min, max, range, sum, median, mean, std.dev, SE.mean, CI.mean.0.95, CI.low, CI.high, var,
coef.var, skewness, kurtosis, normtest.K2, prob.K2)
return(dat5)
}
The descriptive statistics function outputs the results into a data frame that can be saved for further inspection. The following results are provided for each of the provided variables:
# Use the descriptive statistics function on the data frame of variables to acquire the results
desc_tbl <- desc.tbl2(df)
DT::datatable(desc_tbl)
head(desc_tbl, 9)
## rowname nbr.val nbr.null nbr.na pct.na val.unique prop_0 prop_1 min
## 1 P.SEX 100 43 0 0.0 2 0.43 0.57 0.0000
## 2 P.RACE 100 48 0 0.0 2 0.48 0.52 0.0000
## 3 P.MENTOR 100 55 0 0.0 2 0.55 0.45 0.0000
## 4 P.AGE 100 0 0 0.0 100 NA NA 18.0096
## 5 CRT3 100 28 0 0.0 4 NA NA 0.0000
## 6 BDI 100 0 0 0.0 100 NA NA 1.3747
## 7 PSYSAFE 80 0 20 0.2 80 NA NA 1.2178
## 8 TENURE 100 0 0 0.0 100 NA NA 1.0171
## 9 JOB.PERF 100 0 0 0.0 100 NA NA 0.0357
## max range sum median mean std.dev SE.mean CI.mean.0.95 CI.low
## 1 1.0000 1.0000 57.000 1.0000 0.5700 0.4976 0.0498 0.0987 0.4713
## 2 1.0000 1.0000 52.000 1.0000 0.5200 0.5021 0.0502 0.0996 0.4204
## 3 1.0000 1.0000 45.000 0.0000 0.4500 0.5000 0.0500 0.0992 0.3508
## 4 64.8490 46.8393 4183.306 42.8156 41.8331 13.7916 1.3792 2.7365 39.0966
## 5 3.0000 3.0000 144.000 1.0000 1.4400 1.1575 0.1157 0.2297 1.2103
## 6 39.6982 38.3235 2065.390 22.0469 20.6539 11.7645 1.1764 2.3343 18.3196
## 7 4.9890 3.7712 243.287 3.0502 3.0411 1.1677 0.1306 0.2599 2.7812
## 8 24.7872 23.7701 1350.893 14.2756 13.5089 7.4628 0.7463 1.4808 12.0281
## 9 99.1146 99.0789 5048.725 50.2303 50.4872 29.0698 2.9070 5.7681 44.7191
## CI.high var coef.var skewness kurtosis normtest.K2 prob.K2
## 1 0.6687 0.2476 0.8729 -0.2828 1.0800 NaN NaN
## 2 0.6196 0.2521 0.9656 -0.0801 1.0064 NaN NaN
## 3 0.5492 0.2500 1.1111 0.2010 1.0404 NaN NaN
## 4 44.5696 190.2075 0.3297 -0.0306 1.8408 27.9004 0
## 5 1.6697 1.3398 0.8038 0.1073 1.5763 115.1606 0
## 6 22.9882 138.4028 0.5696 0.0078 1.7369 45.4745 0
## 7 3.3010 1.3635 0.3840 -0.0663 1.6931 39.2442 0
## 8 14.9897 55.6941 0.5524 -0.1332 1.6595 68.7203 0
## 9 56.2553 845.0556 0.5758 -0.1025 1.7931 34.8847 0