Descriptive Statistics Analytic Tool

Overview of descriptive statistics tool

This analytic tool provides a series of useful descriptive statistics for evaluating a given data variable including (but not limited to):

number of valid values
percentage of missing values
number of unique values
proportions of binary variables
min/max values
arithmetic mean
standard deviation
variance
standard error
95% confidence interval around the mean
skewness
kurtosis
D’Agostino-Pearson K^2 Test for Normality

Data

For illustration purposes, a data frame with six continuous variables and three categorical variables is created. This data frame will be used to run the descriptive staistics analysis.

# Set seed for reproducibility
set.seed(117)

# Sample size
n <- 100

# Create continuous variables on different scales
JOB.PERF <- runif(n, min = 0, max = 100)    # Job performance evaluation
CRT3 <- sample(0:3, n, replace = TRUE)      # Cognitive Reflection Test score
BDI <- runif(n, min = 1, max = 40)          # Beck Depression Inventory score
PSYSAFE <- runif(n, min = 1, max = 5)       # Psychological Safety Scale score
TENURE <- runif(n, min = 1, max = 25)       # Number of years employed at organization
P.AGE <- runif(n, min = 18, max = 65)       # Participant age (in years)

# Binary demographic variables
P.SEX <- sample(c(0, 1), n, replace = TRUE)    # Participant Sex: 0 = Female, 1 = Male
P.RACE <- sample(c(0, 1), n, replace = TRUE)   # Participant Race: 1 = White, 0 = Not White
P.MENTOR <- sample(c(0, 1), n, replace = TRUE) # Participant Mentor Status 1 = Has Mentor, 0 = Does Not Have Mentor

# Combine into data frame
df <- data.frame(P.SEX, P.RACE, P.MENTOR, P.AGE, CRT3, BDI, PSYSAFE, TENURE, JOB.PERF)

# Randomly choose 20 unique row indices to set as missing data
na_indices <- sample(1:n, 20)

# Set PSYCHSAFE to NA for those indices
df$PSYSAFE[na_indices] <- NA


# View first few rows
head(df)

##   P.SEX P.RACE P.MENTOR    P.AGE CRT3       BDI  PSYSAFE    TENURE JOB.PERF
## 1     1      1        0 45.25423    2 11.708343 4.247106 21.819029 71.78151
## 2     1      1        1 46.34804    2 29.124640 2.270630  1.636009  4.53752
## 3     1      1        1 30.80766    2 19.097521 3.204139 17.665743 53.53098
## 4     1      1        1 27.33312    2 27.746177 2.588245 13.505467 46.50488
## 5     1      1        0 40.80463    3  3.046204 3.008607 17.358275 75.57024
## 6     1      1        1 43.00619    0 31.886807 1.308279  8.430738 13.44329

Descriptive statistics function

The code below provides a function to be used to calculate the constellation of descriptive statistics and output it into a neat data frame.

desc.tbl2 <- function (x) {
  # Subset only numerical variables
  data.desc <- x 
  
  # Descriptive stats table for variables in data frame
  dat <- pastecs::stat.desc(data.desc, norm = FALSE, p = .95)
  
  # Count unique values by variable and add new row
  dat2 <- data.desc %>% dplyr::summarize_at(vars(everything()), n_distinct, na.rm = TRUE)
  dat2 <- as.data.frame(dat2)
  row.names(dat2) <- "val.unique"
  
  # Combine descriptive stats and unique value counts
  dat3 <- round(rbind(dat, dat2), 4)
  
      # Function to calculate D'Agostino-Pearson K^2 Test for Normality ####   
      K2 <- function(x)
      {
      # D'Agostino-Pearson K^2 Test for Normality
      # American Statistician 1990, vol. 44, No. 4, 316-321
      # Check to see if x is numeric and treat it as a vector 
      if(mode(x) != "numeric") stop("need numeric data")
      x <- as.vector(x) #Remove any NA's
      x <- x[!is.na(x)]
      n <- length(x)    # Object for calculating the central moments
      centralmoment <- function(x, k)
      {
        sum((x - mean(x))^k)/length(x)
      }
      #Compute coefficient of skewness
      sqrt.b.1 <- centralmoment(x, 3)/centralmoment(x, 2)^(3/2) 
      # Compute coefficient of kurtosis
      b.2 <- centralmoment(x, 4)/centralmoment(x, 2)^2  
      # Test of Skewness
      y <- sqrt.b.1 * sqrt(((n + 1) * (n + 3))/(6 * (n - 2)))
      B.2.sqrt.b.1 <- (3 * (n^2 + 27 * n - 70) * (n + 1) * (n + 3))/((n - 2) * (n + 5) * (n + 7) * (n + 9))
      W2 <- -1 + sqrt(2 * (B.2.sqrt.b.1 - 1))
      W <- sqrt(W2)
      delta <- 1/sqrt(log(W))
      alpha <- sqrt(2/(W2 - 1))
      Z.sqrt.b.1 <- delta * log(y/alpha + sqrt((y/alpha)^2 + 1))    
      #Normal approximation
      prob.skewness <- (1 - pnorm(abs(Z.sqrt.b.1), 0, 1))   
      #Test of Kurtosis
      #Compute the mean of b_2
      exp.b.2 <- (3 * (n - 1))/(n + 1)  #Compute the variance of b.2
      var.b.2 <- (24 * n * (n - 2) * (n - 3))/((n + 1)^2 * (n + 3) * (n + 5))   
      # Compute the standardized version of b.2
      std.b.2 <- (b.2 - exp.b.2)/sqrt(var.b.2)  
      # Compute the third standardized moment of b.2
      sqrt.B.1.b.2 <- ((6 * (n^2 - 5 * n + 2))/((n + 7) * (n + 9))) * sqrt((6 * (n + 3) * (n + 5))/(n * (n - 2) * (n - 3)))
      A <- 6 + (8/sqrt.B.1.b.2) * (2/sqrt.B.1.b.2 + sqrt((1 + 4/(sqrt.B.1.b.2^2))))
      Z.b.2 <- ((1 - 2/(9 * A)) - ((1 - 2/A)/(1 + std.b.2 * sqrt(2/(A - 4))))^(1/3))/(sqrt(2/(9 * A)))  # Normal approximation
      prob.kurtosis <- (1 - pnorm(abs(Z.b.2), 0, 1))    
      # Omnibus K2 Test of Normality (Skewness/Kurtosis) 
      K2 <- Z.sqrt.b.1^2 + Z.b.2^2
      prob.K2 <- 1 - pchisq(K2, 2)
      ret.val <- c("skewness" = sqrt.b.1, 
                   "Normal Approx. for Skewness" = Z.sqrt.b.1, 
                   "Prob(Normal Approx. for Skewness)" = prob.skewness, 
                   "kurtosis" = b.2, "Normal Approx. for Kurtosis" = 
                     Z.b.2, "Prob(Normal Approx. for Kurtosis)" = prob.kurtosis, normtest.K2 = K2, 
                   "prob.K2" = prob.K2)
      return(ret.val)
        }
  
  # Calculate D'Agostino-Pearson K^2 Test for Normality 
  dat4 <- as.data.frame(data.desc) %>% sapply(K2)
  dat4 <- round(dat4[c("skewness", "kurtosis", "normtest.K2", "prob.K2"),], 4)
  
  # Combine all stats into one table
  dat5 <- rbind(dat3, dat4)
  dat5 <- as.data.frame(t(as.matrix(dat5))) # Transpose data frame from rows to columns
  
  # Add proportion of 1 values for variables with 1 or 2 unique values
  prop_1_values <- sapply(data.desc, function(col) {
    unique_vals <- unique(na.omit(col))
    if (length(unique_vals) <= 2) {
      sum(col == 1, na.rm = TRUE) / sum(!is.na(col))
    } else {
      NA
    }
  })
  
  prop_0_values <- 1 - prop_1_values
  
  dat5 <- dat5 %>% tibble::rownames_to_column() %>% 
    dplyr::mutate(pct.na = (nbr.na) / (nbr.na + nbr.val)) %>% 
    dplyr::mutate(CI.low = mean - CI.mean.0.95) %>% 
    dplyr::mutate(CI.high = mean + CI.mean.0.95) %>% 
    dplyr::mutate(prop_1 = prop_1_values) %>% 
    dplyr::mutate(prop_0 = prop_0_values)
  
  # Reorder columns
  dat5 <- dat5 %>% dplyr::select(rowname, nbr.val, nbr.null, nbr.na, pct.na, val.unique, prop_0, prop_1, min, max, range, sum, median, mean, std.dev, SE.mean, CI.mean.0.95, CI.low, CI.high, var,
                                 coef.var, skewness, kurtosis, normtest.K2, prob.K2)
  
  return(dat5)
}

Output

The descriptive statistics function outputs the results into a data frame that can be saved for further inspection. The following results are provided for each of the provided variables:

rowname - The name of the variable.
nbr.val - The number of valid rows in the sample.
nbr.null - Number of values that were zero.
nbr.na - Number of rows in the sample with missing data (“NA”).
pct.na - Proportion of rows in the sample with missing data (“NA”).
val.unique - Number of unique values.
prop_0 - For variables with only two unique values, what proportion were zero.
prop_1 - For variables with only two unique values, what proportion were one.
min - The minimum value.
max - The maximum value.
range - The range between the minimum and maximum value.
sum - The sum of all values.
median - The median value.
mean - The arithmetic mean (i.e., average).
std.dev - The standard deviation.
SE.mean - The standard error.
CI.mean.0.95 - The margin of error (MOE).
CI.low - The lower bound of the 95% confidence interval around the mean.
CI.high - The upper bound of the 95% confidence interval around the mean.
var - The variance.
coef.var - The coefficient of variability.
skewness - The skewness value.
kurtosis - The kurtosis value.
normtest.K2 - The D’Agostino-Pearson K^2 Test for Normality statistic.
prop.K2 - The p-value for the K^2 normality test.

# Use the descriptive statistics function on the data frame of variables to acquire the results
desc_tbl <- desc.tbl2(df)
DT::datatable(desc_tbl)

head(desc_tbl, 9)

##    rowname nbr.val nbr.null nbr.na pct.na val.unique prop_0 prop_1     min
## 1    P.SEX     100       43      0    0.0          2   0.43   0.57  0.0000
## 2   P.RACE     100       48      0    0.0          2   0.48   0.52  0.0000
## 3 P.MENTOR     100       55      0    0.0          2   0.55   0.45  0.0000
## 4    P.AGE     100        0      0    0.0        100     NA     NA 18.0096
## 5     CRT3     100       28      0    0.0          4     NA     NA  0.0000
## 6      BDI     100        0      0    0.0        100     NA     NA  1.3747
## 7  PSYSAFE      80        0     20    0.2         80     NA     NA  1.2178
## 8   TENURE     100        0      0    0.0        100     NA     NA  1.0171
## 9 JOB.PERF     100        0      0    0.0        100     NA     NA  0.0357
##       max   range      sum  median    mean std.dev SE.mean CI.mean.0.95  CI.low
## 1  1.0000  1.0000   57.000  1.0000  0.5700  0.4976  0.0498       0.0987  0.4713
## 2  1.0000  1.0000   52.000  1.0000  0.5200  0.5021  0.0502       0.0996  0.4204
## 3  1.0000  1.0000   45.000  0.0000  0.4500  0.5000  0.0500       0.0992  0.3508
## 4 64.8490 46.8393 4183.306 42.8156 41.8331 13.7916  1.3792       2.7365 39.0966
## 5  3.0000  3.0000  144.000  1.0000  1.4400  1.1575  0.1157       0.2297  1.2103
## 6 39.6982 38.3235 2065.390 22.0469 20.6539 11.7645  1.1764       2.3343 18.3196
## 7  4.9890  3.7712  243.287  3.0502  3.0411  1.1677  0.1306       0.2599  2.7812
## 8 24.7872 23.7701 1350.893 14.2756 13.5089  7.4628  0.7463       1.4808 12.0281
## 9 99.1146 99.0789 5048.725 50.2303 50.4872 29.0698  2.9070       5.7681 44.7191
##   CI.high      var coef.var skewness kurtosis normtest.K2 prob.K2
## 1  0.6687   0.2476   0.8729  -0.2828   1.0800         NaN     NaN
## 2  0.6196   0.2521   0.9656  -0.0801   1.0064         NaN     NaN
## 3  0.5492   0.2500   1.1111   0.2010   1.0404         NaN     NaN
## 4 44.5696 190.2075   0.3297  -0.0306   1.8408     27.9004       0
## 5  1.6697   1.3398   0.8038   0.1073   1.5763    115.1606       0
## 6 22.9882 138.4028   0.5696   0.0078   1.7369     45.4745       0
## 7  3.3010   1.3635   0.3840  -0.0663   1.6931     39.2442       0
## 8 14.9897  55.6941   0.5524  -0.1332   1.6595     68.7203       0
## 9 56.2553 845.0556   0.5758  -0.1025   1.7931     34.8847       0

Descriptive Statistics Analytic Tool

For more data analytic tools and visualization code in R check out: https://rpubs.com/nratcliff/

Nathaniel Ratcliff, PhD

2024-03-28

Overview of descriptive statistics tool

Data

Descriptive statistics function

Output