Introduction

This document demonstrates several ways of generating descriptive statistics in R, using the diamonds dataset from the ggplot2 package as the example data.

library(knitr)
# Global defaults applied to every code chunk in the rendered document.
knitr::opts_chunk$set(
  tidy = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  fig.width = 10,
  fig.height = 5,
  fig.align = 'left',
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)
# Use a 120-character line width for printed output.
options(width = 120)
library(dplyr)
library(ggplot2)
library(psych)
library(pastecs)
# NOTE(review): attach() puts the columns of `diamonds` on the search path so
# that bare names such as `color` resolve outside of dplyr verbs. Prefer
# explicit `diamonds$...` references and drop this call once no code in the
# document depends on it.
attach(diamonds)

Base Package

# show the structure of the dataset: its classes, its dimensions, and each
# column's type together with the first few values
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# show the first rows of the dataset as a markdown table
# (the caption previously read "Books Read", which does not describe this data)
kable(head(diamonds), 
      format='markdown', 
      caption="First rows of the diamonds dataset",
      digits=4)
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# show the last rows of the dataset as a markdown table
# (the caption previously read "Books Read", which does not describe this data)
kable(tail(diamonds), 
      format='markdown', 
      caption="Last rows of the diamonds dataset",
      digits=4)
carat cut color clarity depth table price x y z
0.72 Premium D SI1 62.7 59 2757 5.69 5.73 3.58
0.72 Ideal D SI1 60.8 57 2757 5.75 5.76 3.50
0.72 Good D SI1 63.1 55 2757 5.69 5.75 3.61
0.70 Very Good D SI1 62.8 60 2757 5.66 5.68 3.56
0.86 Premium H SI2 61.0 58 2757 6.15 6.12 3.74
0.75 Ideal D SI2 62.2 55 2757 5.83 5.87 3.64
# show the mean of every numeric variable in the dataset.
# This technique could also be used with other functions like
#   median()
#   sd()
#   min()
#   max()
# Select the numeric columns by type rather than by hard-coded position; for
# diamonds this yields positions 1 and 5 to 10 (carat, depth, table, price,
# x, y, z). The result is reused by the later sections of this document.
numeric_columns = unname(which(vapply(diamonds, is.numeric, logical(1))))
# vapply() enforces exactly one numeric value per column, whereas the return
# type of sapply() depends on its input.
vapply(diamonds[numeric_columns], mean, numeric(1), na.rm = TRUE)
##        carat        depth        table        price            x            y            z 
##    0.7979397   61.7494049   57.4571839 3932.7997219    5.7311572    5.7345260    3.5387338
# show Tukey's five number summary (min, lower hinge, median, upper hinge, max);
# the hinges are close to, but not always identical to, the 1st and 3rd
# quartiles reported by summary()
fivenum(diamonds$price)
## [1]   326.0   950.0  2401.0  5324.5 18823.0
# base summary function; reports min, quartiles, median, mean and max for
# numeric variables and level counts for categorical (factor) variables
summary(diamonds)
##      carat               cut        color        clarity          depth           table           price      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00   Min.   :43.00   Min.   :  326  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80   Median :57.00   Median : 2401  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75   Mean   :57.46   Mean   : 3933  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00   Max.   :95.00   Max.   :18823  
##                                     J: 2808   (Other): 2531                                                  
##        x                y                z         
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.710   1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.700   Median : 5.710   Median : 3.530  
##  Mean   : 5.731   Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :10.740   Max.   :58.900   Max.   :31.800  
## 

psych Package

# good alternative library for descriptive statistics: describe() returns one
# row of statistics per numeric column
# (the caption previously read "Books Read", which does not describe this data)
kable(describe(diamonds[numeric_columns]), 
      format='markdown', 
      caption="Descriptive statistics of the numeric diamonds variables (psych)",
      digits=4)
vars n mean sd median trimmed mad min max range skew kurtosis se
carat 1 53940 0.7979 0.4740 0.70 0.7350 0.4744 0.2 5.01 4.81 1.1166 1.2562 0.0020
depth 2 53940 61.7494 1.4326 61.80 61.7846 1.0378 43.0 79.00 36.00 -0.0823 5.7384 0.0062
table 3 53940 57.4572 2.2345 57.00 57.3184 1.4826 43.0 95.00 52.00 0.7969 2.8013 0.0096
price 4 53940 3932.7997 3989.4397 2401.00 3158.9924 2475.9420 326.0 18823.00 18497.00 1.6183 2.1772 17.1774
x 5 53940 5.7312 1.1218 5.70 5.6601 1.3788 0.0 10.74 10.74 0.3787 -0.6183 0.0048
y 6 53940 5.7345 1.1421 5.71 5.6627 1.3640 0.0 58.90 58.90 2.4340 91.2025 0.0049
z 7 53940 3.5387 0.7057 3.53 3.4949 0.8451 0.0 31.80 31.80 1.5223 47.0803 0.0030
# the same statistics, computed separately for each diamond color; the grouping
# column is referenced explicitly so the call does not depend on the dataset
# having been attach()ed
describeBy(diamonds[numeric_columns], group=diamonds$color)
## 
##  Descriptive statistics by group 
## group: D
##       vars    n    mean      sd  median trimmed     mad   min      max    range  skew kurtosis    se
## carat    1 6775    0.66    0.36    0.53    0.61    0.31   0.2     3.40     3.20  1.29     2.04  0.00
## depth    2 6775   61.70    1.41   61.80   61.74    1.19  52.2    71.60    19.40 -0.26     2.56  0.02
## table    3 6775   57.40    2.21   57.00   57.26    1.48  52.0    73.00    21.00  0.75     1.09  0.03
## price    4 6775 3169.95 3356.59 1838.00 2457.57 1657.55 357.0 18693.00 18336.00  2.10     4.67 40.78
## x        5 6775    5.42    0.94    5.23    5.34    1.10   0.0     9.42     9.42  0.56    -0.26  0.01
## y        6 6775    5.42    0.94    5.24    5.35    1.10   0.0     9.34     9.34  0.55    -0.28  0.01
## z        7 6775    3.34    0.58    3.22    3.30    0.68   0.0     6.27     6.27  0.55    -0.26  0.01
## ------------------------------------------------------------------------------------------ 
## group: E
##       vars    n    mean      sd  median trimmed     mad    min      max    range skew kurtosis    se
## carat    1 9797    0.66    0.37    0.53    0.61    0.31   0.20     3.05     2.85 1.29     1.85  0.00
## depth    2 9797   61.66    1.44   61.80   61.71    1.19  51.00    79.00    28.00 0.14     8.15  0.01
## table    3 9797   57.49    2.24   57.00   57.34    1.48  44.00    73.00    29.00 0.71     1.27  0.02
## price    4 9797 3076.75 3344.16 1739.00 2349.98 1537.46 326.00 18731.00 18405.00 2.17     4.89 33.79
## x        5 9797    5.41    0.96    5.23    5.34    1.11   3.74     9.26     5.52 0.57    -0.39  0.01
## y        6 9797    5.42    0.99    5.24    5.35    1.11   3.71    31.80    28.09 2.42    50.14  0.01
## z        7 9797    3.34    0.66    3.22    3.29    0.68   2.06    31.80    29.74 8.60   352.76  0.01
## ------------------------------------------------------------------------------------------ 
## group: F
##       vars    n    mean      sd  median trimmed     mad   min      max    range  skew kurtosis    se
## carat    1 9542    0.74    0.40    0.70    0.69    0.46   0.2     3.01     2.81  0.99     0.90  0.00
## depth    2 9542   61.69    1.44   61.80   61.73    1.04  52.3    71.00    18.70 -0.16     3.01  0.01
## table    3 9542   57.43    2.26   57.00   57.28    1.48  50.0    95.00    45.00  1.30     9.83  0.02
## price    4 9542 3724.89 3784.99 2343.50 2974.69 2274.31 342.0 18791.00 18449.00  1.75     2.82 38.75
## x        5 9542    5.61    1.01    5.65    5.57    1.25   0.0     9.24     9.24  0.27    -0.45  0.01
## y        6 9542    5.62    1.00    5.65    5.57    1.25   0.0     9.13     9.13  0.28    -0.55  0.01
## z        7 9542    3.46    0.63    3.48    3.43    0.77   0.0     5.73     5.73  0.24    -0.38  0.01
## ------------------------------------------------------------------------------------------ 
## group: G
##       vars     n    mean      sd  median trimmed     mad    min      max    range  skew kurtosis    se
## carat    1 11292    0.77    0.44    0.70    0.72    0.49   0.23     3.01     2.78  0.96     0.57  0.00
## depth    2 11292   61.76    1.37   61.80   61.78    1.04  43.00    72.90    29.90 -0.22     9.05  0.01
## table    3 11292   57.29    2.15   57.00   57.16    1.48  52.00    76.00    24.00  0.75     1.45  0.02
## price    4 11292 3999.14 4051.10 2242.00 3245.61 2277.27 354.00 18818.00 18464.00  1.50     1.72 38.12
## x        5 11292    5.68    1.08    5.64    5.61    1.36   0.00     9.44     9.44  0.33    -0.78  0.01
## y        6 11292    5.68    1.08    5.63    5.62    1.35   0.00     9.37     9.37  0.32    -0.79  0.01
## z        7 11292    3.51    0.67    3.48    3.47    0.85   0.00     6.16     6.16  0.25    -0.44  0.01
## ------------------------------------------------------------------------------------------ 
## group: H
##       vars    n    mean      sd  median trimmed     mad    min      max    range skew kurtosis    se
## carat    1 8304    0.91    0.52    0.90    0.86    0.56   0.23     4.13     3.90 0.79     0.34  0.01
## depth    2 8304   61.84    1.44   61.90   61.86    1.04  52.70    71.80    19.10 0.16     3.71  0.02
## table    3 8304   57.52    2.24   57.00   57.39    1.48  50.00    73.00    23.00 0.66     1.17  0.02
## price    4 8304 4486.67 4215.94 3460.00 3755.13 3683.52 337.00 18803.00 18466.00 1.38     1.45 46.26
## x        5 8304    5.98    1.20    6.14    5.94    1.36   0.00    10.00    10.00 0.07    -0.82  0.01
## y        6 8304    5.98    1.32    6.14    5.94    1.35   0.00    58.90    58.90 7.80   309.80  0.01
## z        7 8304    3.70    0.74    3.82    3.67    0.86   0.00     8.06     8.06 0.05    -0.42  0.01
## ------------------------------------------------------------------------------------------ 
## group: I
##       vars    n    mean      sd  median trimmed     mad    min      max    range  skew kurtosis    se
## carat    1 5422    1.03    0.58    1.00    0.98    0.73   0.23     4.01     3.78  0.64    -0.15  0.01
## depth    2 5422   61.85    1.46   61.90   61.88    1.04  50.80    71.30    20.50 -0.03     3.80  0.02
## table    3 5422   57.58    2.30   57.00   57.46    1.48  43.00    70.00    27.00  0.57     1.21  0.03
## price    4 5422 5091.87 4722.39 3730.00 4332.86 4067.51 334.00 18823.00 18489.00  1.16     0.42 64.13
## x        5 5422    6.22    1.25    6.35    6.20    1.47   3.94    10.14     6.20  0.02    -1.00  0.02
## y        6 5422    6.22    1.24    6.35    6.20    1.45   3.90    10.10     6.20  0.01    -1.00  0.02
## z        7 5422    3.85    0.77    3.93    3.83    0.92   0.00     6.31     6.31 -0.02    -0.88  0.01
## ------------------------------------------------------------------------------------------ 
## group: J
##       vars    n    mean      sd  median trimmed     mad    min      max    range  skew kurtosis    se
## carat    1 2808    1.16    0.60    1.11    1.13    0.61   0.23     5.01     4.78  0.61     0.51  0.01
## depth    2 2808   61.89    1.55   62.00   61.93    1.04  43.00    73.60    30.60 -0.50    10.91  0.03
## table    3 2808   57.81    2.31   58.00   57.73    2.97  51.60    68.00    16.40  0.46     0.54  0.04
## price    4 2808 5323.82 4438.19 4234.00 4721.87 4088.27 335.00 18710.00 18375.00  1.03     0.28 83.75
## x        5 2808    6.52    1.20    6.64    6.54    1.25   3.93    10.74     6.81 -0.15    -0.73  0.02
## y        6 2808    6.52    1.20    6.63    6.54    1.25   3.90    10.54     6.64 -0.15    -0.75  0.02
## z        7 2808    4.03    0.74    4.11    4.05    0.79   2.46     6.98     4.52 -0.13    -0.69  0.01

pastecs Package

# another library: stat.desc() returns one column per variable and one row per
# statistic
# (the caption previously read "Books Read", which does not describe this data)
kable(stat.desc(diamonds[numeric_columns]), 
      format='markdown', 
      caption="Descriptive statistics of the numeric diamonds variables (pastecs)",
      digits=4)
carat depth table price x y z
nbr.val 53940.0000 53940.0000 5.39400e+04 5.394000e+04 53940.0000 53940.0000 53940.0000
nbr.null 0.0000 0.0000 0.00000e+00 0.000000e+00 8.0000 7.0000 20.0000
nbr.na 0.0000 0.0000 0.00000e+00 0.000000e+00 0.0000 0.0000 0.0000
min 0.2000 43.0000 4.30000e+01 3.260000e+02 0.0000 0.0000 0.0000
max 5.0100 79.0000 9.50000e+01 1.882300e+04 10.7400 58.9000 31.8000
range 4.8100 36.0000 5.20000e+01 1.849700e+04 10.7400 58.9000 31.8000
sum 43040.8700 3330762.9000 3.09924e+06 2.121352e+08 309138.6200 309320.3300 190879.3000
median 0.7000 61.8000 5.70000e+01 2.401000e+03 5.7000 5.7100 3.5300
mean 0.7979 61.7494 5.74572e+01 3.932800e+03 5.7312 5.7345 3.5387
SE.mean 0.0020 0.0062 9.60000e-03 1.717740e+01 0.0048 0.0049 0.0030
CI.mean.0.95 0.0040 0.0121 1.89000e-02 3.366780e+01 0.0095 0.0096 0.0060
var 0.2247 2.0524 4.99290e+00 1.591563e+07 1.2583 1.3045 0.4980
std.dev 0.4740 1.4326 2.23450e+00 3.989440e+03 1.1218 1.1421 0.7057
coef.var 0.5940 0.0232 3.89000e-02 1.014400e+00 0.1957 0.1992 0.1994

dplyr Package

# Standard error of the mean: the standard deviation of the non-missing values
# divided by the square root of how many non-missing values there are.
#   x: values to summarise; missing values are dropped before computing
se = function(x) {
  complete = stats::na.omit(x)
  n = length(complete)
  stats::sd(complete) / sqrt(n)
}

# descriptive statistics of price for each diamond color.
# Each column is created under its display name, so no separate renaming step
# is needed. Range is derived from the Max and Min columns computed just
# before it. se() is the helper defined earlier in this document; skew() and
# kurtosi() come from the psych package.
price_by_color = diamonds %>%
  group_by(color) %>%
  summarise(
    Count = n(),
    Mean = mean(price),
    Median = median(price),
    Min = min(price),
    Max = max(price),
    Range = Max - Min,
    `Std Dev` = sd(price),
    `Std Err` = se(price),
    Skew = skew(price),
    Kurtosis = kurtosi(price)
  ) %>%
  rename(Color = color)
kable(price_by_color, format='markdown', digits=4)
Color Count Mean Median Min Max Range Std Dev Std Err Skew Kurtosis
D 6775 3169.954 1838.0 357 18693 18336 3356.591 40.7797 2.1046 4.6674
E 9797 3076.753 1739.0 326 18731 18405 3344.159 33.7863 2.1692 4.8945
F 9542 3724.886 2343.5 342 18791 18449 3784.992 38.7476 1.7543 2.8169
G 11292 3999.136 2242.0 354 18818 18464 4051.103 38.1231 1.4989 1.7212
H 8304 4486.669 3460.0 337 18803 18466 4215.944 46.2649 1.3825 1.4460
I 5422 5091.875 3730.0 334 18823 18489 4722.388 64.1330 1.1579 0.4183
J 2808 5323.818 4234.0 335 18710 18375 4438.187 83.7543 1.0343 0.2794
# percentiles of price over the whole dataset (a single summary row).
# NOTE(review): despite the variable name, no group_by(color) is applied here;
# confirm whether per-color percentiles were intended.
# quantile()'s argument is spelled out as `probs` instead of relying on
# partial matching of `prob`.
price_by_color_percentiles = diamonds %>%
    summarise('Min' = min(price),
              '10%' = quantile(price,probs=.10),
              '20%' = quantile(price,probs=.20),
              '30%' = quantile(price,probs=.30),
              '40%' = quantile(price,probs=.40),
              '50% / Median' = quantile(price,probs=.50),
              '60%' = quantile(price,probs=.60),
              '70%' = quantile(price,probs=.70),
              '80%' = quantile(price,probs=.80),
              '90%' = quantile(price,probs=.90),
              '95%' = quantile(price,probs=.95),
              'Max' = max(price))
kable(price_by_color_percentiles, format='markdown', digits=1)
Min 10% 20% 30% 40% 50% / Median 60% 70% 80% 90% 95% Max
326 646 837 1087 1698 2401 3465 4662 6301.2 9821 13107.1 18823

Programming Environment

# record the R version, platform, and attached/loaded package versions used to
# render this document
sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2   pastecs_1.3-18 boot_1.3-20    psych_1.7.8    ggplot2_2.2.1  dplyr_0.7.4    knitr_1.17    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.13     compiler_3.4.2   formatR_1.5      plyr_1.8.4       highr_0.6        bindr_0.1       
##  [7] tools_3.4.2      digest_0.6.12    evaluate_0.10.1  tibble_1.3.4     gtable_0.2.0     nlme_3.1-131    
## [13] lattice_0.20-35  pkgconfig_2.0.1  rlang_0.1.2      yaml_2.1.14      parallel_3.4.2   stringr_1.2.0   
## [19] rprojroot_1.2    grid_3.4.2       glue_1.2.0       R6_2.2.2         foreign_0.8-69   rmarkdown_1.6   
## [25] magrittr_1.5     backports_1.1.1  scales_0.5.0     htmltools_0.3.6  assertthat_0.2.0 mnormt_1.5-5    
## [31] colorspace_1.3-2 stringi_1.1.5    lazyeval_0.2.1   munsell_0.4.3