Introduction
This document demonstrates several ways of generating descriptive statistics in R (base functions plus the psych, pastecs, and dplyr packages), using the diamonds dataset from ggplot2.
library(knitr)
# Global chunk options for the rendered report: tidy the echoed code, fix the
# figure size and alignment, suppress warnings/messages, and echo the code.
knitr::opts_chunk$set(
  tidy = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  fig.width = 10,
  fig.height = 5,
  fig.align = "left",
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)
# Allow printed output to be up to 120 characters wide.
options(width = 120)
library(dplyr)
library(ggplot2)
library(psych)
library(pastecs)
# attach() puts the columns of `diamonds` on the search path so they can be
# referenced by bare name; prefer explicit `diamonds$col` references in new code.
attach(diamonds)
Base Package
# Print the structure of the dataset: class, dimensions, and for every column
# its type plus the first few values.
str(object = diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Show the first rows of the dataset as a markdown table.
# The caption now describes this table (it previously read "Books Read", a
# leftover that had nothing to do with the diamonds data).
kable(
  head(diamonds),
  format = "markdown",
  caption = "First six rows of the diamonds dataset",
  digits = 4
)
| carat | cut       | color | clarity | depth | table | price |    x |    y |    z |
|------:|:----------|:------|:--------|------:|------:|------:|-----:|-----:|-----:|
|  0.23 | Ideal     | E     | SI2     |  61.5 |    55 |   326 | 3.95 | 3.98 | 2.43 |
|  0.21 | Premium   | E     | SI1     |  59.8 |    61 |   326 | 3.89 | 3.84 | 2.31 |
|  0.23 | Good      | E     | VS1     |  56.9 |    65 |   327 | 4.05 | 4.07 | 2.31 |
|  0.29 | Premium   | I     | VS2     |  62.4 |    58 |   334 | 4.20 | 4.23 | 2.63 |
|  0.31 | Good      | J     | SI2     |  63.3 |    58 |   335 | 4.34 | 4.35 | 2.75 |
|  0.24 | Very Good | J     | VVS2    |  62.8 |    57 |   336 | 3.94 | 3.96 | 2.48 |
# Show the last rows of the dataset as a markdown table.
# The caption now describes this table (it previously read "Books Read").
kable(
  tail(diamonds),
  format = "markdown",
  caption = "Last six rows of the diamonds dataset",
  digits = 4
)
| carat | cut       | color | clarity | depth | table | price |    x |    y |    z |
|------:|:----------|:------|:--------|------:|------:|------:|-----:|-----:|-----:|
|  0.72 | Premium   | D     | SI1     |  62.7 |    59 |  2757 | 5.69 | 5.73 | 3.58 |
|  0.72 | Ideal     | D     | SI1     |  60.8 |    57 |  2757 | 5.75 | 5.76 | 3.50 |
|  0.72 | Good      | D     | SI1     |  63.1 |    55 |  2757 | 5.69 | 5.75 | 3.61 |
|  0.70 | Very Good | D     | SI1     |  62.8 |    60 |  2757 | 5.66 | 5.68 | 3.56 |
|  0.86 | Premium   | H     | SI2     |  61.0 |    58 |  2757 | 6.15 | 6.12 | 3.74 |
|  0.75 | Ideal     | D     | SI2     |  62.2 |    55 |  2757 | 5.83 | 5.87 | 3.64 |
# Show the mean of every numeric variable in the dataset.
# The same pattern works with other summary functions such as
# median(), sd(), min() and max().
#
# Column positions are derived from the data rather than hard-coded, so the
# selection stays correct if columns are added or reordered. For `diamonds`
# this yields positions 1 and 5 through 10 (carat, depth, table, price, x, y, z);
# the ordered factors cut, color and clarity are not numeric and are skipped.
numeric_columns <- unname(which(vapply(diamonds, is.numeric, logical(1))))
# vapply() pins the result to one double per column; `na.rm = TRUE` is spelled
# out because `T` can be reassigned.
vapply(diamonds[numeric_columns], mean, numeric(1), na.rm = TRUE)
## carat depth table price x y z
## 0.7979397 61.7494049 57.4571839 3932.7997219 5.7311572 5.7345260 3.5387338
# Tukey's five-number summary of price:
# minimum, lower hinge, median, upper hinge, maximum.
fivenum(diamonds[["price"]])
## [1] 326.0 950.0 2401.0 5324.5 18823.0
# Base R summary(): quartiles and mean for numeric columns, and level counts
# for the categorical (factor) columns.
summary(object = diamonds)
## carat cut color clarity depth table price
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00 Min. :43.00 Min. : 326
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80 Median :57.00 Median : 2401
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75 Mean :57.46 Mean : 3933
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00 Max. :95.00 Max. :18823
## J: 2808 (Other): 2531
## x y z
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.700 Median : 5.710 Median : 3.530
## Mean : 5.731 Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :10.740 Max. :58.900 Max. :31.800
##
psych Package
# psych::describe() is a good alternative for descriptive statistics: one row
# per variable with n, mean, sd, median, trimmed mean, mad, min, max, range,
# skew, kurtosis and standard error.
# The caption now describes this table (it previously read "Books Read").
kable(
  describe(diamonds[numeric_columns]),
  format = "markdown",
  caption = "Descriptive statistics for the numeric diamonds variables (psych::describe)",
  digits = 4
)
|       | vars |     n |      mean |        sd |  median |   trimmed |       mad |   min |      max |    range |    skew | kurtosis |      se |
|:------|-----:|------:|----------:|----------:|--------:|----------:|----------:|------:|---------:|---------:|--------:|---------:|--------:|
| carat |    1 | 53940 |    0.7979 |    0.4740 |    0.70 |    0.7350 |    0.4744 |   0.2 |     5.01 |     4.81 |  1.1166 |   1.2562 |  0.0020 |
| depth |    2 | 53940 |   61.7494 |    1.4326 |   61.80 |   61.7846 |    1.0378 |  43.0 |    79.00 |    36.00 | -0.0823 |   5.7384 |  0.0062 |
| table |    3 | 53940 |   57.4572 |    2.2345 |   57.00 |   57.3184 |    1.4826 |  43.0 |    95.00 |    52.00 |  0.7969 |   2.8013 |  0.0096 |
| price |    4 | 53940 | 3932.7997 | 3989.4397 | 2401.00 | 3158.9924 | 2475.9420 | 326.0 | 18823.00 | 18497.00 |  1.6183 |   2.1772 | 17.1774 |
| x     |    5 | 53940 |    5.7312 |    1.1218 |    5.70 |    5.6601 |    1.3788 |   0.0 |    10.74 |    10.74 |  0.3787 |  -0.6183 |  0.0048 |
| y     |    6 | 53940 |    5.7345 |    1.1421 |    5.71 |    5.6627 |    1.3640 |   0.0 |    58.90 |    58.90 |  2.4340 |  91.2025 |  0.0049 |
| z     |    7 | 53940 |    3.5387 |    0.7057 |    3.53 |    3.4949 |    0.8451 |   0.0 |    31.80 |    31.80 |  1.5223 |  47.0803 |  0.0030 |
# Descriptive statistics for the numeric columns, reported separately for each
# color level. The grouping vector is taken explicitly from the data frame so
# the call does not depend on `color` being visible on the search path.
describeBy(diamonds[numeric_columns], group = diamonds$color)
##
## Descriptive statistics by group
## group: D
## vars n mean sd median trimmed mad min max range skew kurtosis se
## carat 1 6775 0.66 0.36 0.53 0.61 0.31 0.2 3.40 3.20 1.29 2.04 0.00
## depth 2 6775 61.70 1.41 61.80 61.74 1.19 52.2 71.60 19.40 -0.26 2.56 0.02
## table 3 6775 57.40 2.21 57.00 57.26 1.48 52.0 73.00 21.00 0.75 1.09 0.03
## price 4 6775 3169.95 3356.59 1838.00 2457.57 1657.55 357.0 18693.00 18336.00 2.10 4.67 40.78
## x 5 6775 5.42 0.94 5.23 5.34 1.10 0.0 9.42 9.42 0.56 -0.26 0.01
## y 6 6775 5.42 0.94 5.24 5.35 1.10 0.0 9.34 9.34 0.55 -0.28 0.01
## z 7 6775 3.34 0.58 3.22 3.30 0.68 0.0 6.27 6.27 0.55 -0.26 0.01
## ------------------------------------------------------------------------------------------
## group: E
## vars n mean sd median trimmed mad min max range skew kurtosis se
## carat 1 9797 0.66 0.37 0.53 0.61 0.31 0.20 3.05 2.85 1.29 1.85 0.00
## depth 2 9797 61.66 1.44 61.80 61.71 1.19 51.00 79.00 28.00 0.14 8.15 0.01
## table 3 9797 57.49 2.24 57.00 57.34 1.48 44.00 73.00 29.00 0.71 1.27 0.02
## price 4 9797 3076.75 3344.16 1739.00 2349.98 1537.46 326.00 18731.00 18405.00 2.17 4.89 33.79
## x 5 9797 5.41 0.96 5.23 5.34 1.11 3.74 9.26 5.52 0.57 -0.39 0.01
## y 6 9797 5.42 0.99 5.24 5.35 1.11 3.71 31.80 28.09 2.42 50.14 0.01
## z 7 9797 3.34 0.66 3.22 3.29 0.68 2.06 31.80 29.74 8.60 352.76 0.01
## ------------------------------------------------------------------------------------------
## group: F
## vars n mean sd median trimmed mad min max range skew kurtosis se
## carat 1 9542 0.74 0.40 0.70 0.69 0.46 0.2 3.01 2.81 0.99 0.90 0.00
## depth 2 9542 61.69 1.44 61.80 61.73 1.04 52.3 71.00 18.70 -0.16 3.01 0.01
## table 3 9542 57.43 2.26 57.00 57.28 1.48 50.0 95.00 45.00 1.30 9.83 0.02
## price 4 9542 3724.89 3784.99 2343.50 2974.69 2274.31 342.0 18791.00 18449.00 1.75 2.82 38.75
## x 5 9542 5.61 1.01 5.65 5.57 1.25 0.0 9.24 9.24 0.27 -0.45 0.01
## y 6 9542 5.62 1.00 5.65 5.57 1.25 0.0 9.13 9.13 0.28 -0.55 0.01
## z 7 9542 3.46 0.63 3.48 3.43 0.77 0.0 5.73 5.73 0.24 -0.38 0.01
## ------------------------------------------------------------------------------------------
## group: G
## vars n mean sd median trimmed mad min max range skew kurtosis se
## carat 1 11292 0.77 0.44 0.70 0.72 0.49 0.23 3.01 2.78 0.96 0.57 0.00
## depth 2 11292 61.76 1.37 61.80 61.78 1.04 43.00 72.90 29.90 -0.22 9.05 0.01
## table 3 11292 57.29 2.15 57.00 57.16 1.48 52.00 76.00 24.00 0.75 1.45 0.02
## price 4 11292 3999.14 4051.10 2242.00 3245.61 2277.27 354.00 18818.00 18464.00 1.50 1.72 38.12
## x 5 11292 5.68 1.08 5.64 5.61 1.36 0.00 9.44 9.44 0.33 -0.78 0.01
## y 6 11292 5.68 1.08 5.63 5.62 1.35 0.00 9.37 9.37 0.32 -0.79 0.01
## z 7 11292 3.51 0.67 3.48 3.47 0.85 0.00 6.16 6.16 0.25 -0.44 0.01
## ------------------------------------------------------------------------------------------
## group: H
## vars n mean sd median trimmed mad min max range skew kurtosis se
## carat 1 8304 0.91 0.52 0.90 0.86 0.56 0.23 4.13 3.90 0.79 0.34 0.01
## depth 2 8304 61.84 1.44 61.90 61.86 1.04 52.70 71.80 19.10 0.16 3.71 0.02
## table 3 8304 57.52 2.24 57.00 57.39 1.48 50.00 73.00 23.00 0.66 1.17 0.02
## price 4 8304 4486.67 4215.94 3460.00 3755.13 3683.52 337.00 18803.00 18466.00 1.38 1.45 46.26
## x 5 8304 5.98 1.20 6.14 5.94 1.36 0.00 10.00 10.00 0.07 -0.82 0.01
## y 6 8304 5.98 1.32 6.14 5.94 1.35 0.00 58.90 58.90 7.80 309.80 0.01
## z 7 8304 3.70 0.74 3.82 3.67 0.86 0.00 8.06 8.06 0.05 -0.42 0.01
## ------------------------------------------------------------------------------------------
## group: I
## vars n mean sd median trimmed mad min max range skew kurtosis se
## carat 1 5422 1.03 0.58 1.00 0.98 0.73 0.23 4.01 3.78 0.64 -0.15 0.01
## depth 2 5422 61.85 1.46 61.90 61.88 1.04 50.80 71.30 20.50 -0.03 3.80 0.02
## table 3 5422 57.58 2.30 57.00 57.46 1.48 43.00 70.00 27.00 0.57 1.21 0.03
## price 4 5422 5091.87 4722.39 3730.00 4332.86 4067.51 334.00 18823.00 18489.00 1.16 0.42 64.13
## x 5 5422 6.22 1.25 6.35 6.20 1.47 3.94 10.14 6.20 0.02 -1.00 0.02
## y 6 5422 6.22 1.24 6.35 6.20 1.45 3.90 10.10 6.20 0.01 -1.00 0.02
## z 7 5422 3.85 0.77 3.93 3.83 0.92 0.00 6.31 6.31 -0.02 -0.88 0.01
## ------------------------------------------------------------------------------------------
## group: J
## vars n mean sd median trimmed mad min max range skew kurtosis se
## carat 1 2808 1.16 0.60 1.11 1.13 0.61 0.23 5.01 4.78 0.61 0.51 0.01
## depth 2 2808 61.89 1.55 62.00 61.93 1.04 43.00 73.60 30.60 -0.50 10.91 0.03
## table 3 2808 57.81 2.31 58.00 57.73 2.97 51.60 68.00 16.40 0.46 0.54 0.04
## price 4 2808 5323.82 4438.19 4234.00 4721.87 4088.27 335.00 18710.00 18375.00 1.03 0.28 83.75
## x 5 2808 6.52 1.20 6.64 6.54 1.25 3.93 10.74 6.81 -0.15 -0.73 0.02
## y 6 2808 6.52 1.20 6.63 6.54 1.25 3.90 10.54 6.64 -0.15 -0.75 0.02
## z 7 2808 4.03 0.74 4.11 4.05 0.79 2.46 6.98 4.52 -0.13 -0.69 0.01
pastecs Package
# pastecs::stat.desc() returns one column per variable and one row per
# statistic (counts, min/max/range, sum, median, mean, SE, CI, variance,
# standard deviation, coefficient of variation).
# The caption now describes this table (it previously read "Books Read").
kable(
  stat.desc(diamonds[numeric_columns]),
  format = "markdown",
  caption = "Descriptive statistics for the numeric diamonds variables (pastecs::stat.desc)",
  digits = 4
)
|              |      carat |        depth |       table |        price |           x |           y |           z |
|:-------------|-----------:|-------------:|------------:|-------------:|------------:|------------:|------------:|
| nbr.val      | 53940.0000 |   53940.0000 | 5.39400e+04 | 5.394000e+04 |  53940.0000 |  53940.0000 |  53940.0000 |
| nbr.null     |     0.0000 |       0.0000 | 0.00000e+00 | 0.000000e+00 |      8.0000 |      7.0000 |     20.0000 |
| nbr.na       |     0.0000 |       0.0000 | 0.00000e+00 | 0.000000e+00 |      0.0000 |      0.0000 |      0.0000 |
| min          |     0.2000 |      43.0000 | 4.30000e+01 | 3.260000e+02 |      0.0000 |      0.0000 |      0.0000 |
| max          |     5.0100 |      79.0000 | 9.50000e+01 | 1.882300e+04 |     10.7400 |     58.9000 |     31.8000 |
| range        |     4.8100 |      36.0000 | 5.20000e+01 | 1.849700e+04 |     10.7400 |     58.9000 |     31.8000 |
| sum          | 43040.8700 | 3330762.9000 | 3.09924e+06 | 2.121352e+08 | 309138.6200 | 309320.3300 | 190879.3000 |
| median       |     0.7000 |      61.8000 | 5.70000e+01 | 2.401000e+03 |      5.7000 |      5.7100 |      3.5300 |
| mean         |     0.7979 |      61.7494 | 5.74572e+01 | 3.932800e+03 |      5.7312 |      5.7345 |      3.5387 |
| SE.mean      |     0.0020 |       0.0062 | 9.60000e-03 | 1.717740e+01 |      0.0048 |      0.0049 |      0.0030 |
| CI.mean.0.95 |     0.0040 |       0.0121 | 1.89000e-02 | 3.366780e+01 |      0.0095 |      0.0096 |      0.0060 |
| var          |     0.2247 |       2.0524 | 4.99290e+00 | 1.591563e+07 |      1.2583 |      1.3045 |      0.4980 |
| std.dev      |     0.4740 |       1.4326 | 2.23450e+00 | 3.989440e+03 |      1.1218 |      1.1421 |      0.7057 |
| coef.var     |     0.5940 |       0.0232 | 3.89000e-02 | 1.014400e+00 |      0.1957 |      0.1992 |      0.1994 |
dplyr Package
# Standard error of the mean: the sample standard deviation of the non-missing
# values of `x`, divided by the square root of how many non-missing values
# there are. Missing values are dropped before both quantities are computed.
se <- function(x) {
  observed <- stats::na.omit(x)
  n_obs <- length(observed)
  stats::sd(observed) / sqrt(n_obs)
}
# Descriptive statistics of price for each color level: one row per color with
# count, mean, median, min, max, range, standard deviation, standard error of
# the mean, skew and kurtosis. Columns receive their display names directly;
# `Range` is computed from the `Max` and `Min` columns created just before it.
price_by_color <- diamonds %>%
  group_by(color) %>%
  summarise(
    Count = n(),
    Mean = mean(price),
    Median = median(price),
    Min = min(price),
    Max = max(price),
    Range = Max - Min,
    `Std Dev` = sd(price),
    `Std Err` = se(price),
    Skew = skew(price),
    Kurtosis = kurtosi(price)
  ) %>%
  rename(Color = color)
kable(price_by_color, format = "markdown", digits = 4)
| Color | Count |     Mean | Median | Min |   Max | Range |  Std Dev | Std Err |   Skew | Kurtosis |
|:------|------:|---------:|-------:|----:|------:|------:|---------:|--------:|-------:|---------:|
| D     |  6775 | 3169.954 | 1838.0 | 357 | 18693 | 18336 | 3356.591 | 40.7797 | 2.1046 |   4.6674 |
| E     |  9797 | 3076.753 | 1739.0 | 326 | 18731 | 18405 | 3344.159 | 33.7863 | 2.1692 |   4.8945 |
| F     |  9542 | 3724.886 | 2343.5 | 342 | 18791 | 18449 | 3784.992 | 38.7476 | 1.7543 |   2.8169 |
| G     | 11292 | 3999.136 | 2242.0 | 354 | 18818 | 18464 | 4051.103 | 38.1231 | 1.4989 |   1.7212 |
| H     |  8304 | 4486.669 | 3460.0 | 337 | 18803 | 18466 | 4215.944 | 46.2649 | 1.3825 |   1.4460 |
| I     |  5422 | 5091.875 | 3730.0 | 334 | 18823 | 18489 | 4722.388 | 64.1330 | 1.1579 |   0.4183 |
| J     |  2808 | 5323.818 | 4234.0 | 335 | 18710 | 18375 | 4438.187 | 83.7543 | 1.0343 |   0.2794 |
# Percentiles of price, from the minimum through the deciles and the 95th
# percentile to the maximum, as a one-row table.
# NOTE(review): despite the `price_by_color_` prefix there is no group_by(color)
# in this pipeline, so the percentiles cover all diamonds together; confirm
# whether a per-color breakdown was intended.
# `probs` is spelled out in full: the original `prob =` only worked through
# partial argument matching.
price_by_color_percentiles <- diamonds %>%
  summarise(
    `Min` = min(price),
    `10%` = quantile(price, probs = 0.10),
    `20%` = quantile(price, probs = 0.20),
    `30%` = quantile(price, probs = 0.30),
    `40%` = quantile(price, probs = 0.40),
    `50% / Median` = quantile(price, probs = 0.50),
    `60%` = quantile(price, probs = 0.60),
    `70%` = quantile(price, probs = 0.70),
    `80%` = quantile(price, probs = 0.80),
    `90%` = quantile(price, probs = 0.90),
    `95%` = quantile(price, probs = 0.95),
    `Max` = max(price)
  )
kable(price_by_color_percentiles, format = "markdown", digits = 1)
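| Min | 10% | 20% |  30% |  40% | 50% / Median |  60% |  70% |    80% |  90% |     95% |   Max |
|----:|----:|----:|-----:|-----:|-------------:|-----:|-----:|-------:|-----:|--------:|------:|
| 326 | 646 | 837 | 1087 | 1698 |         2401 | 3465 | 4662 | 6301.2 | 9821 | 13107.1 | 18823 |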
Programming Environment
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2 pastecs_1.3-18 boot_1.3-20 psych_1.7.8 ggplot2_2.2.1 dplyr_0.7.4 knitr_1.17
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.13 compiler_3.4.2 formatR_1.5 plyr_1.8.4 highr_0.6 bindr_0.1
## [7] tools_3.4.2 digest_0.6.12 evaluate_0.10.1 tibble_1.3.4 gtable_0.2.0 nlme_3.1-131
## [13] lattice_0.20-35 pkgconfig_2.0.1 rlang_0.1.2 yaml_2.1.14 parallel_3.4.2 stringr_1.2.0
## [19] rprojroot_1.2 grid_3.4.2 glue_1.2.0 R6_2.2.2 foreign_0.8-69 rmarkdown_1.6
## [25] magrittr_1.5 backports_1.1.1 scales_0.5.0 htmltools_0.3.6 assertthat_0.2.0 mnormt_1.5-5
## [31] colorspace_1.3-2 stringi_1.1.5 lazyeval_0.2.1 munsell_0.4.3