Introduction

Categorical data

Tables

  • Create a data vector of \(200\) samples
  • Display the count of each unique value using the table() command
# Fix the RNG seed so the random draw below is reproducible
set.seed(1)
# Draw 200 group labels, with replacement, from the four categories
groups <- sample(
  x = c("I", "II", "III", "IV"),
  size = 200,
  replace = TRUE
)
# Tabulate how many times each label was drawn
table(groups)
## groups
##   I  II III  IV 
##  41  57  53  49

Bar plot

  • Use individual bars for each unique data point value
  • Bar height indicates count
# Bar chart with one bar per group; bar height is the group's count
barplot(
  height = table(groups),
  col = "deepskyblue",
  las = 1,
  main = "Sample size for each grade",
  xlab = "Grade",
  ylab = "Count"
)

* ALWAYS display the full y axis: starting it above zero (as in the next plot) exaggerates the differences between the bars

# Counter-example: same counts as before, but the y axis is restricted to
# 40-60 (ylim) and the bars are clipped to the plot region (xpd = FALSE),
# so the differences between groups look much larger than they are
barplot(
  height = table(groups),
  ylim = c(40, 60),
  xpd = FALSE,
  col = "deepskyblue",
  las = 1,
  main = "This is intentionally misleading",
  xlab = "Grade",
  ylab = "Count"
)

  • An interactive plot using plotly
# Build the bar trace first: group labels on x, their counts on y
p <- plot_ly(
  x = names(table(groups)),
  y = as.numeric(table(groups)),
  type = "bar",
  marker = list(color = "deepskyblue")
)
# Then attach the title and axis titles, with the zero lines switched off
p <- layout(
  p,
  title = "Sample size for each grade",
  xaxis = list(title = "Grade", zeroline = FALSE),
  yaxis = list(title = "Count", zeroline = FALSE)
)
# Display the finished plot
p

Factors

  • Specify a variable as categorical
  • Create a data vector of pain scale values (\(0\) through \(5\))
# Raw pain scores (0 through 5), stored as plain numbers for now
pain <- c(
  1, 4, 3, 2, 2, 2, 1, 2, 3, 0, 5, 5,
  5, 5, 4, 3, 4, 0, 2, 1, 3, 3, 1
)
  • Change to categorical values
# Convert the numeric scores to a categorical variable. The levels are
# listed explicitly so that every point on the 0-5 pain scale is a level,
# even one that happens to have no observations: it then shows up as a
# zero count in the table instead of silently disappearing.
pain <- factor(pain, levels = 0:5)
# Count the observations at each pain level
table(pain)
## pain
## 0 1 2 3 4 5 
## 2 4 5 5 3 4

Numerical data

# Simulate 100 white cell counts: normal draws with mean 15 and standard
# deviation 4, rounded to one decimal place. Seeding the RNG first makes
# the simulated values reproducible.
set.seed(1)
wcc <- round(rnorm(100, mean = 15, sd = 4), digits = 1)

Measures of central tendency

  • Mean
# Arithmetic mean of the simulated white cell counts
mean(wcc)
## [1] 15.437
  • Median
# Median (middle value) of the simulated white cell counts
median(wcc)
## [1] 15.45

Measures of dispersion

  • Minimum min()
# Smallest white cell count in the sample
min(wcc)
## [1] 6.1
  • Maximum max()
# Largest white cell count in the sample
max(wcc)
## [1] 24.6
  • Range range()
# Minimum and maximum returned together as a two-element vector
range(wcc)
## [1]  6.1 24.6
  • Variance var()
# Variance of the white cell counts
var(wcc)
## [1] 12.91528
  • Standard deviation sd()
# Standard deviation of the white cell counts
sd(wcc)
## [1] 3.593784
  • Quantiles quantile()
# With no probabilities given, quantile() reports the 0%, 25%, 50%, 75%
# and 100% points
quantile(wcc)
##     0%    25%    50%    75%   100% 
##  6.100 13.025 15.450 17.800 24.600
  • Specifying the quantiles
# Request specific quantiles: here the 10% and 90% points
quantile(wcc,
         c(0.1, 0.9))
##   10%   90% 
## 10.77 19.71
  • Interquartile range IQR()
# Interquartile range: the distance between the 25% and 75% quantiles
IQR(wcc)
## [1] 4.775
  • Proportions
    • Proportion of wcc values larger than \(20\)
# The mean of a logical vector is the share of TRUE values, i.e. the
# proportion of counts above 20
mean(wcc > 20)
## [1] 0.09
  • z scores (standard deviations away from the mean)
# Standardise every value: subtract the mean, then divide by the standard
# deviation, giving each count's distance from the mean in SD units
(wcc - mean(wcc)) / sd(wcc)
##   [1] -0.81724443  0.07318192 -1.03985102  1.65925384  0.24013685
##   [6] -1.03985102  0.40709179  0.71317585  0.51839509 -0.45550873
##  [11]  1.54795055  0.32361432 -0.81724443 -2.59809712  1.13056320
##  [16] -0.17725049 -0.14942467  0.93578244  0.79665332  0.54622091
##  [21]  0.90795661  0.74100167 -0.03812138 -2.34766471  0.57404673
##  [26] -0.17725049 -0.28855379 -1.76332242 -0.65028949  0.35144015
##  [31]  1.38099561 -0.23290214  0.32361432 -0.17725049 -1.65201913
##  [36] -0.59463784 -0.56681202 -0.17725049  1.10273738  0.74100167
##  [41] -0.31637961 -0.39985708  0.65752420  0.49056926 -0.90072190
##  [46] -0.90072190  0.29578850  0.74100167 -0.23290214  0.85230497
##  [51]  0.32361432 -0.78941861  0.26796268 -1.37376089  1.46447308
##  [56]  2.07664119 -0.53898620 -1.29028343  0.51839509 -0.26072796
##  [61]  2.54968019 -0.17725049  0.65752420 -0.09377302 -0.95637355
##  [66]  0.10100774 -2.12505812  1.52012473  0.04535609  2.29924778
##  [71]  0.40709179 -0.90072190  0.54622091 -1.15115431 -1.51289001
##  [76]  0.21231103 -0.62246367 -0.12159885 -0.03812138 -0.78941861
##  [81] -0.76159278 -0.26072796  1.18621485 -1.81897407  0.54622091
##  [86]  0.24013685  1.07491155 -0.45550873  0.29578850  0.18448521
##  [91] -0.73376696  1.21404067  1.15838902  0.65752420  1.63142802
##  [96]  0.49056926 -1.54071583 -0.76159278 -1.48506419 -0.65028949

Summary statistics

  • Most common descriptive statistics
# One-call overview: minimum, quartiles, median, mean and maximum
summary(wcc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.10   13.03   15.45   15.44   17.80   24.60

Histogram

  • Show counts by binning the numerical variable
# Histogram of the white cell counts; bar height is the number of
# observations falling in each bin
hist(
  x = wcc,
  col = "orange",
  las = 1,
  main = "White cell count distribution",
  xlab = "White cell count",
  ylab = "Count"
)

  • Probability density
    • Rug plot added
# Same histogram on the probability-density scale: bar areas (not heights)
# sum to 1, so the y axis is labelled as a density rather than a fraction.
# The argument is spelled out in full as `probability` instead of relying
# on partial matching of `prob`.
hist(wcc,
     main = "White cell count distribution",
     xlab = "White cell count",
     ylab = "Density",
     col = "orange",
     las = 1,
     probability = TRUE)
# Add a tick on the x axis for every individual observation
rug(wcc)

Box plots

  • View data distribution by quartiles and possible outliers
# Box plot of the white cell counts: shows the quartiles and flags
# possible outliers
boxplot(
  x = wcc,
  col = "green",
  las = 1,
  main = "White cell count",
  xlab = "White cell count",
  ylab = "Distribution"
)