library(haven)
## Warning: package 'haven' was built under R version 4.2.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.2.3
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks rstatix::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.2.3
library(descr)
## Warning: package 'descr' was built under R version 4.2.3
library(labelled)
## Warning: package 'labelled' was built under R version 4.2.3

Descriptive Data Review

Before you ever start analyzing your data, you should always review it closely to understand how the variable types, how they are distributed, if you have missing data and how it is handled. Here we will review a wide variety of techniques to show you techniques to understand and describe your dataset prior to beginning any type of analysis.

For this, we will rely on a built-in R dataset called mtcars. R has a wide variety of built-in data that can be used, and this is one of the class ones.

?mtcars #This gives you general information about the dataset including variable descriptions
## starting httpd help server ... done
head(mtcars) #Displays the first 5 rows of a dataframe in R
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
head(mtcars, 10) #Controls the number of rows to display. Here the first 10 
##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230          22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280          19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
tail(mtcars) #Displays the final 5 rows 
##                 mpg cyl  disp  hp drat    wt qsec vs am gear carb
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.7  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.9  1  1    5    2
## Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.5  0  1    5    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.5  0  1    5    6
## Maserati Bora  15.0   8 301.0 335 3.54 3.570 14.6  0  1    5    8
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.6  1  1    4    2
tail(mtcars, 10) #Displays the final 10 rows
##                   mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## AMC Javelin      15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28       13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9        27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2    26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa     30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L   15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino     19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora    15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E       21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
summary(mtcars) #This code gives wide variety of statistics on each variable in the dataset
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
skim(mtcars) #This gives you similar information as summary but provides additional info like missing data and very high-level of how each variable is distributed 
Data summary
Name mtcars
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 11
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
mpg 0 1 20.09 6.03 10.40 15.43 19.20 22.80 33.90 ▃▇▅▁▂
cyl 0 1 6.19 1.79 4.00 4.00 6.00 8.00 8.00 ▆▁▃▁▇
disp 0 1 230.72 123.94 71.10 120.83 196.30 326.00 472.00 ▇▃▃▃▂
hp 0 1 146.69 68.56 52.00 96.50 123.00 180.00 335.00 ▇▇▆▃▁
drat 0 1 3.60 0.53 2.76 3.08 3.70 3.92 4.93 ▇▃▇▅▁
wt 0 1 3.22 0.98 1.51 2.58 3.33 3.61 5.42 ▃▃▇▁▂
qsec 0 1 17.85 1.79 14.50 16.89 17.71 18.90 22.90 ▃▇▇▂▁
vs 0 1 0.44 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
am 0 1 0.41 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
gear 0 1 3.69 0.74 3.00 3.00 4.00 4.00 5.00 ▇▁▆▁▂
carb 0 1 2.81 1.62 1.00 2.00 2.00 4.00 8.00 ▇▂▅▁▁
mtcars %>%
  get_summary_stats(mpg, hp, gear, type = "common") #This code allows you to specify specific variables to summarize and is useful for really large datasets 
## # A tibble: 3 × 10
##   variable     n   min   max median   iqr   mean     sd    se     ci
##   <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl> <dbl>  <dbl>
## 1 mpg         32  10.4  33.9   19.2  7.38  20.1   6.03   1.06  2.17 
## 2 hp          32  52   335    123   83.5  147.   68.6   12.1  24.7  
## 3 gear        32   3     5      4    1      3.69  0.738  0.13  0.266

If you want to only focus on missingness in your data, the following code chunk shows two ways to do so. We use the Base R sum and colSum functions to count total missing data across the entire data and to look at missingness in individual variables respectively. If the dataset is really large, it can be beneficial to focus on only a few variables at a time. We do that in the final part of the code chunk.

sum(is.na(mtcars))  # Number of missing values
## [1] 0
colSums(is.na(mtcars))  # Missing values by variable
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
colSums(is.na(mtcars[, c("mpg", "gear", "hp")])) #Only include a few variables to view missingness
##  mpg gear   hp 
##    0    0    0

Reviewing Individual Variables

Oftentimes, you will want to review one variable at a time rather than the entire dataset. There are any number of ways to do this. However, there are different techniques to use depending on the type of variable you are reviewing.

We will start using the freq function from the descr package. Freq stands for frequency and is simply a count of how many times each individual value occurs in a specific variable. This code also will produce a histogram of the variable distribution - unless you explicitly repress the plot from being created. Then we will look at how to do this using tidyverse.

For discrete variables - those that take on specific values with a small range - running a frequency table is extremely useful at understanding what type of values are included in that variable. However, for continuous variables, it is not as informative because of the sheer number of values the variable can take on. For these variable types, it is more informative to know the mean and spread around the mean so a different technique is appropriate.

Evenwith a discrete and/or an ordinal variable, the mean and amount of dispersion around can be important to understand. In addition looking at the frequency each value is selected in a variable, you still want to understand its mean and SD.

freq(mtcars$cyl, plot=F) #Variable = number of engine cylinders for each car and is an ordinal variable
## mtcars$cyl 
##       Frequency Percent
## 4            11   34.38
## 6             7   21.88
## 8            14   43.75
## Total        32  100.00
mtcars %>% #Can do the same thing using tidyverse but isn't as efficient 
  count(cyl) %>%
  mutate(percentage = n / sum(n) * 100,
         percentage_label = paste0(round(percentage, 1), "%"))
##   cyl  n percentage percentage_label
## 1   4 11     34.375            34.4%
## 2   6  7     21.875            21.9%
## 3   8 14     43.750            43.8%
freq(mtcars$hp, plot=F) #Variable = Number of horsepower for the individual car. It is continuous. Notice how difficult it is to understand the distribution just looking at frequency table  
## mtcars$hp 
##       Frequency Percent
## 52            1   3.125
## 62            1   3.125
## 65            1   3.125
## 66            2   6.250
## 91            1   3.125
## 93            1   3.125
## 95            1   3.125
## 97            1   3.125
## 105           1   3.125
## 109           1   3.125
## 110           3   9.375
## 113           1   3.125
## 123           2   6.250
## 150           2   6.250
## 175           3   9.375
## 180           3   9.375
## 205           1   3.125
## 215           1   3.125
## 230           1   3.125
## 245           2   6.250
## 264           1   3.125
## 335           1   3.125
## Total        32 100.000
#With continuously distributed variables, better to look at means, sd, etc. 

summary(mtcars$hp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    52.0    96.5   123.0   146.7   180.0   335.0
mtcars %>%
  get_summary_stats(hp, type = "common") #This summarizes the central tendency and its dispersion 
## # A tibble: 1 × 10
##   variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hp          32    52   335    123  83.5  147.  68.6  12.1  24.7
mtcars %>% #Can also do this using tidyverse  
  summarise(mean = mean(hp), #Mean of HP from mtcars
            median=median(hp), #Median of HP from mtcars
         var = var(hp),  #Variance of HP from mtcars
         sd = sd(hp),  #Standard deviation of HP from mtcars
         n=n()) #Count of cases in above calculations 
##       mean median      var       sd  n
## 1 146.6875    123 4700.867 68.56287 32

Wide variety of ways to get the summary information for a continuous variable. Important information to always review is the central tendencies - mean & median (mode is not really used in most data analysis since it doesn’t tell us much about the overall distribution) - as well as dispersion around the mean. Above we saw one way to do this using the get_summary function but there are many others as well.

Reviewing Individual Variable Distributions

While the above gives you general information about your variables, none allow you to review how each variable is distributed around it’s mean. Remember our visualization of skewness where the mean and standard deviation of some continuous variable were identical yet two of the variables were strongly skewed either to the left or right. This fact makes it imperative that you examine the histogram of your variables - especially the continuous ones - prior to starting any analysis. During transformations week later in the semester, you will learn techniques to deal with skewed distributions. But before you can adjust for it, you must know it exists.

Histograms are the most common approach to visualizing a variable’s distribution. We will review two ways to do this: Base R and GGPlot.

hist(mtcars$hp) #This is Base R approach which is informative but not as customizable as GGPlot

#GGPlot makes it easier to fully customize the histogram. Here we start by telling R to break into 30 bins. But that isn't the best number to choose. Why? 
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity", bins = 30, alpha = 0.25, color="black") +
  labs(title = "Histogram Plot of Car Horsepower",
       x = "Horsepower",
       y = "Density") +
  theme_minimal()

ggplot(mtcars, aes(x = hp)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity", bins = 10, alpha = 0.25, color="black") +
  labs(title = "Histogram Plot of Car Horsepower",
       x = "Horsepower",
       y = "Density") +
  theme_minimal()

Both Base R and GGPlot will return the same information - summary of how many cases fall within each bin in the data. However, GGPlot makes it easier to customize the graph and the number of bins.

We start with the histogram for a continuous variable. In the first GGPlot, we use 30 bins. However, that is not an appropriate number to use since we only have 32 observations! We basically are just graphing each individual value at that point. In the second GGPlot, we drop the number of bins to 10 which better reveals how the data is bucketed together.

The number of bins is entirely customizable based on the specific variable in use. When deciding the number of bins to use, do not use too few or too many. There is no specific number of bins that should always be used so use your best judgement. Ultimately, you want to reveal the overall distribution of your variable so that you can understand if you are dealing with a skewed or normally distributed variable.

When dealing with a discrete variable, the number of bins should simply be the number of possible discrete values that variable can take on. For instance, the cyl variable in the mtcars dataset only has 3 possible options. For that variable, you would only want to include 3 bins in your histogram. Once again, this can be done either through Base R or GGPlot.

hist(mtcars$cyl)

ggplot(mtcars, aes(x = cyl)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity", bins = 3, alpha = 0.25, color="black") +
  labs(title = "Histogram Plot of Car Cylinders",
       x = "Car Cylinders",
       y = "Density") +
  theme_minimal()