Introduction to Data in R

DataCamp: Statistics with R

Bonnie Cooper

Notes from videos and exercises. This is a casual follow along to help me remember all the things and to serve as future reference.

Language of Data

library( openintro )
library( dplyr )
library( ggplot2 )
library( gapminder )
data( hsb2 )
head( hsb2 )
## # A tibble: 6 x 11
##      id gender race  ses    schtyp prog        read write  math science socst
##   <int> <chr>  <chr> <fct>  <fct>  <fct>      <int> <int> <int>   <int> <int>
## 1    70 male   white low    public general       57    52    41      47    57
## 2   121 female white middle public vocational    68    59    53      63    61
## 3    86 male   white high   public general       44    33    54      58    31
## 4   141 male   white high   public vocational    63    44    47      53    56
## 5   172 male   white middle public academic      47    52    57      53    61
## 6   113 male   white middle public academic      44    52    51      63    61
str( hsb2 )
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
##  $ id     : int [1:200] 70 121 86 141 172 113 50 11 84 48 ...
##  $ gender : chr [1:200] "male" "female" "male" "male" ...
##  $ race   : chr [1:200] "white" "white" "white" "white" ...
##  $ ses    : Factor w/ 3 levels "low","middle",..: 1 2 3 3 2 2 2 2 2 2 ...
##  $ schtyp : Factor w/ 2 levels "public","private": 1 1 1 1 1 1 1 1 1 1 ...
##  $ prog   : Factor w/ 3 levels "general","academic",..: 1 3 1 3 2 2 1 2 1 2 ...
##  $ read   : int [1:200] 57 68 44 63 47 44 50 34 63 57 ...
##  $ write  : int [1:200] 52 59 33 44 52 52 59 46 57 55 ...
##  $ math   : int [1:200] 41 53 54 47 57 51 42 45 54 52 ...
##  $ science: int [1:200] 47 63 58 53 53 63 53 39 58 50 ...
##  $ socst  : int [1:200] 57 61 31 56 61 61 61 36 51 51 ...
glimpse( hsb2 )
## Rows: 200
## Columns: 11
## $ id      <int> 70, 121, 86, 141, 172, 113, 50, 11, 84, 48, 75, 60, 95, 104, …
## $ gender  <chr> "male", "female", "male", "male", "male", "male", "male", "ma…
## $ race    <chr> "white", "white", "white", "white", "white", "white", "africa…
## $ ses     <fct> low, middle, high, high, middle, middle, middle, middle, midd…
## $ schtyp  <fct> public, public, public, public, public, public, public, publi…
## $ prog    <fct> general, vocational, general, vocational, academic, academic,…
## $ read    <int> 57, 68, 44, 63, 47, 44, 50, 34, 63, 57, 60, 57, 73, 54, 45, 4…
## $ write   <int> 52, 59, 33, 44, 52, 52, 59, 46, 57, 55, 46, 65, 60, 63, 57, 4…
## $ math    <int> 41, 53, 54, 47, 57, 51, 42, 45, 54, 52, 51, 51, 71, 57, 50, 4…
## $ science <int> 47, 63, 58, 53, 53, 63, 53, 39, 58, 50, 53, 63, 61, 55, 31, 5…
## $ socst   <int> 57, 61, 31, 56, 61, 61, 61, 36, 51, 51, 61, 61, 71, 46, 56, 5…
data( email50 )
str( email50 )
## tibble [50 × 21] (S3: tbl_df/tbl/data.frame)
##  $ spam        : num [1:50] 0 0 1 0 0 0 0 0 0 0 ...
##  $ to_multiple : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
##  $ from        : num [1:50] 1 1 1 1 1 1 1 1 1 1 ...
##  $ cc          : int [1:50] 0 0 4 0 0 0 0 0 1 0 ...
##  $ sent_email  : num [1:50] 1 0 0 0 0 0 0 1 1 0 ...
##  $ time        : POSIXct[1:50], format: "2012-01-04 08:19:16" "2012-02-16 15:10:06" ...
##  $ image       : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
##  $ attach      : num [1:50] 0 0 2 0 0 0 0 0 0 0 ...
##  $ dollar      : num [1:50] 0 0 0 0 9 0 0 0 0 23 ...
##  $ winner      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ inherit     : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
##  $ viagra      : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
##  $ password    : num [1:50] 0 0 0 0 1 0 0 0 0 0 ...
##  $ num_char    : num [1:50] 21.705 7.011 0.631 2.454 41.623 ...
##  $ line_breaks : int [1:50] 551 183 28 61 1088 5 17 88 242 578 ...
##  $ format      : num [1:50] 1 1 0 0 1 0 0 1 1 1 ...
##  $ re_subj     : num [1:50] 1 0 0 0 0 0 0 1 1 0 ...
##  $ exclaim_subj: num [1:50] 0 0 0 0 0 0 0 0 1 0 ...
##  $ urgent_subj : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
##  $ exclaim_mess: num [1:50] 8 1 2 1 43 0 0 2 22 3 ...
##  $ number      : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...

Types of variables

Variable type helps us to determine:

  1. What type of summary statistics are needed to describe the data
  2. What type of visualizations best represent the data
  3. What statistical methods can be applied to the data

Numeric (qualitative):

  • Continuous: inifite number of values within a given range
  • Discrete: specific set of numeric values that can be counted or enumerated, often counted

Categorical (qualitative):

  • Ordinal: finite number of values within a given range, often measured

Categorical Data

  • Is often stored as factors in R
    • factors have an important use in statistical modeling
    • Sometimes factors are undesirable, but sometimes essetial
  • Common in subgroup analysis
    • Only interested in a subset of the data
    • Filter for specific levels of categorical variable
#frquency tables
table( hsb2$schtyp )
## 
##  public private 
##     168      32
#use dplyr filter to subset the 'public' data
hsb2_public <- hsb2 %>% filter( schtyp == 'public' )
head( hsb2_public )
## # A tibble: 6 x 11
##      id gender race  ses    schtyp prog        read write  math science socst
##   <int> <chr>  <chr> <fct>  <fct>  <fct>      <int> <int> <int>   <int> <int>
## 1    70 male   white low    public general       57    52    41      47    57
## 2   121 female white middle public vocational    68    59    53      63    61
## 3    86 male   white high   public general       44    33    54      58    31
## 4   141 male   white high   public vocational    63    44    47      53    56
## 5   172 male   white middle public academic      47    52    57      53    61
## 6   113 male   white middle public academic      44    52    51      63    61

The pipe operator %>%:

  • take the dataframe before the operator and pass to the function following
  • ex: x %>% f( y ) is effectively f( x,y )
  • ex: sum( 3,4 ) is effectively 3 %>% sum( 4 )
table( hsb2_public$schtyp )
## 
##  public private 
##     168       0

The output still shows ‘private’ as an option even though it does not exist in this subset of the data.

#drop unused levels
hsb2_public$schtyp <- droplevels( hsb2_public$schtyp )
table( hsb2_public$schtyp )
## 
## public 
##    168
# Subset of emails with big numbers: email50_big
email50_big <- email50 %>%
  filter( number == 'big' )

# Glimpse the subset
#glimpse( email50_big )
table( email50_big$number )
## 
##  none small   big 
##     0     0     7
email50_big$number = droplevels( email50_big$number )
table( email50_big$number )
## 
## big 
##   7

Descretize a variable

redefining a numeric variable as a categorical variable

#descretize the average reading score feature as either above or below average
(avg_read <- mean( hsb2$read )) #the extra parentheses will tell R to print the result
## [1] 52.23

use dplyr mutate to create a new discrete variable that categorizes a record as being either above or below average reading scores

#create new variable: read_cat
hsb2 <- hsb2 %>%
  mutate( read_cat = ifelse(
    read < avg_read,       # <-- logical condition
    'below average',       # <--what to do if condition is TRUE
    'at or above average'  # <--what to do if condition is FALSE
  ))
head( hsb2 )
## # A tibble: 6 x 12
##      id gender race  ses   schtyp prog   read write  math science socst read_cat
##   <int> <chr>  <chr> <fct> <fct>  <fct> <int> <int> <int>   <int> <int> <chr>   
## 1    70 male   white low   public gene…    57    52    41      47    57 at or a…
## 2   121 female white midd… public voca…    68    59    53      63    61 at or a…
## 3    86 male   white high  public gene…    44    33    54      58    31 below a…
## 4   141 male   white high  public voca…    63    44    47      53    56 at or a…
## 5   172 male   white midd… public acad…    47    52    57      53    61 below a…
## 6   113 male   white midd… public acad…    44    52    51      63    61 below a…
# Calculate median number of characters: med_num_char
med_num_char <- median( email50$num_char )

# Create num_char_cat variable in email50
email50_fortified <- email50 %>%
  mutate(num_char_cat = ifelse(num_char < med_num_char, 'below median', "at or above median" ))
  
# Count emails in each category
email50_fortified %>%
  count(num_char_cat)
## # A tibble: 2 x 2
##   num_char_cat           n
##   <chr>              <int>
## 1 at or above median    25
## 2 below median          25
# Create number_yn column in email50
email50_fortified <- email50 %>%
  mutate(
    number_yn = case_when(
      # if number is "none", make number_yn "no"
      number == "none" ~ "no", 
      # if number is not "none", make number_yn "yes"
      number != "none" ~ "yes"  
    )
  )
  

# Visualize the distribution of number_yn
ggplot(email50_fortified, aes(x = number_yn)) +
  geom_bar()

Visualizing numerical data

ggplot2

  • Modern looking hasstle-free plots (don’t have to explicitly render legends etc.)
  • easy to extend code to multivariate plots
  • Iterative construction (can build complex visualizations by adding relatively simple layers)
#scatterplot of math vs science scores
ggplot( data = hsb2, aes( x = science, y = math, color = prog ) ) + 
  #layers are separated by '+'
  geom_point() #having the next layer on a new line makes code readable

# Scatterplot of exclaim_mess vs. num_char
ggplot(email50, aes(x = num_char, y = exclaim_mess, color = factor(spam))) +
  geom_point()

Study types and cautionary tales

Observational studies and experiments

Types of studies:

  • Observational Study:
    • Collect data in a way that does not directly interfere with how the data arise
    • Only correlations can be inferred
  • Experiments:
    • Randomly assign subjects to various treatments
    • Causation can be inferred

In an experiment, variables that migth also contribute to the outcome (confounding variables) are most likely represented equally in the study groups due to random assignment. Therefore, if there is a significant difference found between the averages of the dependent variable, then we can make causal statements.

An example of an observational study:

data( gapminder )
glimpse( gapminder )
## Rows: 1,704
## Columns: 6
## $ country   <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghani…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia,…
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,…
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 1…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134,…

Random sampling and random assignment

Random….

  • Random Sampling:
    • At selection of subjects from population
    • Helps generalizability of results
  • Random Assignment:
    • At selection of subjects from population
    • helps infer causation from results

Scope of Inference

Random Assignment No Random Assignment
Random Sampling Causal & Generalizable Not Causal but Generalizable Generalizable
No Random Sampling Causal but not Generalizable Not Causal or Generalizable Not Generalizable
Causal Not Causal

Simpson’s paradox: not considering an important variable when studying an effect

load( file="ucb_admit.RData" )
head( ucb_admit )
##      Admit Gender Dept
## 1 Admitted   Male    A
## 2 Admitted   Male    A
## 3 Admitted   Male    A
## 4 Admitted   Male    A
## 5 Admitted   Male    A
## 6 Admitted   Male    A
# Count number of male and female applicants admitted
ucb_admission_counts <- ucb_admit %>%
  count(Gender, Admit)
#calculate the proportion of males admitted overall
ucb_admission_counts %>%
  # Group by gender
  group_by(Gender) %>%
  # Create new variable
  mutate(prop = n / sum( n )) %>%
  # Filter for admitted
  filter(Admit == "Admitted")
## # A tibble: 2 x 4
## # Groups:   Gender [2]
##   Gender Admit        n  prop
##   <fct>  <fct>    <int> <dbl>
## 1 Male   Admitted  1198 0.445
## 2 Female Admitted   557 0.304
#now to find the proportions of males and females admitted by department
ucb_admission_counts <- ucb_admit %>%
  # Counts by department, then gender, then admission status
  count(Dept, Gender, Admit)

ucb_admission_counts  %>%
  # Group by department, then gender
  group_by(Dept, Gender) %>%
  # Create new variable
  mutate(prop = n/sum(n)) %>%
  # Filter for male and admitted
  filter(Gender == 'Male', Admit == "Admitted")
## # A tibble: 6 x 5
## # Groups:   Dept, Gender [6]
##   Dept  Gender Admit        n   prop
##   <chr> <fct>  <fct>    <int>  <dbl>
## 1 A     Male   Admitted   512 0.621 
## 2 B     Male   Admitted   353 0.630 
## 3 C     Male   Admitted   120 0.369 
## 4 D     Male   Admitted   138 0.331 
## 5 E     Male   Admitted    53 0.277 
## 6 F     Male   Admitted    22 0.0590

Sampling strategies and experimental design

Sampling strategies

Why sample? Why not take a census of the entire population of interest?

  • Conducting a census is very resource intensive
  • Nearly imposible to collect data from all individuals, hence there is no guarentee of unbiased results
  • Populations constantly change

Simple Random Samples: select cases randomly from the population such that each case is equally likely to be selected.
Stratefied Samples: first, divide the population into homogenious groups, or stata. Then, randomly sample within strata.
Cluster Samples: divide the population into clusters, only samplea few clusters, but sample these clusters in entirety.
Multistage Samples: add another step after cluster sampling: randomly sample observations from within randomly sampled clusters.

Sampling in R

#load the county dataframe
data( county )
glimpse( county )
## Rows: 3,142
## Columns: 15
## $ name              <chr> "Autauga County", "Baldwin County", "Barbour County…
## $ state             <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabam…
## $ pop2000           <dbl> 43671, 140415, 29038, 20826, 51024, 11714, 21399, 1…
## $ pop2010           <dbl> 54571, 182265, 27457, 22915, 57322, 10914, 20947, 1…
## $ pop2017           <int> 55504, 212628, 25270, 22668, 58013, 10309, 19825, 1…
## $ pop_change        <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28, -2.69, -1.51,…
## $ poverty           <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.…
## $ homeownership     <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 69.0, 70.7, 71.…
## $ multi_unit        <dbl> 7.2, 22.6, 11.1, 6.6, 3.7, 9.9, 13.7, 14.3, 8.7, 4.…
## $ unemployment_rate <dbl> 3.86, 3.99, 5.90, 4.39, 4.02, 4.93, 5.49, 4.93, 4.0…
## $ metro             <fct> yes, yes, no, yes, yes, no, no, yes, no, no, yes, n…
## $ median_edu        <fct> some_college, some_college, hs_diploma, hs_diploma,…
## $ per_capita_income <dbl> 27841.70, 27779.85, 17891.73, 20572.05, 21367.39, 1…
## $ median_hh_income  <int> 55317, 52562, 33368, 43404, 47412, 29655, 36326, 43…
## $ smoking_ban       <fct> none, none, partial, none, none, none, NA, NA, none…
set.seed( 1349 )
#remove DC (because it is not a state ?)
county_noDC <- county %>%
  filter( state != 'District of Columbia') %>%
  droplevels()

#simple random sample
county_srs <- county_noDC %>%
  sample_n( size = 150 )

( length( unique( county_srs$state ) ) ) #depending on seed, might not sample all 50 states
## [1] 43
#and would very very very unlikely sample them evenly

county_srs %>%
  group_by( state ) %>%
  count() %>%
  ggplot( aes( x = state, y = n ) ) +
  geom_bar( stat = 'identity' ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle( 'Simple Sample Distribution' )

#stratefied sample
county_str <- county_noDC %>%
  group_by( state ) %>%
  sample_n( size = 3 ) %>%
  count() %>%
  ggplot( aes( x = state, y = n ) ) +
  geom_bar( stat = 'identity' ) +
  ylim( 0,5 ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle( 'Stratefied Sample Distribution' )
county_str

load( file="us_regions.RData" )
glimpse( us_regions )
## Rows: 51
## Columns: 2
## $ state  <fct> Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island…
## $ region <fct> Northeast, Northeast, Northeast, Northeast, Northeast, Northea…
# Simple random sample: states_srs
states_srs <- us_regions %>%
  sample_n( 8 )

# Count states by region
states_srs %>%
  count(region)
##      region n
## 1   Midwest 3
## 2 Northeast 1
## 3     South 2
## 4      West 2
# Stratified sample
states_str <- us_regions %>%
  group_by(region) %>%
  sample_n(2)

# Count states by region
states_str %>%
  count()
## # A tibble: 4 x 2
## # Groups:   region [4]
##   region        n
##   <fct>     <int>
## 1 Midwest       2
## 2 Northeast     2
## 3 South         2
## 4 West          2

Principles of Experimental Design

  • Control: compare treatment of interest to a control group
  • Randomize: randomly assign subjects to treatment
  • Replicate: collect a sufficiently large sample within a study, or replicate the entire study
  • Block: account for the potential effect of confounding variables
    • Group subjects unto blocks based on variables
    • Randomize within each block to treatment groups

Case study

Beauty in the Classroom: are attractive instructors given higher evaluations

load( file="evals.RData" )
glimpse(evals)
## Rows: 463
## Columns: 21
## $ score         <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, …
## $ rank          <fct> tenure track, tenure track, tenure track, tenure track,…
## $ ethnicity     <fct> minority, minority, minority, minority, not minority, n…
## $ gender        <fct> female, female, female, female, male, male, male, male,…
## $ language      <fct> english, english, english, english, english, english, e…
## $ age           <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40,…
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000, 87.50…
## $ cls_did_eval  <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, 14…
## $ cls_students  <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 20,…
## $ cls_level     <fct> upper, upper, upper, upper, upper, upper, upper, upper,…
## $ cls_profs     <fct> single, single, single, single, multiple, multiple, mul…
## $ cls_credits   <fct> multi credit, multi credit, multi credit, multi credit,…
## $ bty_f1lower   <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7…
## $ bty_f1upper   <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9…
## $ bty_f2upper   <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 9, 9…
## $ bty_m1lower   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7…
## $ bty_m1upper   <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 6…
## $ bty_m2upper   <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6…
## $ bty_avg       <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333,…
## $ pic_outfit    <fct> not formal, not formal, not formal, not formal, not for…
## $ pic_color     <fct> color, color, color, color, color, color, color, color,…
# Recode cls_students as cls_type
evals_fortified <- evals %>%
  mutate(
    cls_type = case_when(
      cls_students <= 18 ~ "small",
      cls_students > 18 & cls_students <= 59 ~ "midsize",
      cls_students >= 60 ~ "large"
    )
  )
glimpse( evals_fortified )
## Rows: 463
## Columns: 22
## $ score         <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, …
## $ rank          <fct> tenure track, tenure track, tenure track, tenure track,…
## $ ethnicity     <fct> minority, minority, minority, minority, not minority, n…
## $ gender        <fct> female, female, female, female, male, male, male, male,…
## $ language      <fct> english, english, english, english, english, english, e…
## $ age           <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40,…
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000, 87.50…
## $ cls_did_eval  <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, 14…
## $ cls_students  <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 20,…
## $ cls_level     <fct> upper, upper, upper, upper, upper, upper, upper, upper,…
## $ cls_profs     <fct> single, single, single, single, multiple, multiple, mul…
## $ cls_credits   <fct> multi credit, multi credit, multi credit, multi credit,…
## $ bty_f1lower   <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7…
## $ bty_f1upper   <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9…
## $ bty_f2upper   <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 9, 9…
## $ bty_m1lower   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7…
## $ bty_m1upper   <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 6…
## $ bty_m2upper   <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6…
## $ bty_avg       <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333,…
## $ pic_outfit    <fct> not formal, not formal, not formal, not formal, not for…
## $ pic_color     <fct> color, color, color, color, color, color, color, color,…
## $ cls_type      <chr> "midsize", "large", "large", "large", "midsize", "midsi…
# Scatterplot of score vs. bty_avg
ggplot(evals_fortified, aes(x=bty_avg, y=score, color=cls_type)) +
  geom_point()