Notes from videos and exercises. This is a casual follow along to help me remember all the things and to serve as future reference.
Language of Data
library( openintro )
library( dplyr )
library( ggplot2 )
library( gapminder )
data( hsb2 )
head( hsb2 )
## # A tibble: 6 x 11
## id gender race ses schtyp prog read write math science socst
## <int> <chr> <chr> <fct> <fct> <fct> <int> <int> <int> <int> <int>
## 1 70 male white low public general 57 52 41 47 57
## 2 121 female white middle public vocational 68 59 53 63 61
## 3 86 male white high public general 44 33 54 58 31
## 4 141 male white high public vocational 63 44 47 53 56
## 5 172 male white middle public academic 47 52 57 53 61
## 6 113 male white middle public academic 44 52 51 63 61
str( hsb2 )
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
## $ id : int [1:200] 70 121 86 141 172 113 50 11 84 48 ...
## $ gender : chr [1:200] "male" "female" "male" "male" ...
## $ race : chr [1:200] "white" "white" "white" "white" ...
## $ ses : Factor w/ 3 levels "low","middle",..: 1 2 3 3 2 2 2 2 2 2 ...
## $ schtyp : Factor w/ 2 levels "public","private": 1 1 1 1 1 1 1 1 1 1 ...
## $ prog : Factor w/ 3 levels "general","academic",..: 1 3 1 3 2 2 1 2 1 2 ...
## $ read : int [1:200] 57 68 44 63 47 44 50 34 63 57 ...
## $ write : int [1:200] 52 59 33 44 52 52 59 46 57 55 ...
## $ math : int [1:200] 41 53 54 47 57 51 42 45 54 52 ...
## $ science: int [1:200] 47 63 58 53 53 63 53 39 58 50 ...
## $ socst : int [1:200] 57 61 31 56 61 61 61 36 51 51 ...
glimpse( hsb2 )
## Rows: 200
## Columns: 11
## $ id <int> 70, 121, 86, 141, 172, 113, 50, 11, 84, 48, 75, 60, 95, 104, …
## $ gender <chr> "male", "female", "male", "male", "male", "male", "male", "ma…
## $ race <chr> "white", "white", "white", "white", "white", "white", "africa…
## $ ses <fct> low, middle, high, high, middle, middle, middle, middle, midd…
## $ schtyp <fct> public, public, public, public, public, public, public, publi…
## $ prog <fct> general, vocational, general, vocational, academic, academic,…
## $ read <int> 57, 68, 44, 63, 47, 44, 50, 34, 63, 57, 60, 57, 73, 54, 45, 4…
## $ write <int> 52, 59, 33, 44, 52, 52, 59, 46, 57, 55, 46, 65, 60, 63, 57, 4…
## $ math <int> 41, 53, 54, 47, 57, 51, 42, 45, 54, 52, 51, 51, 71, 57, 50, 4…
## $ science <int> 47, 63, 58, 53, 53, 63, 53, 39, 58, 50, 53, 63, 61, 55, 31, 5…
## $ socst <int> 57, 61, 31, 56, 61, 61, 61, 36, 51, 51, 61, 61, 71, 46, 56, 5…
data( email50 )
str( email50 )
## tibble [50 × 21] (S3: tbl_df/tbl/data.frame)
## $ spam : num [1:50] 0 0 1 0 0 0 0 0 0 0 ...
## $ to_multiple : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
## $ from : num [1:50] 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int [1:50] 0 0 4 0 0 0 0 0 1 0 ...
## $ sent_email : num [1:50] 1 0 0 0 0 0 0 1 1 0 ...
## $ time : POSIXct[1:50], format: "2012-01-04 08:19:16" "2012-02-16 15:10:06" ...
## $ image : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
## $ attach : num [1:50] 0 0 2 0 0 0 0 0 0 0 ...
## $ dollar : num [1:50] 0 0 0 0 9 0 0 0 0 23 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
## $ viagra : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num [1:50] 0 0 0 0 1 0 0 0 0 0 ...
## $ num_char : num [1:50] 21.705 7.011 0.631 2.454 41.623 ...
## $ line_breaks : int [1:50] 551 183 28 61 1088 5 17 88 242 578 ...
## $ format : num [1:50] 1 1 0 0 1 0 0 1 1 1 ...
## $ re_subj : num [1:50] 1 0 0 0 0 0 0 1 1 0 ...
## $ exclaim_subj: num [1:50] 0 0 0 0 0 0 0 0 1 0 ...
## $ urgent_subj : num [1:50] 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num [1:50] 8 1 2 1 43 0 0 2 22 3 ...
## $ number : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...
Types of variables
Variable type helps us to determine:
- What type of summary statistics are needed to describe the data
- What type of visualizations best represent the data
- What statistical methods can be applied to the data
Numeric (qualitative):
- Continuous: inifite number of values within a given range
- Discrete: specific set of numeric values that can be counted or enumerated, often counted
Categorical (qualitative):
- Ordinal: finite number of values within a given range, often measured
Categorical Data
- Is often stored as factors in R
- factors have an important use in statistical modeling
- Sometimes factors are undesirable, but sometimes essetial
- Common in subgroup analysis
- Only interested in a subset of the data
- Filter for specific levels of categorical variable
#frquency tables
table( hsb2$schtyp )
##
## public private
## 168 32
#use dplyr filter to subset the 'public' data
hsb2_public <- hsb2 %>% filter( schtyp == 'public' )
head( hsb2_public )
## # A tibble: 6 x 11
## id gender race ses schtyp prog read write math science socst
## <int> <chr> <chr> <fct> <fct> <fct> <int> <int> <int> <int> <int>
## 1 70 male white low public general 57 52 41 47 57
## 2 121 female white middle public vocational 68 59 53 63 61
## 3 86 male white high public general 44 33 54 58 31
## 4 141 male white high public vocational 63 44 47 53 56
## 5 172 male white middle public academic 47 52 57 53 61
## 6 113 male white middle public academic 44 52 51 63 61
The pipe operator %>%:
- take the dataframe before the operator and pass to the function following
- ex:
x %>% f( y )is effectivelyf( x,y ) - ex:
sum( 3,4 )is effectively3 %>% sum( 4 )
table( hsb2_public$schtyp )
##
## public private
## 168 0
The output still shows ‘private’ as an option even though it does not exist in this subset of the data.
#drop unused levels
hsb2_public$schtyp <- droplevels( hsb2_public$schtyp )
table( hsb2_public$schtyp )
##
## public
## 168
# Subset of emails with big numbers: email50_big
email50_big <- email50 %>%
filter( number == 'big' )
# Glimpse the subset
#glimpse( email50_big )
table( email50_big$number )
##
## none small big
## 0 0 7
email50_big$number = droplevels( email50_big$number )
table( email50_big$number )
##
## big
## 7
Descretize a variable
redefining a numeric variable as a categorical variable
#descretize the average reading score feature as either above or below average
(avg_read <- mean( hsb2$read )) #the extra parentheses will tell R to print the result
## [1] 52.23
use dplyr mutate to create a new discrete variable that categorizes a record as being either above or below average reading scores
#create new variable: read_cat
hsb2 <- hsb2 %>%
mutate( read_cat = ifelse(
read < avg_read, # <-- logical condition
'below average', # <--what to do if condition is TRUE
'at or above average' # <--what to do if condition is FALSE
))
head( hsb2 )
## # A tibble: 6 x 12
## id gender race ses schtyp prog read write math science socst read_cat
## <int> <chr> <chr> <fct> <fct> <fct> <int> <int> <int> <int> <int> <chr>
## 1 70 male white low public gene… 57 52 41 47 57 at or a…
## 2 121 female white midd… public voca… 68 59 53 63 61 at or a…
## 3 86 male white high public gene… 44 33 54 58 31 below a…
## 4 141 male white high public voca… 63 44 47 53 56 at or a…
## 5 172 male white midd… public acad… 47 52 57 53 61 below a…
## 6 113 male white midd… public acad… 44 52 51 63 61 below a…
# Calculate median number of characters: med_num_char
med_num_char <- median( email50$num_char )
# Create num_char_cat variable in email50
email50_fortified <- email50 %>%
mutate(num_char_cat = ifelse(num_char < med_num_char, 'below median', "at or above median" ))
# Count emails in each category
email50_fortified %>%
count(num_char_cat)
## # A tibble: 2 x 2
## num_char_cat n
## <chr> <int>
## 1 at or above median 25
## 2 below median 25
# Create number_yn column in email50
email50_fortified <- email50 %>%
mutate(
number_yn = case_when(
# if number is "none", make number_yn "no"
number == "none" ~ "no",
# if number is not "none", make number_yn "yes"
number != "none" ~ "yes"
)
)
# Visualize the distribution of number_yn
ggplot(email50_fortified, aes(x = number_yn)) +
geom_bar()
Visualizing numerical data
ggplot2
- Modern looking hasstle-free plots (don’t have to explicitly render legends etc.)
- easy to extend code to multivariate plots
- Iterative construction (can build complex visualizations by adding relatively simple layers)
#scatterplot of math vs science scores
ggplot( data = hsb2, aes( x = science, y = math, color = prog ) ) +
#layers are separated by '+'
geom_point() #having the next layer on a new line makes code readable
# Scatterplot of exclaim_mess vs. num_char
ggplot(email50, aes(x = num_char, y = exclaim_mess, color = factor(spam))) +
geom_point()
Study types and cautionary tales
Observational studies and experiments
Types of studies:
- Observational Study:
- Collect data in a way that does not directly interfere with how the data arise
- Only correlations can be inferred
- Experiments:
- Randomly assign subjects to various treatments
- Causation can be inferred
In an experiment, variables that migth also contribute to the outcome (confounding variables) are most likely represented equally in the study groups due to random assignment. Therefore, if there is a significant difference found between the averages of the dependent variable, then we can make causal statements.
An example of an observational study:
data( gapminder )
glimpse( gapminder )
## Rows: 1,704
## Columns: 6
## $ country <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afghani…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia,…
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,…
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 1…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134,…
Random sampling and random assignment
Random….
- Random Sampling:
- At selection of subjects from population
- Helps generalizability of results
- Random Assignment:
- At selection of subjects from population
- helps infer causation from results
Scope of Inference
| Random Assignment | No Random Assignment | ||
|---|---|---|---|
| Random Sampling | Causal & Generalizable | Not Causal but Generalizable | Generalizable |
| No Random Sampling | Causal but not Generalizable | Not Causal or Generalizable | Not Generalizable |
| Causal | Not Causal |
Simpson’s paradox: not considering an important variable when studying an effect
load( file="ucb_admit.RData" )
head( ucb_admit )
## Admit Gender Dept
## 1 Admitted Male A
## 2 Admitted Male A
## 3 Admitted Male A
## 4 Admitted Male A
## 5 Admitted Male A
## 6 Admitted Male A
# Count number of male and female applicants admitted
ucb_admission_counts <- ucb_admit %>%
count(Gender, Admit)
#calculate the proportion of males admitted overall
ucb_admission_counts %>%
# Group by gender
group_by(Gender) %>%
# Create new variable
mutate(prop = n / sum( n )) %>%
# Filter for admitted
filter(Admit == "Admitted")
## # A tibble: 2 x 4
## # Groups: Gender [2]
## Gender Admit n prop
## <fct> <fct> <int> <dbl>
## 1 Male Admitted 1198 0.445
## 2 Female Admitted 557 0.304
#now to find the proportions of males and females admitted by department
ucb_admission_counts <- ucb_admit %>%
# Counts by department, then gender, then admission status
count(Dept, Gender, Admit)
ucb_admission_counts %>%
# Group by department, then gender
group_by(Dept, Gender) %>%
# Create new variable
mutate(prop = n/sum(n)) %>%
# Filter for male and admitted
filter(Gender == 'Male', Admit == "Admitted")
## # A tibble: 6 x 5
## # Groups: Dept, Gender [6]
## Dept Gender Admit n prop
## <chr> <fct> <fct> <int> <dbl>
## 1 A Male Admitted 512 0.621
## 2 B Male Admitted 353 0.630
## 3 C Male Admitted 120 0.369
## 4 D Male Admitted 138 0.331
## 5 E Male Admitted 53 0.277
## 6 F Male Admitted 22 0.0590
Sampling strategies and experimental design
Sampling strategies
Why sample? Why not take a census of the entire population of interest?
- Conducting a census is very resource intensive
- Nearly imposible to collect data from all individuals, hence there is no guarentee of unbiased results
- Populations constantly change
Simple Random Samples: select cases randomly from the population such that each case is equally likely to be selected.
Stratefied Samples: first, divide the population into homogenious groups, or stata. Then, randomly sample within strata.
Cluster Samples: divide the population into clusters, only samplea few clusters, but sample these clusters in entirety.
Multistage Samples: add another step after cluster sampling: randomly sample observations from within randomly sampled clusters.
Sampling in R
#load the county dataframe
data( county )
glimpse( county )
## Rows: 3,142
## Columns: 15
## $ name <chr> "Autauga County", "Baldwin County", "Barbour County…
## $ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabam…
## $ pop2000 <dbl> 43671, 140415, 29038, 20826, 51024, 11714, 21399, 1…
## $ pop2010 <dbl> 54571, 182265, 27457, 22915, 57322, 10914, 20947, 1…
## $ pop2017 <int> 55504, 212628, 25270, 22668, 58013, 10309, 19825, 1…
## $ pop_change <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28, -2.69, -1.51,…
## $ poverty <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.…
## $ homeownership <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 69.0, 70.7, 71.…
## $ multi_unit <dbl> 7.2, 22.6, 11.1, 6.6, 3.7, 9.9, 13.7, 14.3, 8.7, 4.…
## $ unemployment_rate <dbl> 3.86, 3.99, 5.90, 4.39, 4.02, 4.93, 5.49, 4.93, 4.0…
## $ metro <fct> yes, yes, no, yes, yes, no, no, yes, no, no, yes, n…
## $ median_edu <fct> some_college, some_college, hs_diploma, hs_diploma,…
## $ per_capita_income <dbl> 27841.70, 27779.85, 17891.73, 20572.05, 21367.39, 1…
## $ median_hh_income <int> 55317, 52562, 33368, 43404, 47412, 29655, 36326, 43…
## $ smoking_ban <fct> none, none, partial, none, none, none, NA, NA, none…
set.seed( 1349 )
#remove DC (because it is not a state ?)
county_noDC <- county %>%
filter( state != 'District of Columbia') %>%
droplevels()
#simple random sample
county_srs <- county_noDC %>%
sample_n( size = 150 )
( length( unique( county_srs$state ) ) ) #depending on seed, might not sample all 50 states
## [1] 43
#and would very very very unlikely sample them evenly
county_srs %>%
group_by( state ) %>%
count() %>%
ggplot( aes( x = state, y = n ) ) +
geom_bar( stat = 'identity' ) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle( 'Simple Sample Distribution' )
#stratefied sample
county_str <- county_noDC %>%
group_by( state ) %>%
sample_n( size = 3 ) %>%
count() %>%
ggplot( aes( x = state, y = n ) ) +
geom_bar( stat = 'identity' ) +
ylim( 0,5 ) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle( 'Stratefied Sample Distribution' )
county_str
load( file="us_regions.RData" )
glimpse( us_regions )
## Rows: 51
## Columns: 2
## $ state <fct> Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island…
## $ region <fct> Northeast, Northeast, Northeast, Northeast, Northeast, Northea…
# Simple random sample: states_srs
states_srs <- us_regions %>%
sample_n( 8 )
# Count states by region
states_srs %>%
count(region)
## region n
## 1 Midwest 3
## 2 Northeast 1
## 3 South 2
## 4 West 2
# Stratified sample
states_str <- us_regions %>%
group_by(region) %>%
sample_n(2)
# Count states by region
states_str %>%
count()
## # A tibble: 4 x 2
## # Groups: region [4]
## region n
## <fct> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
Principles of Experimental Design
- Control: compare treatment of interest to a control group
- Randomize: randomly assign subjects to treatment
- Replicate: collect a sufficiently large sample within a study, or replicate the entire study
- Block: account for the potential effect of confounding variables
- Group subjects unto blocks based on variables
- Randomize within each block to treatment groups
Case study
Beauty in the Classroom: are attractive instructors given higher evaluations
load( file="evals.RData" )
glimpse(evals)
## Rows: 463
## Columns: 21
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, …
## $ rank <fct> tenure track, tenure track, tenure track, tenure track,…
## $ ethnicity <fct> minority, minority, minority, minority, not minority, n…
## $ gender <fct> female, female, female, female, male, male, male, male,…
## $ language <fct> english, english, english, english, english, english, e…
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40,…
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000, 87.50…
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, 14…
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 20,…
## $ cls_level <fct> upper, upper, upper, upper, upper, upper, upper, upper,…
## $ cls_profs <fct> single, single, single, single, multiple, multiple, mul…
## $ cls_credits <fct> multi credit, multi credit, multi credit, multi credit,…
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7…
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9…
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 9, 9…
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7…
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 6…
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6…
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333,…
## $ pic_outfit <fct> not formal, not formal, not formal, not formal, not for…
## $ pic_color <fct> color, color, color, color, color, color, color, color,…
# Recode cls_students as cls_type
evals_fortified <- evals %>%
mutate(
cls_type = case_when(
cls_students <= 18 ~ "small",
cls_students > 18 & cls_students <= 59 ~ "midsize",
cls_students >= 60 ~ "large"
)
)
glimpse( evals_fortified )
## Rows: 463
## Columns: 22
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, …
## $ rank <fct> tenure track, tenure track, tenure track, tenure track,…
## $ ethnicity <fct> minority, minority, minority, minority, not minority, n…
## $ gender <fct> female, female, female, female, male, male, male, male,…
## $ language <fct> english, english, english, english, english, english, e…
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40,…
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000, 87.50…
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, 14…
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 20,…
## $ cls_level <fct> upper, upper, upper, upper, upper, upper, upper, upper,…
## $ cls_profs <fct> single, single, single, single, multiple, multiple, mul…
## $ cls_credits <fct> multi credit, multi credit, multi credit, multi credit,…
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7…
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9…
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 9, 9…
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7…
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 6…
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6…
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333,…
## $ pic_outfit <fct> not formal, not formal, not formal, not formal, not for…
## $ pic_color <fct> color, color, color, color, color, color, color, color,…
## $ cls_type <chr> "midsize", "large", "large", "large", "midsize", "midsi…
# Scatterplot of score vs. bty_avg
ggplot(evals_fortified, aes(x=bty_avg, y=score, color=cls_type)) +
geom_point()