# source("create_datasets.R")
# load('data/test_datasets.RData')
library(dplyr)
library(tidyr)
library(ggplot2)
library(openintro)
library(gapminder)
## email50 dataset is in the openintro library which has been loaded
# Load data
data(email50)
# View its structure
str(email50)
## 'data.frame': 50 obs. of 21 variables:
## $ spam : num 0 0 1 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 0 0 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 4 0 0 0 0 0 1 0 ...
## $ sent_email : num 1 0 0 0 0 0 0 1 1 0 ...
## $ time : POSIXct, format: "2012-01-04 08:19:16" "2012-02-16 15:10:06" ...
## $ image : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attach : num 0 0 2 0 0 0 0 0 0 0 ...
## $ dollar : num 0 0 0 0 9 0 0 0 0 23 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 1 0 0 0 0 0 ...
## $ num_char : num 21.705 7.011 0.631 2.454 41.623 ...
## $ line_breaks : int 551 183 28 61 1088 5 17 88 242 578 ...
## $ format : num 1 1 0 0 1 0 0 1 1 1 ...
## $ re_subj : num 1 0 0 0 0 0 0 1 1 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 1 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 8 1 2 1 43 0 0 2 22 3 ...
## $ number : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...
glimpse(email50)
## Observations: 50
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ cc <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sent_email <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ time <dttm> 2012-01-04 08:19:16, 2012-02-16 15:10:06, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ attach <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0...
## $ dollar <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, ...
## $ winner <fctr> no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ password <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0...
## $ num_char <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809...
## $ line_breaks <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167...
## $ format <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1...
## $ re_subj <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, ...
## $ number <fctr> small, big, none, small, small, small, small, sm...
# Subset of emails with big numbers: email50_big
email50_big <- email50 %>%
filter(number == "big")
# Glimpse the subset
glimpse(email50_big)
## Observations: 7
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1
## $ cc <int> 0, 0, 0, 0, 0, 0, 0
## $ sent_email <dbl> 0, 0, 0, 0, 0, 1, 0
## $ time <dttm> 2012-02-16 15:10:06, 2012-02-04 18:26:09, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0
## $ attach <dbl> 0, 0, 0, 0, 0, 0, 0
## $ dollar <dbl> 0, 0, 3, 2, 0, 0, 0
## $ winner <fctr> no, no, yes, no, no, no, no
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0
## $ password <dbl> 0, 2, 0, 0, 0, 0, 8
## $ num_char <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
## $ line_breaks <int> 183, 198, 712, 692, 140, 512, 225
## $ format <dbl> 1, 1, 1, 1, 1, 1, 1
## $ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
## $ number <fctr> big, big, big, big, big, big, big
# Table of number variable
table(email50_big$number)
##
## none small big
## 0 0 7
# Drop levels
email50_big$number <- droplevels(email50_big$number)
# Another table of number variable
table(email50_big$number)
##
## big
## 7
ifelse
statement to convert a numerical variable to a categorical variable based on a set criteria# Calculate median number of characters: med_num_char
med_num_char <- median(email50$num_char)
# Create num_char_cat variable in email50
email50 <- email50 %>%
mutate(num_char_cat = ifelse(num_char < med_num_char, "below median", "at or above median"))
# Count emails in each category
table(email50$num_char_cat)
##
## at or above median below median
## 25 25
# Create number_yn column in email50
email50 <- email50 %>%
mutate(number_yn = ifelse(number == "none", "no", "yes"))
# Visualize number_yn
ggplot(email50, aes(x = number_yn)) +
geom_bar()
# Load ggplot2
# Scatterplot of exclaim_mess vs. num_char
ggplot(email50, aes(x = num_char, y = exclaim_mess, color = factor(spam))) +
geom_point()
A study is designed to evaluate whether people read text faster in Arial or Helvetica font. A group of volunteers who agreed to be a part of the study are randomly assigned to two groups: one where they read some text in Arial, and another where they read the same text in Helvetica. At the end, average reading speeds from the two groups are compared.
What type of study is this?
# Load data
data(gapminder)
# Glimpse data
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
One of the early studies linking smoking and lung cancer compared patients who are already hospitalized with lung cancer to similar patients without lung cancer (hospitalized for other reasons), and recorded whether each patient smoked. Then, proportions of smokers for patients with and without lung cancer were compared.
Does this study employ random sampling and/or random assignment?
Volunteers were recruited to participate in a study where they were asked to type 40 bits of trivia—for example, “an ostrich’s eye is bigger than its brain”—into a computer. A randomly selected half of these subjects were told the information would be saved in the computer; the other half were told the items they typed would be erased.
Then, the subjects were asked to remember these bits of trivia, and the number of bits of trivia each subject could correctly recall were recorded. It was found that the subjects were significantly more likely to remember information if they thought they would not be able to find it later.
cannot
be generalized to all peoplecan
be inferred based on these results.# dplyr and tidyr are already loaded
load('data/ucb_admit.Rdata')
str(ucb_admit)
## 'data.frame': 4526 obs. of 3 variables:
## $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : chr "A" "A" "A" "A" ...
# Count number of male and female applicants admitted
ucb_counts <- ucb_admit %>%
count(Gender, Admit)
# View result
ucb_counts
## # A tibble: 4 x 3
## Gender Admit n
## <fctr> <fctr> <int>
## 1 Male Admitted 1198
## 2 Male Rejected 1493
## 3 Female Admitted 557
## 4 Female Rejected 1278
# Spread the output across columns
ucb_counts %>%
spread(Admit, n)
## # A tibble: 2 x 3
## Gender Admitted Rejected
## * <fctr> <int> <int>
## 1 Male 1198 1493
## 2 Female 557 1278
str(ucb_admit)
## 'data.frame': 4526 obs. of 3 variables:
## $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : chr "A" "A" "A" "A" ...
ucb_admit %>%
# Table of counts of admission status and gender
count(Gender, Admit) %>%
# Spread output across columns based on admission status
spread(Admit, n) %>%
# Create new variable
mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 2 x 4
## Gender Admitted Rejected Perc_Admit
## <fctr> <int> <int> <dbl>
## 1 Male 1198 1493 0.4451877
## 2 Female 557 1278 0.3035422
str(ucb_admit)
## 'data.frame': 4526 obs. of 3 variables:
## $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : chr "A" "A" "A" "A" ...
# Table of counts of admission status and gender for each department
admit_by_dept <- ucb_admit %>%
count(Dept, Gender, Admit) %>%
spread(Admit, n)
# View result
admit_by_dept
## # A tibble: 12 x 4
## Dept Gender Admitted Rejected
## * <chr> <fctr> <int> <int>
## 1 A Male 512 313
## 2 A Female 89 19
## 3 B Male 353 207
## 4 B Female 17 8
## 5 C Male 120 205
## 6 C Female 202 391
## 7 D Male 138 279
## 8 D Female 131 244
## 9 E Male 53 138
## 10 E Female 94 299
## 11 F Male 22 351
## 12 F Female 24 317
# Percentage of those admitted to each department
admit_by_dept %>%
mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 12 x 5
## Dept Gender Admitted Rejected Perc_Admit
## <chr> <fctr> <int> <int> <dbl>
## 1 A Male 512 313 0.62060606
## 2 A Female 89 19 0.82407407
## 3 B Male 353 207 0.63035714
## 4 B Female 17 8 0.68000000
## 5 C Male 120 205 0.36923077
## 6 C Female 202 391 0.34064081
## 7 D Male 138 279 0.33093525
## 8 D Female 131 244 0.34933333
## 9 E Male 53 138 0.27748691
## 10 E Female 94 299 0.23918575
## 11 F Male 22 351 0.05898123
## 12 F Female 24 317 0.07038123
Sample strategies:
A consulting company is planning a pilot study on marketing in Boston. They identify the zip codes that make up the greater Boston area, then sample 50 randomly selected addresses from each zip code and mail a coupon to these addresses. They then track whether the coupon was used in the following month.
What sampling strategy has this company used?
sample_n
load('data/us_regions.RData')
str(us_regions)
## 'data.frame': 51 obs. of 2 variables:
## $ state : Factor w/ 51 levels "Alabama","Alaska",..: 7 20 22 30 40 46 31 33 39 14 ...
## $ region: Factor w/ 4 levels "Midwest","Northeast",..: 2 2 2 2 2 2 2 2 2 1 ...
# Simple random sample: states_srs
states_srs <- us_regions %>%
sample_n(8)
# Count states by region
states_srs %>%
count(region)
## # A tibble: 4 x 2
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
# Stratified sample
states_str <- us_regions %>%
group_by(region) %>%
sample_n(2)
# Count states by region
states_str %>%
count(region)
## # A tibble: 4 x 2
## # Groups: region [4]
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
A researcher designs a study to test the effect of light and noise levels on exam performance of students. The researcher also believes that light and noise levels might have different effects on males and females, so she wants to make sure both genders are represented equally under different conditions.
What variables are involved in this study?
Explanatory
variables are conditions you can impose on the experimental units, while blocking
variables are characteristics that the experimental units come with that you would like to control for.
stratifying
to control for a variable.blocking
to achieve the same goal.
# Inspect evals
load('data/evals.RData')
glimpse(evals)
## Observations: 463
## Variables: 21
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity <fctr> minority, minority, minority, minority, not min...
## $ gender <fctr> female, female, female, female, male, male, mal...
## $ language <fctr> english, english, english, english, english, en...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs <fctr> single, single, single, single, multiple, multi...
## $ cls_credits <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color <fctr> color, color, color, color, color, color, color...
# Inspect variable types
glimpse(evals)
## Observations: 463
## Variables: 21
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity <fctr> minority, minority, minority, minority, not min...
## $ gender <fctr> female, female, female, female, male, male, mal...
## $ language <fctr> english, english, english, english, english, en...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs <fctr> single, single, single, single, multiple, multi...
## $ cls_credits <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color <fctr> color, color, color, color, color, color, color...
# Remove non-factor variables from this vector
cat_vars <- c("rank", "ethnicity", "gender", "language",
"cls_level", "cls_profs", "cls_credits",
"pic_outfit", "pic_color")
# Recode cls_students as cls_type: evals
evals <- evals %>%
# Create new variable
mutate(cls_type = ifelse(cls_students <= 18, 'small',
ifelse(cls_students <= 59, 'midsize', 'large')))
table(evals$cls_type)
##
## large midsize small
## 117 233 113
# Scatterplot of score vs. bty_avg
ggplot(evals, aes(bty_avg, score)) +
geom_point()
# Scatterplot of score vs. bty_avg colored by cls_type
ggplot(evals, aes(bty_avg, score, color = cls_type)) +
geom_point()