# Load packages
library(openintro) # provides the email50 and county datasets
library(dplyr) # data-manipulation functions such as mutate()
library(ggplot2) # plotting functions such as ggplot()
# Load the email50 dataset into the workspace
data(email50)
# Print its structure: dimensions plus each variable's type and first values
str(email50)
## 'data.frame': 50 obs. of 21 variables:
## $ spam : num 0 0 1 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 0 0 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 4 0 0 0 0 0 1 0 ...
## $ sent_email : num 1 0 0 0 0 0 0 1 1 0 ...
## $ time : POSIXct, format: "2012-01-04 13:19:16" "2012-02-16 20:10:06" ...
## $ image : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attach : num 0 0 2 0 0 0 0 0 0 0 ...
## $ dollar : num 0 0 0 0 9 0 0 0 0 23 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 1 0 0 0 0 0 ...
## $ num_char : num 21.705 7.011 0.631 2.454 41.623 ...
## $ line_breaks : int 551 183 28 61 1088 5 17 88 242 578 ...
## $ format : num 1 1 0 0 1 0 0 1 1 1 ...
## $ re_subj : num 1 0 0 0 0 0 0 1 1 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 1 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 8 1 2 1 43 0 0 2 22 3 ...
## $ number : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...
glimpse() gives a compact overview of a dataset: the number of observations and variables, plus each variable's type and first few values.
# Compact, one-line-per-variable overview of email50
email50 %>%
  glimpse()
## Observations: 50
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ cc <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sent_email <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ time <dttm> 2012-01-04 13:19:16, 2012-02-16 20:10:06, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ attach <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0...
## $ dollar <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, ...
## $ winner <fctr> no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ password <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0...
## $ num_char <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809...
## $ line_breaks <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167...
## $ format <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1...
## $ re_subj <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, ...
## $ number <fctr> small, big, none, small, small, small, small, sm...
filter() creates a subset of a dataset by keeping only the rows that satisfy a specified condition.
# Keep only the emails whose `number` value is "big": email50_big
email50_big <- filter(email50, number == "big")
# Overview of the resulting subset
email50_big %>%
  glimpse()
## Observations: 7
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1
## $ cc <int> 0, 0, 0, 0, 0, 0, 0
## $ sent_email <dbl> 0, 0, 0, 0, 0, 1, 0
## $ time <dttm> 2012-02-16 20:10:06, 2012-02-04 23:26:09, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0
## $ attach <dbl> 0, 0, 0, 0, 0, 0, 0
## $ dollar <dbl> 0, 0, 3, 2, 0, 0, 0
## $ winner <fctr> no, no, yes, no, no, no, no
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0
## $ password <dbl> 0, 2, 0, 0, 0, 0, 8
## $ num_char <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
## $ line_breaks <int> 183, 198, 712, 692, 140, 512, 225
## $ format <dbl> 1, 1, 1, 1, 1, 1, 1
## $ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
## $ number <fctr> big, big, big, big, big, big, big
# Tabulate `number` in the subset; levels with no rows still appear
table(email50_big[["number"]])
##
## none small big
## 0 0 7
# Discard the factor levels of `number` that no longer occur in the subset
email50_big[["number"]] <- droplevels(email50_big[["number"]])
# Tabulate `number` again now that unused levels are gone
table(email50_big[["number"]])
##
## big
## 7
# Median of num_char over all emails: med_num_char
med_num_char <- median(email50[["num_char"]])
# Label each email by whether its num_char is below the median or not
email50 <- mutate(
  email50,
  num_char_cat = ifelse(
    num_char < med_num_char,
    "below median",
    "at or above median"
  )
)
# Tally the emails in each category
table(email50[["num_char_cat"]])
##
## at or above median below median
## 25 25
# Add number_yn to email50: "no" when `number` is "none", otherwise "yes"
email50 <- mutate(
  email50,
  number_yn = ifelse(number == "none", "no", "yes")
)
# Bar chart of how many emails fall in each number_yn category
ggplot(data = email50, mapping = aes(x = number_yn)) +
  geom_bar()
# Attach ggplot2 for the plot below
library(ggplot2)
# exclaim_mess (y) against num_char (x), points colored by spam status
ggplot(
  data = email50,
  mapping = aes(x = num_char, y = exclaim_mess, color = factor(spam))
) +
  geom_point()
# Attach the gapminder package (this loads an installed package; nothing is
# downloaded here)
library(gapminder)
# Load the gapminder dataset into the workspace
data("gapminder")
# Compact overview of the dataset
gapminder %>%
  glimpse()
## Observations: 1,704
## Variables: 6
## $ country <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
# Identify type of study
# NOTE(review): presumably refers to the gapminder data glimpsed above - confirm
type_of_study <- "observational"
Random sampling helps generalizability of results.
Random assignment helps infer causation from results.
# Import the admissions data. stringsAsFactors = TRUE is explicit because
# read.csv() no longer converts strings to factors by default as of R 4.0.0,
# and the Admit/Gender level counts reported by summary() need factors.
ucb_admit <- read.csv(
  "/resources/rstudio/data/ucb_admit.csv",
  stringsAsFactors = TRUE
)
# Dept is kept as plain character rather than a factor
ucb_admit$Dept <- as.character(ucb_admit$Dept)
# Compact overview of the imported data
glimpse(ucb_admit)
## Observations: 4,526
## Variables: 3
## $ Admit <fctr> Admitted, Admitted, Admitted, Admitted, Admitted, Admi...
## $ Gender <fctr> Male, Male, Male, Male, Male, Male, Male, Male, Male, ...
## $ Dept <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", ...
## Observations: 4,526
## Variables: 3
## $ Admit <fctr> Admitted, Admitted, Admitted, Admitted, Admitted, Admi...
## $ Gender <fctr> Male, Male, Male, Male, Male, Male, Male, Male, Male, ...
## $ Dept <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", ...
# Per-column summary of ucb_admit
ucb_admit %>%
  summary()
## Admit Gender Dept
## Admitted:1755 Female:1835 Length:4526
## Rejected:2771 Male :2691 Class :character
## Mode :character
## Admit Gender Dept
## Admitted:1755 Female:1835 Length:4526
## Rejected:2771 Male :2691 Class :character
## Mode :character
# Attach the packages used for counting and reshaping
library(dplyr)
library(tidyr)
# Count applicants for every combination of admission status and gender
ucb_counts <- count(ucb_admit, Admit, Gender)
# Show the counts
ucb_counts
## # A tibble: 4 x 3
## Admit Gender n
## <fctr> <fctr> <int>
## 1 Admitted Female 557
## 2 Admitted Male 1198
## 3 Rejected Female 1278
## 4 Rejected Male 1493
# Reshape to one row per gender, with one column of counts per Admit value
spread(ucb_counts, key = Admit, value = n)
## # A tibble: 2 x 3
## Gender Admitted Rejected
## * <fctr> <int> <int>
## 1 Female 557 1278
## 2 Male 1198 1493
ucb_admit %>%
  # Count applicants by admission status and gender
  count(Admit, Gender) %>%
  # One row per gender, one count column per admission status
  spread(key = Admit, value = n) %>%
  # Share of applicants admitted, as a proportion between 0 and 1
  mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 2 x 4
## Gender Admitted Rejected Perc_Admit
## <fctr> <int> <int> <dbl>
## 1 Female 557 1278 0.3035422
## 2 Male 1198 1493 0.4451877
# Counts by department, admission status and gender, reshaped so that each
# department/gender row has one count column per admission status
admit_by_dept <- ucb_admit %>%
  count(Dept, Admit, Gender) %>%
  spread(key = Admit, value = n)
# Show the table
admit_by_dept
## # A tibble: 12 x 4
## Dept Gender Admitted Rejected
## * <chr> <fctr> <int> <int>
## 1 A Female 89 19
## 2 A Male 512 313
## 3 B Female 17 8
## 4 B Male 353 207
## 5 C Female 202 391
## 6 C Male 120 205
## 7 D Female 131 244
## 8 D Male 138 279
## 9 E Female 94 299
## 10 E Male 53 138
## 11 F Female 24 317
## 12 F Male 22 351
# Proportion of applicants admitted, for each department and gender
mutate(admit_by_dept, Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 12 x 5
## Dept Gender Admitted Rejected Perc_Admit
## <chr> <fctr> <int> <int> <dbl>
## 1 A Female 89 19 0.82407407
## 2 A Male 512 313 0.62060606
## 3 B Female 17 8 0.68000000
## 4 B Male 353 207 0.63035714
## 5 C Female 202 391 0.34064081
## 6 C Male 120 205 0.36923077
## 7 D Female 131 244 0.34933333
## 8 D Male 138 279 0.33093525
## 9 E Female 94 299 0.23918575
## 10 E Male 53 138 0.27748691
## 11 F Female 24 317 0.07038123
## 12 F Male 22 351 0.05898123
# Import us_regions. stringsAsFactors = TRUE is explicit because read.csv()
# no longer converts strings to factors by default as of R 4.0.0; without it
# the text columns load as character rather than factor.
us_regions <- read.csv(
  "/resources/rstudio/data/us_regions.csv",
  stringsAsFactors = TRUE
)
# Simple random sample of 8 rows: states_srs
states_srs <- us_regions %>%
  sample_n(size = 8)
# Count sampled states by region; the split varies from run to run because
# the sample is random
states_srs %>%
  group_by(region) %>%
  count()
## # A tibble: 4 x 2
## # Groups: region [4]
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 1
## 3 South 3
## 4 West 2
In this example, the goal of stratified sampling is to select an equal number of states from each region.
# Stratified sample: draw 2 rows within each region
states_str <- sample_n(group_by(us_regions, region), size = 2)
# Count the sampled states in each region
count(group_by(states_str, region))
## # A tibble: 4 x 2
## # Groups: region [4]
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
Experimental variables are conditions you can impose on the experimental units, while blocking variables are characteristics that the experimental units come with that you would like to control for.
In random sampling, you use stratification to control for a variable. In random assignment, you use blocking to achieve the same goal.
# Import the evaluations data. stringsAsFactors = TRUE is explicit because
# read.csv() no longer converts strings to factors by default as of R 4.0.0;
# without it the text columns load as character rather than factor.
evals <- read.csv(
  "/resources/rstudio/data/evals.csv",
  stringsAsFactors = TRUE
)
# Compact overview of the imported data
glimpse(evals)
## Observations: 463
## Variables: 21
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity <fctr> minority, minority, minority, minority, not min...
## $ gender <fctr> female, female, female, female, male, male, mal...
## $ language <fctr> english, english, english, english, english, en...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs <fctr> single, single, single, single, multiple, multi...
## $ cls_credits <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color <fctr> color, color, color, color, color, color, color...
# Look at the type of every variable in evals
evals %>%
  glimpse()
## Observations: 463
## Variables: 21
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity <fctr> minority, minority, minority, minority, not min...
## $ gender <fctr> female, female, female, female, male, male, mal...
## $ language <fctr> english, english, english, english, english, en...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs <fctr> single, single, single, single, multiple, multi...
## $ cls_credits <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color <fctr> color, color, color, color, color, color, color...
# Names of the factor variables in evals (non-factor variables are left out)
cat_vars <- c(
  "rank",
  "ethnicity",
  "gender",
  "language",
  "cls_level",
  "cls_profs",
  "cls_credits",
  "pic_outfit",
  "pic_color"
)
# Add cls_type to evals, derived from cls_students:
#   60 or more -> "large", 18 or fewer -> "small", anything else -> "midsize"
evals <- mutate(
  evals,
  cls_type = ifelse(
    cls_students >= 60,
    "large",
    ifelse(cls_students <= 18, "small", "midsize")
  )
)
# score (y) against bty_avg (x)
ggplot(data = evals, mapping = aes(x = bty_avg, y = score)) +
  geom_point()
# Same scatterplot, with points colored by cls_type
ggplot(
  data = evals,
  mapping = aes(x = bty_avg, y = score, color = cls_type)
) +
  geom_point()
# Attach the packages that provide the county data and sample_n()
library(openintro)
library(dplyr)
# Simple random sample of 150 rows from county.
# The object is called `states`, but each row is a county.
states <- sample_n(county, size = 150)
# Compact overview of the sample
states %>%
  glimpse()
## Observations: 150
## Variables: 10
## $ name <fctr> Buffalo County, Sauk County, Wake County, Johns...
## $ state <fctr> Nebraska, Wisconsin, North Carolina, Texas, Ohi...
## $ pop2000 <dbl> 42259, 55225, 627846, 126811, 32641, 9017, 7976,...
## $ pop2010 <dbl> 46102, 61976, 900993, 150934, 33225, 10052, 7861...
## $ fed_spend <dbl> 5.799575, 7.008665, 9.041650, 5.009481, 9.313589...
## $ poverty <dbl> 14.0, 9.4, 9.7, 10.5, 23.3, 16.1, 18.7, 8.8, 9.2...
## $ homeownership <dbl> 65.5, 72.8, 66.2, 76.6, 68.6, 77.8, 77.9, 75.4, ...
## $ multiunit <dbl> 20.4, 20.0, 26.7, 8.8, 11.5, 5.5, 4.9, 17.2, 22....
## $ income <dbl> 22616, 25452, 32592, 23669, 18775, 19691, 18367,...
## $ med_income <dbl> 47120, 50390, 63770, 54954, 34044, 40143, 32106,...
# Attach the plotting package and the package providing county
library(ggplot2)
library(openintro)
# fed_spend (y) against income (x), one point per row of county
ggplot(data = county, mapping = aes(x = income, y = fed_spend)) +
  geom_point()