Disclaimer: The content of this R Markdown note comes from the DataCamp course "Introduction to Data".
library(openintro)
# Bring the email50 data set into the workspace
data("email50")
# Compact overview of every column and its type
str(object = email50)
## 'data.frame': 50 obs. of 21 variables:
## $ spam : num 0 0 1 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 0 0 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 4 0 0 0 0 0 1 0 ...
## $ sent_email : num 1 0 0 0 0 0 0 1 1 0 ...
## $ time : POSIXct, format: "2012-01-04 13:19:16" "2012-02-16 20:10:06" ...
## $ image : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attach : num 0 0 2 0 0 0 0 0 0 0 ...
## $ dollar : num 0 0 0 0 9 0 0 0 0 23 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 1 0 0 0 0 0 ...
## $ num_char : num 21.705 7.011 0.631 2.454 41.623 ...
## $ line_breaks : int 551 183 28 61 1088 5 17 88 242 578 ...
## $ format : num 1 1 0 0 1 0 0 1 1 1 ...
## $ re_subj : num 1 0 0 0 0 0 0 1 1 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 1 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 8 1 2 1 43 0 0 2 22 3 ...
## $ number : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...
library(dplyr)
# Transposed preview of email50: one line per column
email50 %>%
  glimpse()
## Observations: 50
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ cc <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sent_email <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ time <dttm> 2012-01-04 13:19:16, 2012-02-16 20:10:06, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ attach <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0...
## $ dollar <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, ...
## $ winner <fctr> no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ password <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0...
## $ num_char <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809...
## $ line_breaks <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167...
## $ format <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1...
## $ re_subj <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, ...
## $ number <fctr> small, big, none, small, small, small, small, sm...
# Keep only the emails whose `number` value is "big": email50_big
email50_big <- filter(email50, number == "big")
# Preview the filtered rows
email50_big %>%
  glimpse()
## Observations: 7
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1
## $ cc <int> 0, 0, 0, 0, 0, 0, 0
## $ sent_email <dbl> 0, 0, 0, 0, 0, 1, 0
## $ time <dttm> 2012-02-16 20:10:06, 2012-02-04 23:26:09, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0
## $ attach <dbl> 0, 0, 0, 0, 0, 0, 0
## $ dollar <dbl> 0, 0, 3, 2, 0, 0, 0
## $ winner <fctr> no, no, yes, no, no, no, no
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0
## $ password <dbl> 0, 2, 0, 0, 0, 0, 8
## $ num_char <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
## $ line_breaks <int> 183, 198, 712, 692, 140, 512, 225
## $ format <dbl> 1, 1, 1, 1, 1, 1, 1
## $ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
## $ number <fctr> big, big, big, big, big, big, big
# Frequency of each level of `number` in the subset (unused levels included)
table(email50_big[["number"]])
##
## none small big
## 0 0 7
# Discard the factor levels that no longer occur after filtering
email50_big[["number"]] <- droplevels(email50_big[["number"]])
# Re-tabulate: only the levels still present are listed
table(email50_big[["number"]])
##
## big
## 7
# Median of num_char across all emails: med_num_char
med_num_char <- median(email50[["num_char"]])
# Label each email by which side of the median its num_char falls on
email50 <- mutate(
  email50,
  num_char_cat = ifelse(num_char >= med_num_char,
                        "at or above median", "below median")
)
# Tally the two categories
table(email50[["num_char_cat"]])
##
## at or above median below median
## 25 25
library(ggplot2)
# Collapse `number` to a yes/no flag: "no" for "none", "yes" otherwise
email50 <- mutate(email50,
                  number_yn = ifelse(number != "none", "yes", "no"))
# Bar chart of how many emails fall in each number_yn group
ggplot(data = email50, mapping = aes(x = number_yn)) +
  geom_bar()
# Load ggplot2
library(ggplot2)
# exclaim_mess (y) against num_char (x), one point per email,
# coloured by spam treated as a factor
ggplot(
  data = email50,
  mapping = aes(x = num_char, y = exclaim_mess, color = factor(spam))
) +
  geom_point()
library(gapminder)
# Bring the gapminder data set into the workspace
data("gapminder")
# Preview its columns and types
gapminder %>%
  glimpse()
## Observations: 1,704
## Variables: 6
## $ country <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
# Identify type of study: the gapminder data loaded above is classified here
# as observational
type_of_study <- "observational"
# Read the admissions data (ucb_admit.csv). stringsAsFactors is set explicitly
# because the read.csv() default changed to FALSE in R 4.0.0; Admit and Gender
# must be factors on every R version for the summaries below.
ucb_admit <- read.csv("/resources/rstudio/ucb_admit.csv",
                      stringsAsFactors = TRUE)
# Dept is kept as plain character rather than a factor
ucb_admit$Dept <- as.character(ucb_admit$Dept)
# Preview the columns and their types
glimpse(ucb_admit)
## Observations: 4,526
## Variables: 3
## $ Admit <fctr> Admitted, Admitted, Admitted, Admitted, Admitted, Admi...
## $ Gender <fctr> Male, Male, Male, Male, Male, Male, Male, Male, Male, ...
## $ Dept <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", ...
# Per-column summary: level counts for factors, length/class for character
summary(object = ucb_admit)
## Admit Gender Dept
## Admitted:1755 Female:1835 Length:4526
## Rejected:2771 Male :2691 Class :character
## Mode :character
# Load packages
library(dplyr)
library(tidyr)
# Tally applicants for every admission status / gender combination
ucb_counts <- count(ucb_admit, Admit, Gender)
# Show the tally
ucb_counts
## # A tibble: 4 x 3
## Admit Gender n
## <fctr> <fctr> <int>
## 1 Admitted Female 557
## 2 Admitted Male 1198
## 3 Rejected Female 1278
## 4 Rejected Male 1493
# Reshape to one row per gender, with Admit levels as columns holding n
spread(ucb_counts, key = Admit, value = n)
## # A tibble: 2 x 3
## Gender Admitted Rejected
## * <fctr> <int> <int>
## 1 Female 557 1278
## 2 Male 1198 1493
# Admission rate by gender, built in a single chain
ucb_admit %>%
  # Tally applicants for every admission status / gender pair
  count(Admit, Gender) %>%
  # One row per gender, counts split into Admitted and Rejected columns
  spread(key = Admit, value = n) %>%
  # Share of applicants admitted (a proportion, not a percentage)
  mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 2 x 4
## Gender Admitted Rejected Perc_Admit
## <fctr> <int> <int> <dbl>
## 1 Female 557 1278 0.3035422
## 2 Male 1198 1493 0.4451877
# Counts per admission status, department and gender, reshaped so that each
# department / gender pair has Admitted and Rejected columns: admit_by_dept
admit_by_dept <- spread(
  count(ucb_admit, Admit, Dept, Gender),
  key = Admit,
  value = n
)
# Show the table
admit_by_dept
## # A tibble: 12 x 4
## Dept Gender Admitted Rejected
## * <chr> <fctr> <int> <int>
## 1 A Female 89 19
## 2 A Male 512 313
## 3 B Female 17 8
## 4 B Male 353 207
## 5 C Female 202 391
## 6 C Male 120 205
## 7 D Female 131 244
## 8 D Male 138 279
## 9 E Female 94 299
## 10 E Male 53 138
## 11 F Female 24 317
## 12 F Male 22 351
# Proportion admitted within each department / gender row
mutate(admit_by_dept, Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 12 x 5
## Dept Gender Admitted Rejected Perc_Admit
## <chr> <fctr> <int> <int> <dbl>
## 1 A Female 89 19 0.82407407
## 2 A Male 512 313 0.62060606
## 3 B Female 17 8 0.68000000
## 4 B Male 353 207 0.63035714
## 5 C Female 202 391 0.34064081
## 6 C Male 120 205 0.36923077
## 7 D Female 131 244 0.34933333
## 8 D Male 138 279 0.33093525
## 9 E Female 94 299 0.23918575
## 10 E Male 53 138 0.27748691
## 11 F Female 24 317 0.07038123
## 12 F Male 22 351 0.05898123
# Read the us_regions data. stringsAsFactors is set explicitly because the
# read.csv() default changed to FALSE in R 4.0.0; this keeps `region` a factor
# on every R version.
us_regions <- read.csv("/resources/rstudio/us_regions.csv",
                       stringsAsFactors = TRUE)
# Draw 8 rows at random from the whole table, ignoring region: states_srs
states_srs <- sample_n(us_regions, size = 8)
# How many of the sampled rows fall in each region
count(group_by(states_srs, region))
## # A tibble: 3 x 2
## # Groups: region [3]
## region n
## <fctr> <int>
## 1 Midwest 3
## 2 Northeast 1
## 3 West 4
# Stratified sample: draw 2 rows at random within every region
states_str <- sample_n(group_by(us_regions, region), size = 2)
# Confirm the number of sampled rows per region
count(group_by(states_str, region))
## # A tibble: 4 x 2
## # Groups: region [4]
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
# Read the evals data. stringsAsFactors is set explicitly because the
# read.csv() default changed to FALSE in R 4.0.0; the text columns (rank,
# gender, ...) must be factors on every R version for the steps below.
evals <- read.csv("/resources/rstudio/evals.csv", stringsAsFactors = TRUE)
# Inspect evals
glimpse(evals)
## Observations: 463
## Variables: 21
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity <fctr> minority, minority, minority, minority, not min...
## $ gender <fctr> female, female, female, female, male, male, mal...
## $ language <fctr> english, english, english, english, english, en...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs <fctr> single, single, single, single, multiple, multi...
## $ cls_credits <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color <fctr> color, color, color, color, color, color, color...
# Check the type reported for each column of evals
evals %>%
  glimpse()
## Observations: 463
## Variables: 21
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity <fctr> minority, minority, minority, minority, not min...
## $ gender <fctr> female, female, female, female, male, male, mal...
## $ language <fctr> english, english, english, english, english, en...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs <fctr> single, single, single, single, multiple, multi...
## $ cls_credits <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color <fctr> color, color, color, color, color, color, color...
# Another option: str() also lists the levels of each factor column
str(object = evals)
## 'data.frame': 463 obs. of 21 variables:
## $ score : num 4.7 4.1 3.9 4.8 4.6 4.3 2.8 4.1 3.4 4.5 ...
## $ rank : Factor w/ 3 levels "teaching","tenure track",..: 2 2 2 2 3 3 3 3 3 3 ...
## $ ethnicity : Factor w/ 2 levels "minority","not minority": 1 1 1 1 2 2 2 2 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 1 2 2 2 2 2 1 ...
## $ language : Factor w/ 2 levels "english","non-english": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 36 36 36 36 59 59 59 51 51 40 ...
## $ cls_perc_eval: num 55.8 68.8 60.8 62.6 85 ...
## $ cls_did_eval : int 24 86 76 77 17 35 39 55 111 40 ...
## $ cls_students : int 43 125 125 123 20 40 44 55 195 46 ...
## $ cls_level : Factor w/ 2 levels "lower","upper": 2 2 2 2 2 2 2 2 2 2 ...
## $ cls_profs : Factor w/ 2 levels "multiple","single": 2 2 2 2 1 1 1 2 2 2 ...
## $ cls_credits : Factor w/ 2 levels "multi credit",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ bty_f1lower : int 5 5 5 5 4 4 4 5 5 2 ...
## $ bty_f1upper : int 7 7 7 7 4 4 4 2 2 5 ...
## $ bty_f2upper : int 6 6 6 6 2 2 2 5 5 4 ...
## $ bty_m1lower : int 2 2 2 2 2 2 2 2 2 3 ...
## $ bty_m1upper : int 4 4 4 4 3 3 3 3 3 3 ...
## $ bty_m2upper : int 6 6 6 6 3 3 3 3 3 2 ...
## $ bty_avg : num 5 5 5 5 3 ...
## $ pic_outfit : Factor w/ 2 levels "formal","not formal": 2 2 2 2 2 2 2 2 2 2 ...
## $ pic_color : Factor w/ 2 levels "black&white",..: 2 2 2 2 2 2 2 2 2 2 ...
# Names of the factor (categorical) columns of evals
cat_vars <- c(
  "rank",
  "ethnicity",
  "gender",
  "language",
  "cls_level",
  "cls_profs",
  "cls_credits",
  "pic_outfit",
  "pic_color"
)
# Recode cls_students as cls_type: evals
evals <- evals %>%
  # Bin class size: up to 18 -> "small", above 18 up to 59 -> "midsize",
  # anything larger -> "large". Each branch only needs an upper bound because
  # smaller sizes were already matched by the branch before it. This also
  # closes the gap the former `>= 19` test left for non-integer values between
  # 18 and 19, which used to fall through to "large". Missing values stay NA.
  mutate(cls_type = ifelse(cls_students <= 18, "small",
                           ifelse(cls_students <= 59, "midsize", "large")))
# score (y) against bty_avg (x), one point per row of evals
ggplot(data = evals, mapping = aes(x = bty_avg, y = score)) +
  geom_point()
# Same scatterplot with the points coloured by cls_type
ggplot(data = evals,
       mapping = aes(x = bty_avg, y = score, color = cls_type)) +
  geom_point()
You’re tasked with examining whether federal spending is positively related to the standard of living. Use the county data set in the openintro package. Examine the relationship between fed_spend and income by following the instructions below.
# Bring the county data set into the workspace
data("county")
# Draw 150 rows at random (despite its name, US_states holds county rows)
US_states <- sample_n(county, size = 150)
# Preview the sampled rows
US_states %>%
  glimpse()
## Observations: 150
## Variables: 10
## $ name <fctr> Lamb County, Marlboro County, Wood County, Mont...
## $ state <fctr> Texas, South Carolina, Texas, Tennessee, Nebras...
## $ pop2000 <dbl> 14709, 28818, 36752, 134768, 4089, 11400, 26873,...
## $ pop2010 <dbl> 13977, 28933, 41964, 172331, 3821, 10511, 26570,...
## $ fed_spend <dbl> 9.864706, 10.183839, 8.384973, 6.016381, 13.2394...
## $ poverty <dbl> 17.9, 27.5, 14.0, 14.6, 10.5, 10.3, 19.2, 16.0, ...
## $ homeownership <dbl> 75.1, 66.2, 81.4, 65.1, 70.8, 80.1, 83.2, 70.8, ...
## $ multiunit <dbl> 6.1, 10.7, 4.5, 20.4, 8.6, 6.8, 4.5, 11.1, 31.2,...
## $ income <dbl> 17553, 13817, 21682, 22092, 22263, 22088, 18402,...
## $ med_income <dbl> 35458, 27688, 41277, 48930, 42010, 42698, 39543,...
This is an observational study. An experiment would require that you have subjects assigned to treatments; this study just looks at existing data.
It involves a random sample.
It is an association because, in an observational study, there could be other relevant factors at play. You can only infer causation from an experimental study.
Yes, if you notice a general trend, it becomes easier to relay this information. The more data collected, the more accurate the trend can be.

### Create a scatter plot of fed_spend on the y axis and income on the x axis. Interpret.
# fed_spend (y) against income (x), one point per sampled county
ggplot(data = US_states, mapping = aes(x = income, y = fed_spend)) +
  geom_point()
A confounding variable is a variable that is not noticed but can ultimately affect the end result. A good example of a confounding variable in this analysis is the employment rate (high vs. low) in these counties.