Disclaimer: The content of this R Markdown note comes from the DataCamp course Introduction to Data.

Introduction to Data

Chapter 1: Language of Data

1.1 Loading Data into R

library(openintro)
# Load data
data(email50)

# View its structure
str(email50)
## 'data.frame':    50 obs. of  21 variables:
##  $ spam        : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ to_multiple : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ from        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ cc          : int  0 0 4 0 0 0 0 0 1 0 ...
##  $ sent_email  : num  1 0 0 0 0 0 0 1 1 0 ...
##  $ time        : POSIXct, format: "2012-01-04 13:19:16" "2012-02-16 20:10:06" ...
##  $ image       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ attach      : num  0 0 2 0 0 0 0 0 0 0 ...
##  $ dollar      : num  0 0 0 0 9 0 0 0 0 23 ...
##  $ winner      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ inherit     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ viagra      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ password    : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ num_char    : num  21.705 7.011 0.631 2.454 41.623 ...
##  $ line_breaks : int  551 183 28 61 1088 5 17 88 242 578 ...
##  $ format      : num  1 1 0 0 1 0 0 1 1 1 ...
##  $ re_subj     : num  1 0 0 0 0 0 0 1 1 0 ...
##  $ exclaim_subj: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ urgent_subj : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ exclaim_mess: num  8 1 2 1 43 0 0 2 22 3 ...
##  $ number      : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...

1.2 Identify variable types

library(dplyr)
# Glimpse email50
glimpse(email50)
## Observations: 50
## Variables: 21
## $ spam         <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ to_multiple  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ from         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ cc           <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sent_email   <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ time         <dttm> 2012-01-04 13:19:16, 2012-02-16 20:10:06, 2012-0...
## $ image        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ attach       <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0...
## $ dollar       <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, ...
## $ winner       <fctr> no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ inherit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ viagra       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ password     <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0...
## $ num_char     <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809...
## $ line_breaks  <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167...
## $ format       <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1...
## $ re_subj      <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ urgent_subj  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, ...
## $ number       <fctr> small, big, none, small, small, small, small, sm...

1.3 Filtering based on a factor

# Keep only the emails whose `number` factor equals "big": email50_big
email50_big <- filter(email50, number == "big")

# Column-by-column preview of the filtered rows
glimpse(email50_big)
## Observations: 7
## Variables: 21
## $ spam         <dbl> 0, 0, 1, 0, 0, 0, 0
## $ to_multiple  <dbl> 0, 0, 0, 0, 0, 0, 0
## $ from         <dbl> 1, 1, 1, 1, 1, 1, 1
## $ cc           <int> 0, 0, 0, 0, 0, 0, 0
## $ sent_email   <dbl> 0, 0, 0, 0, 0, 1, 0
## $ time         <dttm> 2012-02-16 20:10:06, 2012-02-04 23:26:09, 2012-0...
## $ image        <dbl> 0, 0, 0, 0, 0, 0, 0
## $ attach       <dbl> 0, 0, 0, 0, 0, 0, 0
## $ dollar       <dbl> 0, 0, 3, 2, 0, 0, 0
## $ winner       <fctr> no, no, yes, no, no, no, no
## $ inherit      <dbl> 0, 0, 0, 0, 0, 0, 0
## $ viagra       <dbl> 0, 0, 0, 0, 0, 0, 0
## $ password     <dbl> 0, 2, 0, 0, 0, 0, 8
## $ num_char     <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
## $ line_breaks  <int> 183, 198, 712, 692, 140, 512, 225
## $ format       <dbl> 1, 1, 1, 1, 1, 1, 1
## $ re_subj      <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
## $ urgent_subj  <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
## $ number       <fctr> big, big, big, big, big, big, big

1.4 Complete filtering based on a factor

# Table of number variable
table(email50_big$number)
## 
##  none small   big 
##     0     0     7

# Drop levels
# Filtering keeps the factor's full level set, so "none" and "small" still
# show up with a count of 0 in the table above; droplevels() removes the
# levels that no longer occur in the subset.
email50_big$number <- droplevels(email50_big$number)

# Another table of number variable
table(email50_big$number)
## 
## big 
##   7

1.5 Discretize a different variable

# Median of num_char over all emails; used as the split point: med_num_char
med_num_char <- median(email50$num_char)

# Tag every email as below the median length or at/above it
email50 <- mutate(
  email50,
  num_char_cat = ifelse(
    num_char >= med_num_char,
    "at or above median",
    "below median"
  )
)

# Tally how many emails land in each category
table(email50$num_char_cat)
## 
## at or above median       below median 
##                 25                 25

1.6 Combining levels of a different factor

library(ggplot2)
# Collapse `number` into two groups: "no" for level "none", "yes" otherwise
email50 <- mutate(email50, number_yn = ifelse(number == "none", "no", "yes"))

# Bar chart of how many emails fall in each number_yn group
ggplot(data = email50, mapping = aes(x = number_yn)) +
  geom_bar()

1.7 Visualizing numerical and categorical data

# Load ggplot2
library(ggplot2)

# num_char on x, exclaim_mess on y; spam is wrapped in factor() for the color
ggplot(
  data = email50,
  mapping = aes(x = num_char, y = exclaim_mess, color = factor(spam))
) +
  geom_point()

Chapter 2: Study types and cautionary tales

2.1 Identify the type of study

library(gapminder)
# Load data
data(gapminder)

# Glimpse data
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country   <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...

# Identify type of study
type_of_study <- "observational"

2.2 Number of males and females admitted

# Read the admissions data (ucb_admit.csv).
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the glimpse()/summary() output
# below relies on Admit and Gender being factors.
ucb_admit <- read.csv("/resources/rstudio/ucb_admit.csv",
                      stringsAsFactors = TRUE)
# Dept is kept as plain character rather than a factor
ucb_admit$Dept <- as.character(ucb_admit$Dept)
glimpse(ucb_admit)
## Observations: 4,526
## Variables: 3
## $ Admit  <fctr> Admitted, Admitted, Admitted, Admitted, Admitted, Admi...
## $ Gender <fctr> Male, Male, Male, Male, Male, Male, Male, Male, Male, ...
## $ Dept   <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", ...

summary(ucb_admit)
##       Admit         Gender         Dept          
##  Admitted:1755   Female:1835   Length:4526       
##  Rejected:2771   Male  :2691   Class :character  
##                                Mode  :character
# Load packages
library(dplyr)
library(tidyr)

# Tally applicants for every Admit x Gender combination: ucb_counts
ucb_counts <- count(ucb_admit, Admit, Gender)

# Print the tally
ucb_counts
## # A tibble: 4 x 3
##      Admit Gender     n
##     <fctr> <fctr> <int>
## 1 Admitted Female   557
## 2 Admitted   Male  1198
## 3 Rejected Female  1278
## 4 Rejected   Male  1493
  
# Reshape to one row per Gender, with one count column per admission status
spread(ucb_counts, key = Admit, value = n)
## # A tibble: 2 x 3
##   Gender Admitted Rejected
## * <fctr>    <int>    <int>
## 1 Female      557     1278
## 2   Male     1198     1493

2.3 Proportion of males admitted overall

# Admission rate by gender, pooled over all departments.
# Perc_Admit is a proportion between 0 and 1, despite the "Perc" name.
ucb_admit %>%
  count(Admit, Gender) %>%             # counts per admission status and gender
  spread(key = Admit, value = n) %>%   # one column per admission status
  mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 2 x 4
##   Gender Admitted Rejected Perc_Admit
##   <fctr>    <int>    <int>      <dbl>
## 1 Female      557     1278  0.3035422
## 2   Male     1198     1493  0.4451877

2.4 Proportion of males admitted for each department

# Applicant counts per department and gender, with the two admission
# statuses (Admitted / Rejected) spread into their own columns: admit_by_dept
admit_by_dept <- ucb_admit %>%
  count(Admit, Dept, Gender) %>%
  spread(key = Admit, value = n)

# Print the per-department table
admit_by_dept
## # A tibble: 12 x 4
##     Dept Gender Admitted Rejected
##  * <chr> <fctr>    <int>    <int>
##  1     A Female       89       19
##  2     A   Male      512      313
##  3     B Female       17        8
##  4     B   Male      353      207
##  5     C Female      202      391
##  6     C   Male      120      205
##  7     D Female      131      244
##  8     D   Male      138      279
##  9     E Female       94      299
## 10     E   Male       53      138
## 11     F Female       24      317
## 12     F   Male       22      351

# Share of applicants admitted within each department/gender row
# (a proportion between 0 and 1, despite the "Perc" name)
mutate(admit_by_dept, Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 12 x 5
##     Dept Gender Admitted Rejected Perc_Admit
##    <chr> <fctr>    <int>    <int>      <dbl>
##  1     A Female       89       19 0.82407407
##  2     A   Male      512      313 0.62060606
##  3     B Female       17        8 0.68000000
##  4     B   Male      353      207 0.63035714
##  5     C Female      202      391 0.34064081
##  6     C   Male      120      205 0.36923077
##  7     D Female      131      244 0.34933333
##  8     D   Male      138      279 0.33093525
##  9     E Female       94      299 0.23918575
## 10     E   Male       53      138 0.27748691
## 11     F Female       24      317 0.07038123
## 12     F   Male       22      351 0.05898123

Chapter 3: Sampling strategies and experimental design

3.1 Simple random sample in R

# Read the state/region lookup table (us_regions.csv).
# stringsAsFactors = TRUE keeps `region` a factor on R >= 4.0.0, where
# read.csv() stopped converting strings to factors by default.
us_regions <- read.csv("/resources/rstudio/us_regions.csv",
                       stringsAsFactors = TRUE)

# Fix the RNG seed so the random draw below can be reproduced on a re-run.
# The exact per-region counts depend on the seed.
set.seed(2018)

# Simple random sample of 8 rows, ignoring region: states_srs
states_srs <- us_regions %>%
  sample_n(size = 8)

# Count states by region
states_srs %>%
  group_by(region) %>%
  count()
## # A tibble: 3 x 2
## # Groups:   region [3]
##      region     n
##      <fctr> <int>
## 1   Midwest     3
## 2 Northeast     1
## 3      West     4

3.2 Stratified sample in R

# Fix the RNG seed so the random draw below can be reproduced on a re-run
set.seed(2018)

# Stratified sample: 2 rows drawn at random within each region
states_str <- us_regions %>%
  group_by(region) %>%
  sample_n(size = 2)

# Count states by region (every region contributes exactly 2 by construction)
states_str %>%
  group_by(region) %>%
  count()
## # A tibble: 4 x 2
## # Groups:   region [4]
##      region     n
##      <fctr> <int>
## 1   Midwest     2
## 2 Northeast     2
## 3     South     2
## 4      West     2

Chapter 4: Case study

4.1 Inspect the data

# Read the evaluations data (evals.csv).
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() no longer
# converts strings to factors by default, and the output below shows the text
# columns (rank, ethnicity, gender, ...) as factors.
evals <- read.csv("/resources/rstudio/evals.csv", stringsAsFactors = TRUE)
# Inspect evals
glimpse(evals)
## Observations: 463
## Variables: 21
## $ score         <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank          <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity     <fctr> minority, minority, minority, minority, not min...
## $ gender        <fctr> female, female, female, female, male, male, mal...
## $ language      <fctr> english, english, english, english, english, en...
## $ age           <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval  <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students  <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level     <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs     <fctr> single, single, single, single, multiple, multi...
## $ cls_credits   <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower   <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper   <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper   <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper   <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper   <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg       <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit    <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color     <fctr> color, color, color, color, color, color, color...

4.2 Identify variable types

# Inspect variable types
glimpse(evals)
## Observations: 463
## Variables: 21
## $ score         <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5...
## $ rank          <fctr> tenure track, tenure track, tenure track, tenur...
## $ ethnicity     <fctr> minority, minority, minority, minority, not min...
## $ gender        <fctr> female, female, female, female, male, male, mal...
## $ language      <fctr> english, english, english, english, english, en...
## $ age           <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, ...
## $ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000...
## $ cls_did_eval  <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24,...
## $ cls_students  <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, ...
## $ cls_level     <fctr> upper, upper, upper, upper, upper, upper, upper...
## $ cls_profs     <fctr> single, single, single, single, multiple, multi...
## $ cls_credits   <fctr> multi credit, multi credit, multi credit, multi...
## $ bty_f1lower   <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_f1upper   <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, ...
## $ bty_f2upper   <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, ...
## $ bty_m1lower   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m1upper   <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ bty_m2upper   <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ...
## $ bty_avg       <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000,...
## $ pic_outfit    <fctr> not formal, not formal, not formal, not formal,...
## $ pic_color     <fctr> color, color, color, color, color, color, color...
str(evals) # Another option
## 'data.frame':    463 obs. of  21 variables:
##  $ score        : num  4.7 4.1 3.9 4.8 4.6 4.3 2.8 4.1 3.4 4.5 ...
##  $ rank         : Factor w/ 3 levels "teaching","tenure track",..: 2 2 2 2 3 3 3 3 3 3 ...
##  $ ethnicity    : Factor w/ 2 levels "minority","not minority": 1 1 1 1 2 2 2 2 2 2 ...
##  $ gender       : Factor w/ 2 levels "female","male": 1 1 1 1 2 2 2 2 2 1 ...
##  $ language     : Factor w/ 2 levels "english","non-english": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age          : int  36 36 36 36 59 59 59 51 51 40 ...
##  $ cls_perc_eval: num  55.8 68.8 60.8 62.6 85 ...
##  $ cls_did_eval : int  24 86 76 77 17 35 39 55 111 40 ...
##  $ cls_students : int  43 125 125 123 20 40 44 55 195 46 ...
##  $ cls_level    : Factor w/ 2 levels "lower","upper": 2 2 2 2 2 2 2 2 2 2 ...
##  $ cls_profs    : Factor w/ 2 levels "multiple","single": 2 2 2 2 1 1 1 2 2 2 ...
##  $ cls_credits  : Factor w/ 2 levels "multi credit",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ bty_f1lower  : int  5 5 5 5 4 4 4 5 5 2 ...
##  $ bty_f1upper  : int  7 7 7 7 4 4 4 2 2 5 ...
##  $ bty_f2upper  : int  6 6 6 6 2 2 2 5 5 4 ...
##  $ bty_m1lower  : int  2 2 2 2 2 2 2 2 2 3 ...
##  $ bty_m1upper  : int  4 4 4 4 3 3 3 3 3 3 ...
##  $ bty_m2upper  : int  6 6 6 6 3 3 3 3 3 2 ...
##  $ bty_avg      : num  5 5 5 5 3 ...
##  $ pic_outfit   : Factor w/ 2 levels "formal","not formal": 2 2 2 2 2 2 2 2 2 2 ...
##  $ pic_color    : Factor w/ 2 levels "black&white",..: 2 2 2 2 2 2 2 2 2 2 ...

# Names of the evals columns that str() reports as factors: cat_vars
cat_vars <- c(
  "rank", "ethnicity", "gender", "language", # instructor attributes
  "cls_level", "cls_profs", "cls_credits",   # class attributes
  "pic_outfit", "pic_color"                  # picture attributes
)

4.3 Recode a variable

# Recode cls_students as cls_type: evals
# Class-size bands: <= 18 "small", 19-59 "midsize", 60 or more "large".
# The thresholds cascade, so the inner test only needs its upper bound. This
# also stops a value strictly between 18 and 19 from falling through to
# "large". Results are unchanged for whole-number class sizes.
evals <- evals %>%
  mutate(
    cls_type = ifelse(
      cls_students <= 18, "small",
      ifelse(cls_students <= 59, "midsize", "large")
    )
  )

4.4 Create a scatterplot

# Plot score (y) against bty_avg (x), one point per row of evals
ggplot(data = evals, mapping = aes(x = bty_avg, y = score)) +
  geom_point()

4.5 Create a scatterplot, with an added layer

# Same score-vs-bty_avg scatterplot, with point color mapped to cls_type
ggplot(
  data = evals,
  mapping = aes(x = bty_avg, y = score, color = cls_type)
) +
  geom_point()

Quiz 3

You’re tasked with examining whether federal spending is positively related to the standard of living. Use the county data set in the openintro package. Examine the relationship between fed_spend and income by following the instructions below.

Randomly sample 150 counties in the US.

# Load the county data set
data(county)

# Fix the RNG seed so the same 150 counties are drawn on every run
set.seed(2018)

# Sample 150 counties (each row of US_states is a county, despite the name)
US_states <- county %>%
  sample_n(size = 150)

What type of variables are they? Use the glimpse function.

#glimpse sample

glimpse(US_states)
## Observations: 150
## Variables: 10
## $ name          <fctr> Lamb County, Marlboro County, Wood County, Mont...
## $ state         <fctr> Texas, South Carolina, Texas, Tennessee, Nebras...
## $ pop2000       <dbl> 14709, 28818, 36752, 134768, 4089, 11400, 26873,...
## $ pop2010       <dbl> 13977, 28933, 41964, 172331, 3821, 10511, 26570,...
## $ fed_spend     <dbl> 9.864706, 10.183839, 8.384973, 6.016381, 13.2394...
## $ poverty       <dbl> 17.9, 27.5, 14.0, 14.6, 10.5, 10.3, 19.2, 16.0, ...
## $ homeownership <dbl> 75.1, 66.2, 81.4, 65.1, 70.8, 80.1, 83.2, 70.8, ...
## $ multiunit     <dbl> 6.1, 10.7, 4.5, 20.4, 8.6, 6.8, 4.5, 11.1, 31.2,...
## $ income        <dbl> 17553, 13817, 21682, 22092, 22263, 22088, 18402,...
## $ med_income    <dbl> 35458, 27688, 41277, 48930, 42010, 42698, 39543,...

Discuss the nature of this study by addressing the following:

Is this an observational study or an experiment? Why?

This is an observational study. An experiment would require the researcher to assign a treatment to the subjects; this study only looks at existing data.

Does it involve random sampling or random assignment?

It involves random sampling: the 150 counties were drawn at random, but no treatment was randomly assigned.

Can you infer causation? Or just association? Why?

Only association can be inferred because, in an observational study, other factors could be responsible for the relationship. You can only infer causation from an experimental study.

Is your conclusion generalizable to the population as a whole? Why?

Yes. Because the counties were sampled at random, the sample should be representative of US counties as a whole. If you notice a general trend, it becomes easier to relay this information. The more data collected, the more accurate the trend can be.

Create a scatter plot of fed_spend on the y axis and income on the x axis. Interpret.

# fed_spend (y) against income (x) for the 150 sampled counties
ggplot(data = US_states, mapping = aes(x = income, y = fed_spend)) +
  geom_point()

Can you think of any confounding variable? Briefly discuss.

A confounding variable is a variable that is not noticed but can ultimately affect the end result. A good example of a confounding variable in this analysis is high employment rates vs. lower employment rates in these counties.