# Load the packages
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(gridExtra)))

# Read in the data files
food_consumption <- read.csv("food_consumption.csv")
amir_deals <- read.csv("amir_deals.csv")
world_happiness <- read.csv("world_happiness.csv")

Summary Statistics

  • Statistics
    • The field of statistics - The practice and study of collecting and analyzing data
    • A summary statistic - A fact about or summary of some data
  • Types of statistics
    • Descriptive statistics - Describe and summarize data
    • Inferential statistics - Use a sample of data to make inferences about a larger population
  • Types of data
    • Numeric (Quantitative)
      • Continuous (Measured)
      • Discrete (Counted)
    • Categorical (Qualitative) - Can be represented as numbers
      • Nominal (Unordered)
      • Ordinal (Ordered)

Measures of center

  • Measures of center
    • Mean - Much more sensitive to extreme values than the median; works better for symmetric data
    • Median - Better to use when data is skewed
    • Mode - Most frequent value
  • Skewness (see the sketch after this list)
    • Left-skewed (Negative-skewed) - Mean < Median < Mode
    • Right-skewed (Positive-skewed) - Mean > Median > Mode
    • Symmetric (Zero-skewed) - Mean = Median = Mode
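
A quick way to see these orderings is to compare the mean and median on simulated samples. A minimal sketch (using an exponential sample as a stand-in for right-skewed data):

# Right-skewed sample: the long right tail pulls the mean above the median
set.seed(42)
right_skewed <- rexp(1000, rate = 1)
mean(right_skewed)    # larger than the median
median(right_skewed)
# Roughly symmetric sample: mean and median nearly coincide
symmetric <- rnorm(1000)
mean(symmetric)
median(symmetric)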

Mean

# Mean
mean(msleep$sleep_total)
## [1] 10.43373

Median

# Median
median(msleep$sleep_total) # sort(msleep$sleep_total)[42]
## [1] 10.1

Mode

# Mode
msleep %>% count(sleep_total, sort = TRUE)
msleep %>% count(vore, sort = TRUE)
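
count() shows the full frequency table; to extract just the mode, a small helper works. A minimal sketch (most_frequent() is a hypothetical helper, not part of the course code):

# Hypothetical helper: return the most frequent value(s) of a vector
# (table() drops NAs by default; results come back as character strings)
most_frequent <- function(x) {
  tab <- table(x)
  names(tab)[tab == max(tab)]
}
most_frequent(msleep$vore)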

Mean and Median

food_consumption %>% 
  filter(country %in% c("Belgium", "USA")) %>% 
  group_by(country) %>% 
  summarise(mean_consumption = mean(consumption),
            median_consumption = median(consumption))

Plot

food_consumption %>% 
  filter(food_category == "rice") %>% 
  summarise(mean_co2 = mean(co2_emission),
            median_co2 = median(co2_emission))
food_consumption %>% 
  filter(food_category == "rice") %>% 
  # Histogram of co2_emission
  ggplot(aes(co2_emission)) +
    geom_histogram(bins = 30) +
    geom_vline(aes(xintercept = mean(co2_emission)), color = "green") +
    geom_text(aes(x = mean(co2_emission), y = 0, label = round(mean(co2_emission), 2)), vjust = 1) +
    geom_vline(aes(xintercept = median(co2_emission)), color = "blue") +
    geom_text(aes(x = median(co2_emission), y = 0, label = median(co2_emission)), vjust = 1)

Measures of spread

  • Measures of spread
    • Variance - Average squared distance from each data point to the data’s mean
      • The higher the variance, the more spread out the data is
      • The units of variance are squared
      • Sensitive to outliers (rely on mean)
    • Standard deviation
      • The units are not squared
      • Sensitive to outliers (rely on mean)
    • Mean absolute deviation (MAD)
      • SD vs. MAD
        • SD squares distances, penalizing longer distances more than shorter ones
        • MAD penalizes each distance equally
        • One isn’t better than the other, but SD is more common than MAD (compared in the sketch after this list)
    • Interquartile range (IQR) - The distance between the 25th and 75th percentiles, which is also the height of the box in a boxplot
      • Quartiles - Split the data into 4 equal parts
        • Second quartile/50th percentile = median
        • Boxplots visualize quartiles
      • Quintiles - Split the data into 5 equal parts
      • Deciles - Split the data into 10 equal parts
      • Quantiles - The general term; percentiles split the data into 100 equal parts
      • Outliers - data < Q1 − 1.5 × IQR or data > Q3 + 1.5 × IQR
      • Less influenced by outliers
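
To see how SD penalizes long distances more than MAD, compare both on the same data before and after adding an outlier. A minimal sketch with made-up numbers:

# SD vs. MAD on a small sample
x <- c(4, 5, 5, 6, 6, 7)
sd(x)
mean(abs(x - mean(x)))          # MAD
# Add a single large outlier
x_out <- c(x, 30)
sd(x_out)                       # inflated more (distances are squared)
mean(abs(x_out - mean(x_out)))  # inflated less (distances weighted equally)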

Variance

# Variance
var(msleep$sleep_total)
## [1] 19.80568
# Manual calculation
dists <- msleep$sleep_total - mean(msleep$sleep_total)
squared_dists <- (dists)^2
sum_sq_dists <- sum(squared_dists)
sum_sq_dists/82 # divide by n - 1 = 83 - 1 = 82 for the sample variance
## [1] 19.80568

Standard deviation

# Standard deviation
sd(msleep$sleep_total)
## [1] 4.450357
# Manual calculation
sqrt(var(msleep$sleep_total))
## [1] 4.450357

Mean absolute deviation

# Mean absolute deviation
dists <- msleep$sleep_total - mean(msleep$sleep_total)
mean(abs(dists))
## [1] 3.566701

Quantiles

# Quartiles
quantile(msleep$sleep_total)
##    0%   25%   50%   75%  100% 
##  1.90  7.85 10.10 13.75 19.90
# Quantiles
quantile(msleep$sleep_total, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1))
##    0%   20%   40%   60%   80%  100% 
##  1.90  6.24  9.48 11.14 14.40 19.90
quantile(msleep$sleep_total, probs = seq(0, 1, 0.2))
##    0%   20%   40%   60%   80%  100% 
##  1.90  6.24  9.48 11.14 14.40 19.90
# Boxplots
ggplot(msleep, aes(y = sleep_total)) +
  geom_boxplot()

# IQR
quantile(msleep$sleep_total, 0.75) - quantile(msleep$sleep_total, 0.25)
## 75% 
## 5.9
# Outlier
iqr <- quantile(msleep$bodywt, 0.75) - quantile(msleep$bodywt, 0.25)
lower_threshold <- quantile(msleep$bodywt, 0.25) - 1.5 * iqr
upper_threshold <- quantile(msleep$bodywt, 0.75) + 1.5 * iqr
msleep %>% 
  filter(bodywt < lower_threshold | bodywt > upper_threshold ) %>% 
  select(name, vore, sleep_total, bodywt)
# Calculate total co2_emission per country: emissions_by_country
emissions_by_country <- food_consumption %>% 
  group_by(country) %>% 
  summarize(total_emission = sum(co2_emission))

# Compute the first and third quartiles and IQR of total_emission
q1 <- quantile(emissions_by_country$total_emission, 0.25)
q3 <- quantile(emissions_by_country$total_emission, 0.75)
iqr <- q3 - q1

# Calculate the lower and upper cutoffs for outliers
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr

# Filter emissions_by_country to find outliers
emissions_by_country %>% 
  filter(total_emission < lower | total_emission > upper)

Plot

# Calculate variance and sd of co2_emission for each food_category
food_consumption %>% 
  group_by(food_category) %>% 
  summarize(var_co2 = var(co2_emission),
            sd_co2 = sd(co2_emission))
# Plot food_consumption with co2_emission on x-axis
ggplot(data = food_consumption, aes(co2_emission)) +
  geom_histogram() +
  # Create a separate sub-graph for each food_category
  facet_wrap(~ food_category)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Random Numbers and Probability

  • Probability of an event
    • \(P(\text{event}) = \frac{\text{# ways event can happen}}{\text{total # of possible outcomes}}\)
    • [0, 1]
  • Independent events - Two events are independent if the probability of the second event isn’t affected by the outcome of the first event
    • Sampling with replacement
  • Dependent events - Two events are dependent if the probability of the second event is affected by the outcome of the first event
    • Sampling without replacement

Sampling (Seeds)

sales_counts <- data.frame(
  name = c("Amir", "Brian", "Claire", "Damian"),
  n_sales = c(178, 126, 75, 69)
)

# Sampling without seeds
sales_counts %>% sample_n(1)
sales_counts %>% sample_n(1)
# Sampling with seeds
# The seed is a number that R's random number generator uses as a starting point
set.seed(5)
sales_counts %>% sample_n(1)
set.seed(5)
sales_counts %>% sample_n(1)

Sampling (Replacement)

# Sampling without replacement
sales_counts %>% sample_n(2)
# Sampling with replacement
sales_counts %>% sample_n(5, replace = TRUE)

Probability

# Calculate probability of picking a deal with each product
amir_deals %>% 
  count(product) %>% 
  mutate(prob = n / sum(n))

Probability distributions

  • Probability distribution - Describes the probability of each possible outcome in a scenario
    • Probability = Area
    • Total area = 1
    • Discrete distributions - Describe probabilities for discrete outcomes
    • Continuous distributions
  • Expected value - Mean of a probability distribution
  • Law of large numbers - As the size of your sample increases, the sample mean will approach the expected value (theoretical mean)

Uniform distribution

  • Uniform distribution - A distribution with constant probability: every outcome is equally likely
  • \(U(a, b)\)
    • \(a\): minimum
    • \(b\): maximum
  • Discrete uniform distribution
  • Continuous uniform distribution
    • dunif(x, min = 0, max = 1, log = FALSE)
    • punif(q, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
    • qunif(p, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
    • runif(n, min = 0, max = 1)
# Discrete uniform distributions
die <- data.frame(n = c(1, 2, 3, 4, 5, 6))

# Expected value
mean(die$n)
## [1] 3.5
# Sampling from discrete distributions
rolls_10 <- die %>% sample_n(10, replace = TRUE)
g1 <- ggplot(rolls_10, aes(n)) +
  geom_histogram(bins = 6) +
  labs(title = "Sample size 10")
mean(rolls_10$n)
## [1] 3.9
rolls_1000 <- die %>% sample_n(1000, replace = TRUE)
g2 <- ggplot(rolls_1000, aes(n)) +
  geom_histogram(bins = 6) +
  labs(title = "Sample size 1000")
mean(rolls_1000$n)
## [1] 3.5
rolls_100000 <- die %>% sample_n(100000, replace = TRUE)
g3 <- ggplot(rolls_100000, aes(n)) +
  geom_histogram(bins = 6) +
  labs(title = "Sample size 100000")
mean(rolls_100000$n)
## [1] 3.49046
grid.arrange(g1, g2, g3, nrow = 1)

restaurant_groups <- data.frame(
    group_id = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"),
    group_size = c(2, 4, 6, 2, 2, 2, 3, 2, 4, 2)
)

# Create a histogram of group_size
ggplot(restaurant_groups, aes(group_size)) +
  geom_histogram(bins = 5)

# Create probability distribution
size_distribution <- restaurant_groups %>% 
  count(group_size) %>% 
  mutate(probability = n / sum(n))
size_distribution
# Calculate expected group size
expected_val <- sum(size_distribution$group_size * size_distribution$probability)
expected_val
## [1] 2.9
# Calculate probability of picking group of 4 or more
size_distribution %>% 
  filter(group_size >= 4) %>% 
  summarise(prob_4_or_more = sum(probability))
# Continuous uniform distributions
# Min and max wait times for back-up that happens every 30 min
min <- 0
max <- 30

data.frame(
  # P(wait time ≤ 5)
  prob_less_than_5 = punif(5, min, max),
  # P(wait time ≥ 5)
  prob_greater_than_5 = punif(5, min, max, lower.tail = FALSE),
  # P(10 ≤ wait time ≤ 20)
  prob_between_10_and_20 = punif(20, min, max) - punif(10, min, max)
)
wait_times <- data.frame(simulation_nb = c(1:100000))

set.seed(334)

# Generate 100000 wait times between 0 and 30 mins, save in time column
wait_times %>% 
  mutate(time = runif(100000, min, max)) %>% 
  # Histogram of simulated times
  ggplot(aes(time)) +
    geom_histogram(bins = 30)

Binomial distribution

  • Binomial distribution - Discrete probability distribution of the number of successes in a sequence of independent trials, each with the same probability of success
    • \(B(n, p)\)
      • \(n\) - Total number of trials - size
      • \(p\) - Probability of success - prob
    • dbinom(x, size, prob, log = FALSE)
    • pbinom(q, size, prob, lower.tail = TRUE, log.p = FALSE)
    • qbinom(p, size, prob, lower.tail = TRUE, log.p = FALSE)
    • rbinom(n, size, prob)
    • Expected value = \(n\) × \(p\)
# Binomial distribution
# rbinom(n = number of draws, size = number of coins flipped, prob = probability of heads/success)
# A single flip
rbinom(1, 1, 0.5)
## [1] 0
# One flip many times
rbinom(8, 1, 0.5)
## [1] 1 0 0 1 0 0 1 1
# Many flips one time
rbinom(1, 8, 0.5)
## [1] 3
# Many flips many times
rbinom(10, 3, 0.5)
##  [1] 1 0 1 1 2 1 3 1 2 1
# P(heads = 7)
# dbinom(num heads, num coins, prob of heads)
dbinom(7, 10, 0.5)
## [1] 0.1171875
# P(heads ≤ 7)
pbinom(7, 10, 0.5)
## [1] 0.9453125
# P(heads > 7)
pbinom(7, 10, 0.5, lower.tail = FALSE)
## [1] 0.0546875
1 - pbinom(7, 10, 0.5)
## [1] 0.0546875
# Simulate a single deal
set.seed(10)
rbinom(1, 1, 0.3)
## [1] 0
# Simulate 1 week of 3 deals
set.seed(10)
rbinom(1, 3, 0.3)
## [1] 1
# Simulate 52 weeks of 3 deals
set.seed(10)
deals <- rbinom(52, 3, 0.3)

# Calculate mean deals won per week
mean(deals)
## [1] 0.8076923
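
For comparison, the theoretical expected value of \(B(3, 0.3)\) is \(n \times p\); the simulated mean above should approach it as more weeks are simulated:

# Expected number of deals won per week: n * p
3 * 0.3
## [1] 0.9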
# Probability of closing 3 out of 3 deals
dbinom(3, 3, 0.3)
## [1] 0.027
# Probability of closing <= 1 deal out of 3 deals
pbinom(1, 3, 0.3)
## [1] 0.784
# Probability of closing > 1 deal out of 3 deals
pbinom(1, 3, 0.3, lower.tail = FALSE)
## [1] 0.216
data.frame(n = rbinom(10000, 10, .5)) %>% 
  ggplot(aes(n)) +
    geom_histogram(bins = 11)

Normal distribution

  • Normal distribution
    • Continuous distribution
    • Bell curve
    • Symmetrical
    • Area = 1
    • Curve never hits 0
    • \(N(\mu, \sigma)\)
      • \(\mu\): Mean
      • \(\sigma\): Standard deviation
    • dnorm(x, mean = 0, sd = 1, log = FALSE)
    • pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
    • qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
    • rnorm(n, mean = 0, sd = 1)
    • Standard normal distribution - Normal distribution with mean 0 and a standard deviation of 1
    • 68-95-99.7 rule
      • 68% falls within 1 standard deviation of the mean
      • 95% falls within 2 standard deviations
      • 99.7% falls within 3 standard deviations
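
The rule can be checked directly against the standard normal with pnorm(); a quick sketch:

# Fraction of a normal distribution within 1, 2, and 3 sds of the mean
pnorm(1) - pnorm(-1)   # ~0.68
pnorm(2) - pnorm(-2)   # ~0.95
pnorm(3) - pnorm(-3)   # ~0.997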
# Normal distribution
# P (Height < 154 cm)
pnorm(154, mean = 161, sd = 7) # Distribution function
## [1] 0.1586553
# P (Height > 154 cm)
pnorm(154, mean = 161, sd = 7, lower.tail = FALSE)
## [1] 0.8413447
# P (154 cm < Height < 157 cm)
pnorm(157, mean = 161, sd = 7) - pnorm(154, mean = 161, sd = 7)
## [1] 0.1251993
# Height that 90% of women are shorter than
qnorm(0.9, mean = 161, sd = 7) # Quantile function
## [1] 169.9709
# Height that 90% of women are taller than
qnorm(0.9, mean = 161, sd = 7, lower.tail = FALSE)
## [1] 152.0291
# Generate 10 random heights
rnorm(10, mean = 161, sd = 7) # Generates random deviates
##  [1] 165.6326 169.2622 162.9009 157.7776 161.8106 154.9148 172.4883 153.1321
##  [9] 158.4192 163.4099
# Density at Height = 160 cm (dnorm gives a density, not a probability, for continuous variables)
dnorm(160, mean = 161, sd = 7) # Density
## [1] 0.05641316
# Histogram of amount with 10 bins
ggplot(data = amir_deals, aes(amount)) +
  geom_histogram(bins = 10)

# Probability of deal < 7500
pnorm(7500, 5000, 2000)
## [1] 0.8943502
# Probability of deal > 1000
pnorm(1000, 5000, 2000, lower.tail = FALSE)
## [1] 0.9772499
# Probability of deal between 3000 and 7000
pnorm(7000, 5000, 2000) - pnorm(3000, 5000, 2000)
## [1] 0.6826895
# Calculate amount that 75% of deals will be more than
qnorm(0.75, 5000, 2000, lower.tail = FALSE)
## [1] 3651.02
new_sales <- data.frame(sale_num = c(1:36))

# Calculate new average amount
new_mean <- 5000 * 1.2

# Calculate new standard deviation
new_sd <- 2000 * 1.3

# Simulate 36 sales
new_sales <- new_sales %>% 
  mutate(amount = rnorm(36, new_mean, new_sd))

# Create histogram with 10 bins
ggplot(new_sales, aes(amount)) +
  geom_histogram(bins = 10)

Poisson distribution

  • Poisson processes - Events appear to happen at a certain rate, but completely at random
    • Examples
      • Number of animals adopted from an animal shelter per week
      • Number of people arriving at a restaurant per hour
  • Poisson distribution - Probability of some # of events occurring over a fixed period of time
    • Examples
      • Probability of >= 5 animals adopted from an animal shelter per week
      • Probability of 12 people arriving at a restaurant per hour
    • Discrete distribution
    • The distribution’s peak is near its \(\lambda\) value (the mode is \(\lfloor \lambda \rfloor\))
    • \(P(\lambda)\)
      • \(\lambda\): Average number of events per time interval
    • dpois(x, lambda, log = FALSE)
    • ppois(q, lambda, lower.tail = TRUE, log.p = FALSE)
    • qpois(p, lambda, lower.tail = TRUE, log.p = FALSE)
    • rpois(n, lambda)
    • Expected value: \(\lambda\)
    • The Poisson distribution approaches a normal distribution for large \(\lambda\) (roughly \(\lambda > 10\)); see the sketch below
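
A minimal sketch of that normal approximation, comparing the Poisson CDF with a normal CDF of mean \(\lambda\) and sd \(\sqrt{\lambda}\):

# For large lambda, Poisson(lambda) is close to N(lambda, sqrt(lambda))
lambda <- 20
ppois(25, lambda)
pnorm(25, mean = lambda, sd = sqrt(lambda))
# A continuity correction (25.5 instead of 25) brings the two even closer
pnorm(25.5, mean = lambda, sd = sqrt(lambda))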
# Poisson distribution
lambdas <- c(1, 2, 4, 5, 8)

ggplot(data = data.frame(x = 0:10)) +
  lapply(lambdas, function(l) geom_point(aes(x, dpois(x, lambda = l), col = factor(l)))) +
  lapply(lambdas, function(l) geom_line(aes(x, dpois(x, lambda = l), col = factor(l)), linewidth = 1)) +
  labs(title = "Poisson Distribution", x = "Number of Events", y = "Probability") +
  guides(color = guide_legend(title = "lambda")) +
  theme(text = element_text(size=12))

# P (# adoptions in a week = 5)
dpois(5, lambda = 8)
## [1] 0.09160366
# P (# adoptions in a week ≤ 5)
ppois(5, lambda = 8)
## [1] 0.1912361
# P(# adoptions in a week > 5)
ppois(5, lambda = 8, lower.tail = FALSE)
## [1] 0.8087639
# Same probability if the average is 10 adoptions per week instead
ppois(5, lambda = 10, lower.tail = FALSE)
## [1] 0.932914
# Sampling from a Poisson distribution
rpois(10, lambda = 8)
##  [1]  9 11  5  9 12  6  9 11  7 11
sample_means <- data.frame(means = replicate(1000, rpois(10, lambda = 8) %>% mean()))
ggplot(sample_means, aes(means)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exponential distribution

  • Exponential distribution - Probability distribution of the time between events in a Poisson process
    • Examples
      • Probability of > 1 day between adoptions
      • Probability of < 10 minutes between restaurant arrivals
    • Continuous distribution
    • \(E(\lambda)\)
      • \(\lambda\): The rate, i.e. the average number of arrivals or occurrences of the event in a given time interval
    • dexp(x, rate = 1, log = FALSE)
    • pexp(q, rate = 1, lower.tail = TRUE, log.p = FALSE)
    • qexp(p, rate = 1, lower.tail = TRUE, log.p = FALSE)
    • rexp(n, rate = 1)
    • Expected value: \(1/\lambda\) (verified by simulation in the sketch below)
    • Poisson distribution vs Exponential distribution
      • For the Poisson distribution, the expected value represents the average frequency of events
        • Measures frequency in terms of rate or number of events
      • For the exponential distribution, the expected value represents the average time between events
        • Measures frequency in terms of time between events
      • For example, on average, 3 cars pass through an intersection in a minute
        • Poisson distribution: \(\lambda = 3\) and expected value = 3
        • Exponential distribution: \(\lambda = 3\) and expected value = 1/3
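
The \(1/\lambda\) expected value is easy to verify by simulation; a minimal sketch:

# Average time between events at a rate of 3 per minute: should be near 1/3
set.seed(1)
mean(rexp(100000, rate = 3))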
# Exponential distribution
lambdas <- c(0.5, 1, 1.5)

ggplot(data = data.frame(x = seq(0, 5, by = 0.1))) +
  lapply(lambdas, function(l) geom_line(aes(x, dexp(x, rate = l), col = factor(l)), linewidth = 1)) +
  labs(title = "Exponential Distribution", x = "Time Between Events", y = "") +
  guides(color = guide_legend(title = "rate")) +
  theme(text=element_text(size=12))

# On average, one customer service ticket is created every 2 minutes. λ = 0.5 customer service tickets created each minute
# P(wait < 1 min)
pexp(1, rate = 0.5)
## [1] 0.3934693
# P(wait > 4 min)
pexp(4, rate = 0.5, lower.tail = FALSE)
## [1] 0.1353353
# P(1 min < wait < 4 min)
pexp(4, rate = 0.5) - pexp(1, rate = 0.5)
## [1] 0.4711954

(Student’s) t-distribution

  • (Student’s) t-Distribution
    • Similar shape to the normal distribution
    • Degrees of freedom (df)
      • Affects the thickness of the tails
      • Lower df - Thicker tails, higher standard deviation
      • Higher df - Closer to normal distribution
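
R’s t-distribution functions follow the same d/p/q/r pattern as the other distributions; a brief sketch:

# P(T <= 2) for a t-distribution with 10 degrees of freedom
pt(2, df = 10)
# Critical value cutting off the top 2.5%; approaches qnorm(0.975) ~ 1.96 as df grows
qt(0.975, df = 10)
qt(0.975, df = 1000)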
# (Student's) t-distribution
df <- c(1, 5, 10)

ggplot(data = data.frame(x = seq(-3, 3, by = 0.1))) +
  lapply(df, function(l) geom_line(aes(x, dt(x, df = l), col = factor(l)), linewidth = 1)) +
  labs(title = "t-Distribution", x = "", y = "") +
  guides(color = guide_legend(title = "df")) +
  theme(text=element_text(size=12))

Log-normal distribution

  • Log-normal distribution
    • Variable whose logarithm is normally distributed
    • Example
      • Length of chess games
      • Adult blood pressure
      • Number of hospitalizations in the 2003 SARS outbreak
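
The defining property, that the logarithm of a log-normal variable is normally distributed, can be checked by simulation; a minimal sketch:

# Draw log-normal samples and histogram their logs: the result should look normal
set.seed(7)
lnorm_samples <- rlnorm(10000, meanlog = 0, sdlog = 1)
ggplot(data.frame(log_x = log(lnorm_samples)), aes(log_x)) +
  geom_histogram(bins = 30)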
# Log-normal distribution
sd <- c(.5, 1, 1.5)

ggplot(data = data.frame(x = seq(0, 3, by = 0.05))) +
  lapply(sd, function(l) geom_line(aes(x, dlnorm(x, sdlog = l), col = factor(l)), linewidth = 1)) +
  labs(title = "Log Distribution", x = "", y = "") +
  guides(color = guide_legend(title = "sd")) +
  theme(text=element_text(size=12))

The central limit theorem

  • Central limit theorem (CLT) - The sampling distribution of a statistic becomes closer to the normal distribution as the sample size increases
    • Sampling distribution - The distribution of a summary statistic across repeated samples
    • Samples should be random and independent
    • Applies to the sample mean, median, standard deviation, and proportion
    • Holds no matter the original distribution being sampled from
    • Lets you estimate characteristics of an unknown underlying distribution
    • Makes it easier to estimate characteristics of large populations
    • The mean of the sampling distribution is the mean of the population
    • The standard deviation of the sampling distribution is the standard deviation of the population divided by the square root of the sample size (checked by simulation in a sketch below)
    • \(\bar{X} \sim N(\mu, \frac{\sigma}{\sqrt{n}})\)
      • \(\bar{X}\) is the sampling distribution of the sample means
      • \(\sim\) means follows the distribution
      • \(N\) is the normal distribution
      • \(\mu\) is the mean of the population
      • \(\sigma\) is the standard deviation of the population
      • \(n\) is the sample size
    • Conditions of the central limit theorem
      • The sample size is sufficiently large. This condition is usually met if the sample size is n >= 30
      • The samples are independent and identically distributed (i.i.d.) random variables. This condition is usually met if the sampling is random
      • The population’s distribution has finite variance. Central limit theorem doesn’t apply to distributions with infinite variance, such as the Cauchy distribution. Most distributions have finite variance
# The central limit theorem
die <- c(1, 2, 3, 4, 5, 6)

# Rolling the die 5 times
sample_of_5 <- sample(die, 5, replace = TRUE)
sample_of_5
## [1] 3 3 5 4 4
mean(sample_of_5)
## [1] 3.8
# Rolling the die 5 times, repeated 10/100/1000 times
replicate(10, sample(die, 5, replace = TRUE) %>% mean())
##  [1] 3.2 2.8 3.8 2.4 2.8 4.4 3.2 4.0 2.2 3.8
sample_means_10 <- replicate(10, sample(die, 5, replace = TRUE) %>% mean())
sample_means_100 <- replicate(100, sample(die, 5, replace = TRUE) %>% mean())
sample_means_1000 <- replicate(1000, sample(die, 5, replace = TRUE) %>% mean())

p1 <- ggplot() + aes(sample_means_10) + geom_histogram(bins = 7)
p2 <- ggplot() + aes(sample_means_100) + geom_histogram(bins = 7)
p3 <- ggplot() + aes(sample_means_1000) + geom_histogram(bins = 7)

grid.arrange(p1, p2, p3, ncol=3)

# Estimate expected value of die
mean(sample_means_1000)
## [1] 3.4738
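
The CLT’s standard-deviation claim, \(\sigma/\sqrt{n}\), can be checked against the same simulated sample means; a minimal sketch (the population sd divides by \(n\), not \(n - 1\), since the six faces are the whole population):

# Population sd of the die
pop_sd <- sqrt(mean((die - mean(die))^2))
# Theoretical sd of the mean of 5 rolls vs. the simulated value
pop_sd / sqrt(5)
sd(sample_means_1000)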
# Standard deviation and the CLT
mean(replicate(1000, sample(die, 5, replace = TRUE) %>% sd()))
## [1] 1.642136
# Proportions and the CLT
sales_team <- c("Amir", "Brian", "Claire", "Damian")
mean(replicate(100000, mean(sample(sales_team, 10, replace = TRUE) == "Claire")))
## [1] 0.250635
# Create a histogram of num_users
g1 <- ggplot(amir_deals, aes(num_users)) +
  geom_histogram(bins = 10)

set.seed(104)

# Sample 20 num_users from amir_deals, take the mean, and repeat 100 times
sample_means <- replicate(100, sample(amir_deals$num_users, size = 20, replace = TRUE) %>% mean())

# Create data frame for plotting
samples <- data.frame(mean = sample_means)

# Histogram of sample means
g2 <- ggplot(samples, aes(mean)) +
  geom_histogram(bins = 10)

grid.arrange(g1, g2, nrow = 1)

Correlation

  • Correlation - The relationship between two variables
    • \(x\): explanatory/independent variable
    • \(y\): response/dependent variable
    • Only accounts for linear relationships
    • Correlation shouldn’t be used blindly, always visualize your data
  • Correlation coefficient - Quantifies the linear relationship between two variables
    • [−1, 1]
    • Magnitude corresponds to strength of relationship
    • Sign (+ or -) corresponds to direction of relationship
    • Pearson product-moment correlation (r)
      • \(r = \frac{\sum^{n}_{i=1} (x_i - \bar{x})(y_i - \bar{y})}{(n-1)\,s_x s_y}\) (verified by hand in a sketch below)
        • \(\bar{x}\): Mean of \(x\)
        • \(s_x\): Sample standard deviation of \(x\)
      • Most common
    • Kendall’s tau
    • Spearman’s rho
      • Both are rank-based; see the sketch after the correlation code below
  • Transformation to linear - When variables have skewed distributions
    • Log transformation: log(x)
    • Square root transformation: sqrt(x)
    • Reciprocal transformation: 1/x
    • Combinations of these - e.g. sqrt(x) and 1/y
  • Correlation does not imply causation: x being correlated with y does not mean x causes y
    • Spurious correlation - An association driven by a confounding (lurking) variable rather than a direct causal link
# Correlation
ggplot(world_happiness, aes(life_exp, happiness_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

cor(world_happiness$life_exp, world_happiness$happiness_score, use = "pairwise.complete.obs")
## [1] 0.7802249
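
The Pearson formula above can be verified by hand; a minimal sketch (for this pair of columns, drop_na() keeps the same rows that use = "pairwise.complete.obs" keeps):

# Manual Pearson r: matches cor() up to floating-point error
complete_rows <- world_happiness %>% 
  select(life_exp, happiness_score) %>% 
  drop_na()
with(complete_rows,
     sum((life_exp - mean(life_exp)) * (happiness_score - mean(happiness_score))) /
       ((nrow(complete_rows) - 1) * sd(life_exp) * sd(happiness_score)))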
g1 <- ggplot(msleep, aes(bodywt, awake)) +
  geom_point()
cor(msleep$bodywt, msleep$awake)
## [1] 0.3119801
# Log transformation
g2 <- msleep %>% 
  mutate(log_bodywt = log(bodywt)) %>% 
  ggplot(aes(log_bodywt, awake)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE)
cor(log(msleep$bodywt), msleep$awake)
## [1] 0.5687943
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
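
The rank-based alternatives listed earlier (Kendall’s tau and Spearman’s rho) are available through cor()’s method argument; a brief sketch on the same skewed pair:

# Rank-based correlations are less sensitive to outliers and skew than Pearson's r
cor(msleep$bodywt, msleep$awake, method = "spearman")
cor(msleep$bodywt, msleep$awake, method = "kendall")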

Design of experiments

  • Experiment aims to answer: What is the effect of the treatment on the response?
    • Treatment - Explanatory/independent variable
    • Response - Response/dependent variable
  • Controlled experiments
    • Treatment group - Receives the treatment
    • Control group - Does not receive the treatment
    • Groups should be comparable so that causation can be inferred. If not, this could lead to confounding (bias)
  • The gold standard of experiments to eliminate bias
    • Randomized controlled trial - Makes the groups comparable
      • Participants are assigned to treatment/control randomly, not based on any other characteristics
    • Placebo
    • Double-blind trial
  • Observational studies - Participants are not assigned randomly to groups. Participants assign themselves, usually based on pre-existing characteristics
    • Establish association, not causation
    • Effects can be confounded by factors that got certain people into the control or treatment group
  • Longitudinal studies - Participants are followed over a period of time to examine effect of treatment on response
    • More expensive, results take longer
  • Cross-sectional study - Data on participants is collected from a single snapshot in time
    • Cheaper, faster, more convenient