Weekly Homework

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ISLR)
library(moderndive)
library(skimr)

Big Picture

1.

A p-value is the probability of seeing results at least as extreme as the ones you saw, given that the null hypothesis (usually that there is no trend/correlation, i.e., just random chance) is true. To compute a p-value you need to know the null hypothesis and the sampling distribution of your test statistic under that null.
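
To make that concrete, here is a minimal sketch of my own (the N(20, 8) population and the observed mean of 22 are made-up numbers, not part of the assignment): simulate the sampling distribution of the statistic under the null, then count how often it is at least as extreme as what was observed.

set.seed(1)
null_means <- replicate(10000, mean(rnorm(30, 20, 8)))   # sampling distribution under H0: mu = 20
mean(abs(null_means - 20) >= abs(22 - 20))                # two-sided p-value for an observed mean of 22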

2.

There is some interval around a into which r will fall 85% of the time.
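
A quick simulation sketch of the same idea (again with made-up numbers: a is the mean of a N(20, 8) population and r is a sample mean of 30 draws):

set.seed(2)
a <- 20
r <- replicate(10000, mean(rnorm(30, a, 8)))
radius <- quantile(abs(r - a), 0.85)   # 85% of the r values land within this distance of a
mean(abs(r - a) <= radius)             # ~0.85 by construction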

Monday

3.

a \ b    1     2     3     4
    1    0     0.5   1     1.5
    2    0.5   2     3.5   5
    3    1     3.5   6     8.5
    4    1.5   5     8.5   12

Variance = 1.25

50% Confidence Interval = +/- 1.25. With this radius, 8 of the 16 equally likely values (50%), namely 0, 0.5, 0.5, 1, 1, 1.5, 1.5, 2, fall within the 0-2.5 interval; the other 8 fall outside.
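
A quick check of the 1.25, assuming it refers to the variance of a single fair 4-sided die:

x <- 1:4
mean(x^2) - mean(x)^2   # E[X^2] - E[X]^2 = 7.5 - 6.25 = 1.25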

Tuesday: Discrete Confidence Intervals

4.

Using the data created in question 5 below, we can see that a 47.5% confidence interval is +/- 1.5 and a 97.5% confidence interval is +/- 4.5 (cross-checked after the question 5 code).

5.

# a. cross join a 4-sided die (x) with a 10-sided die (y): 40 equally likely pairs
df_dice <- full_join(tibble(x = seq(4)), tibble(y = seq(10)), by=character())
# b. the statistic of interest: the absolute difference of the two dice
df_dice |> mutate(diff = abs(x-y)) -> df_dice
# c. its true mean over all 40 outcomes
df_dice |> summarise(mean(diff)) |> pull() -> true_mean
# d. each outcome's deviation from that mean
df_dice |> mutate(deviation = abs(diff - true_mean)) -> df_dice
# e. every outcome has probability 1/40
df_dice |> mutate(prob = 1/40) -> df_dice
# f. total probability of each distinct deviation
df_dice |> group_by(deviation) |> summarise(prob = n()/40) -> df_dice
# g. cumulative probability, from the smallest deviation outward
df_dice |> mutate(cum_prob = cumsum(prob))
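
As a cross-check of the radii quoted in question 4, here is a small helper of my own (read_radius is a hypothetical name, not part of the assignment) that returns the first deviation whose cumulative probability reaches a given level:

read_radius <- function(level) {
  df_dice |>
    mutate(cum_prob = cumsum(prob)) |>
    filter(cum_prob >= level - 1e-9) |>   # small tolerance for floating-point sums
    slice(1) |>
    pull(deviation)
}
read_radius(0.475)   # 1.5
read_radius(0.975)   # 4.5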

6.

# a. an unfair and a fair six-sided die, and every (unfair, fair) pair of faces
unfair <- tibble(side = 1:6, weight = c(0.4, 0.1, 0.1, 0.1, 0.1, 0.2))
fair <- tibble(side = 1:6, weight = 1/6)
df_dice_6 <- full_join(unfair, fair, by=character())
# sum of the two faces and the probability of each pair (the dice are independent)
df_dice_6 |> transmute(u_plus_f = side.x+side.y, prob = weight.x*weight.y) -> df_dice_6
# collapse to the distribution of the sum, then take its expected value
df_dice_6 |> group_by(u_plus_f) |> summarise(prob = sum(prob)) -> df_dice_6
df_dice_6 |> summarise(sum(u_plus_f * prob)) |> pull() -> e_uf
"e[u+f]"
## [1] "e[u+f]"
e_uf
## [1] 6.5
# b.
df_dice_6 |> summarise(sum(u_plus_f^2 * prob)) |> pull() -> e2_uf
var_uf <- e2_uf - e_uf^2
"var(u+f)"
## [1] "var(u+f)"
var_uf
## [1] 6.916667
# c.
df_dice_6 |> 
  mutate(deviation = abs(e_uf - u_plus_f)) |>
  group_by(deviation) |>
  summarise(sum_prob = sum(prob)) |>
  mutate(cum_prob = cumsum(sum_prob))

The 70% confidence interval is around +/- 2.5.
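
As a cross-check of the variance from part b: the two dice are independent, so Var(u + f) should equal Var(u) + Var(f).

var_u <- sum(unfair$weight * unfair$side^2) - sum(unfair$weight * unfair$side)^2   # 4
var_f <- sum(fair$weight * fair$side^2) - sum(fair$weight * fair$side)^2           # 35/12
var_u + var_f   # 6.916667, matching var_uf above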

Thursday: Continuous Confidence Intervals

7.

dnorm(28, 20, 8)
## [1] 0.03024634

8.

dunif(10, 5, 20)
## [1] 0.06666667

9.

dexp(1.5, 1/3)
## [1] 0.2021769
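
As a sanity check, the three densities above can be reproduced from the closed-form pdfs:

exp(-0.5) / (8 * sqrt(2 * pi))   # normal pdf at 28 with mean 20, sd 8
1 / (20 - 5)                     # uniform(5, 20) pdf
(1/3) * exp(-1.5/3)              # exponential(rate = 1/3) pdf at 1.5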

10.

# 30 pairs of draws from N(20, 8); the statistic is the mean of each pair
df <- tibble(n1 = rnorm(30, 20, 8), n2 = rnorm(30, 20, 8))
df <- df |> mutate(mean = (n1+n2)/2, p = 1/30)
# estimated mean of the statistic and each pair's deviation from it
est_mean <- df |> summarise(mean(mean)) |> pull()
df <- df |> mutate(deviation = abs(mean-est_mean))
df

11.

df |> 
  select(deviation, p) |> 
  arrange(deviation) |> 
  mutate(cp = cumsum(p))

2.8962… is the radius of the 30% confidence interval.
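
The same radius can be read off programmatically, which is essentially what the function in question 12 automates:

df |> 
  arrange(deviation) |> 
  mutate(cp = cumsum(p)) |>
  filter(cp >= 0.3 - 1e-9) |>   # small tolerance, since cumsum(1/30) can undershoot 0.3
  slice(1) |> 
  pull(deviation)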

12.

auto_confidence <- function(n) {
  df <- tibble(n1 = rnorm(n, 20, 8), n2 = rnorm(n, 20, 8)) |> 
    mutate(mean = (n1+n2)/2, p = 1/n)
  est_mean <- df |> summarise(mean(mean)) |> pull()
  radius <- df |> 
    mutate(deviation = abs(mean-est_mean)) |> 
    arrange(deviation) |> 
    mutate(cp = cumsum(p)) |>
    filter(cp >= 0.3) |>   # first deviation whose cumulative probability reaches 30%
    slice(1) |>
    pull(deviation)
  return(radius)
}

auto_confidenceV <- Vectorize(auto_confidence)

13.

# 30 simulated datasets, each of size 10
name <- tibble(x = rep.int(10, 30))
name <- name |> mutate(confidence = auto_confidenceV(x))
ggplot(name, aes(confidence)) +
  geom_histogram(binwidth = 1)

14.

auto_confidenceEXP <- function(n) {
  df <- tibble(n1 = rexp(n, 1/3), n2 = rexp(n, 1/3)) |> 
    mutate(mean = (n1+n2)/2, p = 1/n)
  est_mean <- df |> summarise(mean(mean)) |> pull()
  radius <- df |> 
    mutate(deviation = abs(mean-est_mean)) |> 
    arrange(deviation) |> 
    mutate(cp = cumsum(p)) |>
    filter(cp >= 0.3) |>   # first deviation whose cumulative probability reaches 30%
    slice(1) |>
    pull(deviation)
  return(radius)
}

auto_confidenceEXPV <- Vectorize(auto_confidenceEXP)
name2 <- tibble(x = rep.int(10, 50))
name2 <- name2 |> mutate(confidence = auto_confidenceEXPV(x))
ggplot(name2, aes(confidence)) +
  geom_histogram(binwidth = 0.5) +
  geom_rug()

Saturday: Continuous Expectation

15.

a.

probexp <- function (x) { dexp(x, rate=0.8) }
ggplot() +
    geom_function(fun=probexp) +
    scale_x_continuous(limits=c(0,5))

b.

dexp(1, rate=0.8)
## [1] 0.3594632

c.

pexp(1, rate=0.8)
## [1] 0.550671
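
This matches the exponential CDF directly:

1 - exp(-0.8 * 1)   # P(X <= 1) = 1 - e^(-rate * x)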

d.

# Riemann-sum approximation of the CDF on [0, 1] with step 0.01
df <- tibble(x=seq(0,1,0.01))
df |> 
  mutate(p = dexp(x, rate=0.8)) |>
  mutate(cp = cumsum(p*0.01))

e.

df <- tibble(x=seq(1.5,2,0.01)) 
df |> 
  mutate(p = dexp(x, rate=0.8)) |>
  mutate(cp = cumsum(p*0.01)) -> df
p1.5 <- df |> filter(x == 1.5) |> select(cp) |> pull()
p2 <- df |> filter(x == 2) |> select(cp) |> pull()

print(p2 - p1.5)
## [1] 0.09890103
print(pexp(2, 0.8) - pexp(1.5, 0.8))
## [1] 0.09929769

f.

df <- tibble(x=seq(0,1,0.01)) 
df |> 
  mutate(p = dexp(x, rate=0.8) / pexp(1, rate=0.8) * 0.01) |>
  summarise(ex = sum(x*p))
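
As a cross-check of this Riemann-sum estimate, the same conditional expectation via stats::integrate():

integrate(function(x) x * dexp(x, rate = 0.8), 0, 1)$value / pexp(1, rate = 0.8)   # about 0.434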

g.

df <- tibble(x=seq(1.5,2,0.01)) 
df |> 
  mutate(p = dexp(x, rate=0.8) / (pexp(2, rate=0.8)-pexp(1.5, 0.8)) * 0.01) |>
  summarise(ex = sum(x*p)) |>
  pull() -> e_x
e_x
## [1] 1.767839

h.

df <- tibble(x=seq(1.5,2,0.01)) 
df |> 
  mutate(p = dexp(x, rate=0.8) / (pexp(2, rate=0.8)-pexp(1.5, 0.8)) * 0.01) |>
  summarise(ex = sum(x^2*p)) |>
  pull() -> e2_x
e2_x
## [1] 3.085083

i.

# variance = E[X^2] - E[X]^2
var_x <- e2_x - e_x^2
var_x

Because the Riemann-sum weights in parts g and h sum to slightly more than 1 rather than exactly 1, this difference comes out slightly negative here; normalizing p to sum to 1 (or integrating exactly, as below) gives the true, small positive variance.
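
For comparison, the exact conditional variance on [1.5, 2] via numerical integration (p_window, m1, and m2 are my own temporary names):

p_window <- pexp(2, 0.8) - pexp(1.5, 0.8)
m1 <- integrate(function(x) x   * dexp(x, 0.8), 1.5, 2)$value / p_window
m2 <- integrate(function(x) x^2 * dexp(x, 0.8), 1.5, 2)$value / p_window
m2 - m1^2   # a small positive value, roughly 0.02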