library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ISLR)
library(moderndive)
library(skimr)
A p-value is the probability of seeing the results you saw given the null hypothesis (usually that there is no trend/correlation; random chance) is true. To compute a p-value you need to know…?
There is some interval around a in which 85% of the
time, r will fall in.
| a/b | 1 | 2 | 3 | 4 |
| 1 | 0 | 0.5 | 1 | 1.5 |
| 2 | 0.5 | 2 | 3.5 | 5 |
| 3 | 1 | 3.5 | 6 | 8.5 |
| 4 | 1.5 | 5 | 8.5 | 12 |
Variance = 1.25
50% Confidence Interval = +/- 1.25. This results in 8 of the 16 (50%): 0, 0.5, 0.5, 1, 1, 1.5, 1.5, 2 falling within the 0-2.5 interval. The other 8 fall outside.
Using the data we created below, we can see that a 47.5% confidence interval is +/- 1.5 , and a 97.5% confidence interval is +/- 4.5.
# a.
df_dice <- full_join(tibble(x = seq(4)), tibble(y = seq(10)), by=character())
# b.
df_dice |> mutate(diff = abs(x-y)) -> df_dice
# c.
df_dice |> summarise(mean(diff)) |> pull() -> true_mean
# d.
df_dice |> mutate(deviation = abs(diff - true_mean)) -> df_dice
# e.
df_dice |> mutate(prob = 1/40) -> df_dice
# f.
df_dice |> group_by(deviation) |> summarise(prob = n()/40) -> df_dice
# g.
df_dice |> mutate(cum_prob = cumsum(prob))
# a.
unfair <- tibble(side = 1:6, weight = c(0.4, 0.1, 0.1, 0.1, 0.1, 0.2))
fair <- tibble(side = 1:6, weight = 1/6)
df_dice_6 <- full_join(unfair, fair, by=character())
df_dice_6 |> summarise(u_plus_f = side.x+side.y, prob = weight.x*weight.y) -> df_dice_6
df_dice_6 |> group_by(u_plus_f) |> summarise(prob = sum(prob)) -> df_dice_6
df_dice_6 |> summarise(sum(u_plus_f * prob)) |> pull() -> e_uf
"e[u+f]"
## [1] "e[u+f]"
e_uf
## [1] 6.5
# b.
df_dice_6 |> summarise(sum(u_plus_f^2 * prob)) |> pull() -> e2_uf
var_uf <- e2_uf - e_uf^2
"var(u+f)"
## [1] "var(u+f)"
var_uf
## [1] 6.916667
# c.
df_dice_6 |>
mutate(deviation = abs(e_uf - u_plus_f)) |>
group_by(deviation) |>
summarise(sum_prob = sum(prob)) |>
mutate(cum_prob = cumsum(sum_prob))
dnorm(28, 20, 8)
## [1] 0.03024634
dunif(10, 5, 20)
## [1] 0.06666667
dexp(1.5, 1/3)
## [1] 0.2021769
df = tibble(n1 = rnorm(30, 20, 8), n2 = rnorm(30, 20, 8))
df <- df |> mutate(mean = (n1+n2)/2, p = 1/30)
est_mean <- df |> summarise(mean(mean)) |> pull()
df <- df |> mutate(deviation = abs(mean-est_mean))
df
df |>
select(deviation, p) |>
arrange(deviation) |>
mutate(cp = cumsum(p))
2.8962… is the radius 30% confidence interval
auto_confidence <- function(n) {
df <- tibble(n1 = rnorm(n, 20, 8), n2 = rnorm(n, 20, 8)) |>
mutate(mean = (n1+n2)/2, p = 1/30)
est_mean <- df |> summarise(mean(mean)) |> pull()
radius <- df |>
mutate(deviation = abs(mean-est_mean)) |>
arrange(deviation) |>
mutate(cp = cumsum(p)) |>
filter(cp == 0.3) |>
select(deviation) |>
pull()
return(radius)
}
auto_confidenceV <- Vectorize(auto_confidence)
name = tibble(x = rep.int(10, 30))
name <- name |> mutate(confidence = auto_confidenceV(x))
ggplot(name, aes(confidence)) +
geom_histogram(binwidth = 1)
auto_confidenceEXP <- function(n) {
df <- tibble(n1 = rexp(n, 1/3), n2 = rexp(n, 1/3)) |>
mutate(mean = (n1+n2)/2, p = 1/30)
est_mean <- df |> summarise(mean(mean)) |> pull()
radius <- df |>
mutate(deviation = abs(mean-est_mean)) |>
arrange(deviation) |>
mutate(cp = cumsum(p)) |>
filter(cp == 0.3) |>
select(deviation) |>
pull()
return(radius)
}
auto_confidenceEXPV <- Vectorize(auto_confidence)
name2 = tibble(x = rep.int(10, 50))
name2 <- name2 |> mutate(confidence = auto_confidenceV(x))
ggplot(name2, aes(confidence)) +
geom_histogram(binwidth = 0.5) +
geom_rug()
probexp <- function (x) { dexp(x, rate=0.8) }
ggplot() +
geom_function(fun=probexp) +
scale_x_continuous(limits=c(0,5))
dexp(1, rate=0.8)
## [1] 0.3594632
pexp(1, rate=0.8)
## [1] 0.550671
df = tibble(x=seq(0,1,0.01))
df |>
mutate(p = dexp(x, rate=0.8)) |>
mutate(cp = cumsum(p*0.01))
df <- tibble(x=seq(1.5,2,0.01))
df |>
mutate(p = dexp(x, rate=0.8)) |>
mutate(cp = cumsum(p*0.01)) -> df
p1.5 <- df |> filter(x == 1.5) |> select(cp) |> pull()
p2 <- df |> filter(x == 2) |> select(cp) |> pull()
print(p2 - p1.5)
## [1] 0.09890103
print(pexp(2, 0.8) - pexp(1.5, 0.8))
## [1] 0.09929769
df <- tibble(x=seq(0,1,0.01))
df |>
mutate(p = dexp(x, rate=0.8) / pexp(1, rate=0.8) * 0.01) |>
summarise(ex = sum(x*p))
df <- tibble(x=seq(1.5,2,0.01))
df |>
mutate(p = dexp(x, rate=0.8) / (pexp(2, rate=0.8)-pexp(1.5, 0.8)) * 0.01) |>
summarise(ex = sum(x*p)) |>
pull() -> e_x
e_x
## [1] 1.767839
df <- tibble(x=seq(1.5,2,0.01))
df |>
mutate(p = dexp(x, rate=0.8) / (pexp(2, rate=0.8)-pexp(1.5, 0.8)) * 0.01) |>
summarise(ex = sum(x^2*p)) |>
pull() -> e2_x
e2_x
## [1] 3.085083
var_x <- e2_x - e_x
var_x
## [1] 1.317244