Skoob: estimando tempo de acesso

library(tidyverse)

## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(here)

## here() starts at /cloud/project

library(DT)

Os dados

set.seed(12345)

skoob = read_csv("skoob.csv")

## Parsed with column specification:
## cols(
##   tempo = col_double()
## )

skoob = skoob %>% 
  sample_n(70) %>% 
  select(tempo)

glimpse(skoob)

## Rows: 70
## Columns: 1
## $ tempo <dbl> 0.432630, 0.446415, 0.449505, 0.492261, 0.462408, 0.480776, 0.4…

Informações

Theta chapéu

theta_c = skoob %>% 
  summarise(media = mean(tempo)) %>% 
  pull(media)

theta_c

## [1] 0.5129356

Fazendo o bootstrap

repeticoes = 4000

um_bootstrap <- function(x){
  tempo = x %>% pull(tempo)
  boot_x <- sample(tempo,           
                   size = NROW(tempo), 
                   replace = TRUE)
  return(mean(boot_x))
}

set.seed(1212)

# A REAMOSTRAGEM
reamostragens = tibble(i = 1:repeticoes) %>% 
  mutate(theta_c_s = map_dbl(i, ~ um_bootstrap(skoob)))

sketch = htmltools::withTags(table(
  tableHeader(skoob),
  tableFooter(skoob)
))

datatable(head(reamostragens, 4000), colnames = c('ID' = 1, 'Tempo (ms)' = 2), options = list(pageLength = 10, dom = 'tip'), rownames = FALSE)

Visualização

reamostragens %>%
  ggplot(aes(x = theta_c_s)) +
  geom_histogram(binwidth = 0.01,
                 colour = "darkorange",
                 fill = "white")

reamostragens %>%
  ggplot(aes(x = theta_c_s - theta_c)) +
  geom_histogram(binwidth = 0.01,
                 colour = "darkblue",
                 fill = "white")

Margens de erro

confianca = .95
alpha = 1 - confianca

intervalo = reamostragens %>% 
  mutate(erro = theta_c_s - theta_c) %>% 
  summarise(erro_i = quantile(erro, alpha / 2), 
            erro_s = quantile(erro, 1 - alpha /2)) %>% 
  mutate(valor_i = theta_c + erro_i, 
         valor_s = theta_c + erro_s)

datatable(head(intervalo, 4000), colnames = c('erro_i' = 1, 'erro_s' = 2, 'valor_i' = 3, 'valor_s' = 4), options = list(pageLength = 10, dom = 'tip'), rownames = FALSE)

Visualizando graficamente

ggplot() +
  geom_rect(
    data = intervalo,
    aes(xmin = valor_i, xmax = valor_s),
    ymin = -Inf,
    ymax = Inf,
    fill = "gold",
    alpha = .25
  ) +
  geom_histogram(
    data = reamostragens,
    aes(theta_c_s),
    binwidth = .01,
    fill = "white",
    colour = "darkgrey"
  ) +
  geom_vline(xintercept = theta_c,
             color = "blue",
             size = 1.2) +
  geom_vline(xintercept = theta_c, color = "dark green") +
  labs(title = expression("Intervalo estimado via bootstrap"))

Skoob: estimando tempo de acesso

Francicláudio Dantas da Silva

Os dados

Informações

Theta chapéu

Fazendo o bootstrap

Visualização

Margens de erro

Visualizando graficamente