library(ggplot2, quietly = T, warn.conflicts = F)
library(dplyr, quietly = T, warn.conflicts = F)

Discussão

Lembrando de amostra vs. população
caso dos salários vs. caso dos gastos
Histogramas. Density plots.
Variáveis aleatórias
Distribuições
Quais as que você conhece?

Sobre histogramas, CDFs e gráficos de densidade

gastos <- read.csv("AnoAtual.csv")

divulgacoes <- gastos %>% 
                filter(txtDescricao == "DIVULGAÇÃO DA ATIVIDADE PARLAMENTAR.") %>% 
                select(vlrLiquido, sgUF)

ggplot(divulgacoes, aes(vlrLiquido)) + 
  geom_density() + 
  #geom_histogram(binwidth = 1000) + 
  theme_bw()

ggplot(filter(divulgacoes, sgUF %in% c("PB", "SP", "RS")), aes(vlrLiquido, colour = sgUF)) + 
  stat_ecdf()

ggplot(filter(divulgacoes, sgUF %in% c("PB", "SP", "RS")), 
       aes(vlrLiquido,fill = sgUF, colour = sgUF)) + 
  geom_density(alpha = 0.3) + 
  theme_bw()

Erro amostral, erro padrão.

li <- read.csv("todas_infos.csv")

names(li)

##  [1] "user"           "ecletic"        "clusters_total" "clusters"      
##  [5] "media_pop"      "mediana_pop"    "dp_pop"         "news"          
##  [9] "news_total"     "old"            "old_total"      "count_sim"     
## [13] "tempo_sim"      "count_pop"      "tempo_pop"

# Considere que li é a população:
na.omit(li) %>% 
  select(news, old, media_pop) %>% 
  summarise(mean_news = mean(news), 
            mean_old = mean(old), 
            mean_pop = mean(media_pop))

##   mean_news mean_old mean_pop
## 1  32.51927 100.3394 5.476899

ggplot(li, aes(news)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Distribuição (da média) amostral:

# Média de uma amostra: 
sample(li$news, 1000) %>% mean()

## [1] 32.82

# Média de 200 amostras com n = 200
dist_original <- li$news
sample_size <- 200
num_samples <- 200

samples_means <- c()
for(i in seq(1, num_samples)){
  a_sample <- sample(dist_original, sample_size)
  samples_means[i] <- mean(a_sample)
}
ggplot(data.frame(samples_means), aes(samples_means)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

## Warning: position_stack requires constant width: output may be incorrect

É uma normal. E repare que a distribuição original era muito diferente disso. Esta é a distribuição das médias das amostras.

Intervalos de confiança

Temos do Teorema do limite central que a distribuição de amostragem é uma distribuição normal com a média da população e um desvio padrão de \(\sigma = s / \sqrt{n}\). (s é o desvio padrão da amostra, usado no lugar do desvio padrão da população, que não temos).

Usando isso para estimar intervalos com confiança

library("Rmisc", quietly = T)

## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

sample_cis <- data.frame(upper = c(), mean = c(), lower = c())
for(i in seq(1, num_samples)){
  a_sample <- sample(dist_original, sample_size)
  interval <- CI(a_sample, ci = 0.95)
  sample_cis <- rbind(sample_cis, data.frame(mean = interval["mean"], 
                                             lower = interval["lower"], 
                                             upper = interval["upper"]))
}

pop_mean <- mean(dist_original)

sample_cis <- sample_cis %>% mutate(contains_pop_mean = (upper >= pop_mean & lower <= pop_mean)) 

ggplot(sample_cis, aes(x = 1:nrow(sample_cis), y = mean, colour = contains_pop_mean)) + 
  geom_point() + 
  geom_errorbar(aes(ymin = lower, ymax = upper)) + 
  geom_hline(aes(yintercept=mean(mean(dist_original)))) + 
  theme_bw()

um applet (!) legal: [http://www.stat.berkeley.edu/~stark/Java/Html/Ci.htm]

Lembre da analogia do cachorro amarrado na árvore.

Há dois casos especiais comuns

ICs para amostras < 30 –> Use t-student em lugar da distribuição normal.

ICs de propoção –> Erro padrão é \(\sqrt{(p (1-p)/n)}\)

# dist normal:
z = qnorm(1 - 0.05/2)
# dist t-student
degress_of_freedom = 50
z = qt(1 - 0.01/2, df = degress_of_freedom)

?t.test
?prop.test

Exercícios 2.38 e 2.40 do Open intro (p.121)

Sobre sobreposições de barras de erro

[http://scienceblogs.com/cognitivedaily/2008/07/31/most-researchers-dont-understa-1/]

E um truque importante

require("Hmisc")

## Loading required package: Hmisc
## Loading required package: grid
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

ggplot(gastos, aes(x = sgPartido, y = vlrDocumento)) + stat_summary(fun.y = mean, geom = "point")

ggplot(gastos, aes(x = sgPartido, y = vlrDocumento)) + 
  stat_summary(fun.y = mean, geom = "point") + 
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", colour = "blue", width = 0.2)

Código do Rmisc:CI

CI

## function (x, ci = 0.95) 
## {
##     a <- mean(x)
##     s <- sd(x)
##     n <- length(x)
##     error <- qt(ci + (1 - ci)/2, df = n - 1) * s/sqrt(n)
##     return(c(upper = a + error, mean = a, lower = a - error))
## }
## <environment: namespace:Rmisc>

Extra: ICs Via bootstrap

library(boot)

## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:survival':
## 
##     aml
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma

media = function(amostra, indexes){
  mean(amostra[indexes])
}

uma_amostra <- sample(dist_original, 200)
CI(uma_amostra)

##    upper     mean    lower 
## 36.52363 32.34000 28.15637

time.boot = boot(uma_amostra, media, 2000)
boot.ci(time.boot, type = "bca")

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = time.boot, type = "bca")
## 
## Intervals : 
## Level       BCa          
## 95%   (28.88, 37.28 )  
## Calculations and Intervals on Original Scale

time.boot = boot(uma_amostra, function(x, i) median(x[i]), 2000)
boot.ci(time.boot, type = "bca")

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = time.boot, type = "bca")
## 
## Intervals : 
## Level       BCa          
## 95%   (20.5, 25.0 )  
## Calculations and Intervals on Original Scale

library(simpleboot)

## Simple Bootstrap Routines (1.1-3 2008-04-30)

sboot <- one.boot(uma_amostra, median, 1000)
hist(sboot)

boot.ci(sboot, type = "bca")

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = sboot, type = "bca")
## 
## Intervals : 
## Level       BCa          
## 95%   (20.43, 25.50 )  
## Calculations and Intervals on Original Scale

# IC da mediana, sem bootstrap
wilcox.test(uma_amostra, conf.int = TRUE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  uma_amostra
## V = 20100, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  23.50000 29.00002
## sample estimates:
## (pseudo)median 
##       26.49997

Aula dist, tlc, ic

Nazareno Andrade e João Arthur B. Monteiro

26 de abril de 2015