library(ggplot2, quietly = T, warn.conflicts = F)
library(dplyr, quietly = T, warn.conflicts = F)
gastos <- read.csv("AnoAtual.csv")
divulgacoes <- gastos %>%
filter(txtDescricao == "DIVULGAÇÃO DA ATIVIDADE PARLAMENTAR.") %>%
select(vlrLiquido, sgUF)
ggplot(divulgacoes, aes(vlrLiquido)) +
geom_density() +
#geom_histogram(binwidth = 1000) +
theme_bw()
ggplot(filter(divulgacoes, sgUF %in% c("PB", "SP", "RS")), aes(vlrLiquido, colour = sgUF)) +
stat_ecdf()
ggplot(filter(divulgacoes, sgUF %in% c("PB", "SP", "RS")),
aes(vlrLiquido,fill = sgUF, colour = sgUF)) +
geom_density(alpha = 0.3) +
theme_bw()
Erro amostral, erro padrão.
li <- read.csv("todas_infos.csv")
names(li)
## [1] "user" "ecletic" "clusters_total" "clusters"
## [5] "media_pop" "mediana_pop" "dp_pop" "news"
## [9] "news_total" "old" "old_total" "count_sim"
## [13] "tempo_sim" "count_pop" "tempo_pop"
# Considere que li é a população:
na.omit(li) %>%
select(news, old, media_pop) %>%
summarise(mean_news = mean(news),
mean_old = mean(old),
mean_pop = mean(media_pop))
## mean_news mean_old mean_pop
## 1 32.51927 100.3394 5.476899
ggplot(li, aes(news)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Distribuição (da média) amostral:
# Média de uma amostra:
sample(li$news, 1000) %>% mean()
## [1] 32.82
# Média de 200 amostras com n = 200
dist_original <- li$news
sample_size <- 200
num_samples <- 200
samples_means <- c()
for(i in seq(1, num_samples)){
a_sample <- sample(dist_original, sample_size)
samples_means[i] <- mean(a_sample)
}
ggplot(data.frame(samples_means), aes(samples_means)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect
É uma normal. E repare que a distribuição original era muito diferente disso. Esta é a distribuição das médias das amostras.
Temos do Teorema do limite central que a distribuição de amostragem é uma distribuição normal com a média da população e um desvio padrão de \(\sigma = s / \sqrt{n}\). (s é o desvio padrão da amostra, usado no lugar do desvio padrão da população, que não temos).
library("Rmisc", quietly = T)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
sample_cis <- data.frame(upper = c(), mean = c(), lower = c())
for(i in seq(1, num_samples)){
a_sample <- sample(dist_original, sample_size)
interval <- CI(a_sample, ci = 0.95)
sample_cis <- rbind(sample_cis, data.frame(mean = interval["mean"],
lower = interval["lower"],
upper = interval["upper"]))
}
pop_mean <- mean(dist_original)
sample_cis <- sample_cis %>% mutate(contains_pop_mean = (upper >= pop_mean & lower <= pop_mean))
ggplot(sample_cis, aes(x = 1:nrow(sample_cis), y = mean, colour = contains_pop_mean)) +
geom_point() +
geom_errorbar(aes(ymin = lower, ymax = upper)) +
geom_hline(aes(yintercept=mean(mean(dist_original)))) +
theme_bw()
um applet (!) legal: [http://www.stat.berkeley.edu/~stark/Java/Html/Ci.htm]
Lembre da analogia do cachorro amarrado na árvore.
ICs para amostras < 30 –> Use t-student em lugar da distribuição normal.
ICs de propoção –> Erro padrão é \(\sqrt{(p (1-p)/n)}\)
# dist normal:
z = qnorm(1 - 0.05/2)
# dist t-student
degress_of_freedom = 50
z = qt(1 - 0.01/2, df = degress_of_freedom)
?t.test
?prop.test
Exercícios 2.38 e 2.40 do Open intro (p.121)
[http://scienceblogs.com/cognitivedaily/2008/07/31/most-researchers-dont-understa-1/]
require("Hmisc")
## Loading required package: Hmisc
## Loading required package: grid
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
##
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
ggplot(gastos, aes(x = sgPartido, y = vlrDocumento)) + stat_summary(fun.y = mean, geom = "point")
ggplot(gastos, aes(x = sgPartido, y = vlrDocumento)) +
stat_summary(fun.y = mean, geom = "point") +
stat_summary(fun.data = mean_cl_normal, geom = "errorbar", colour = "blue", width = 0.2)
CI
## function (x, ci = 0.95)
## {
## a <- mean(x)
## s <- sd(x)
## n <- length(x)
## error <- qt(ci + (1 - ci)/2, df = n - 1) * s/sqrt(n)
## return(c(upper = a + error, mean = a, lower = a - error))
## }
## <environment: namespace:Rmisc>
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:survival':
##
## aml
##
## The following object is masked from 'package:lattice':
##
## melanoma
media = function(amostra, indexes){
mean(amostra[indexes])
}
uma_amostra <- sample(dist_original, 200)
CI(uma_amostra)
## upper mean lower
## 36.52363 32.34000 28.15637
time.boot = boot(uma_amostra, media, 2000)
boot.ci(time.boot, type = "bca")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = time.boot, type = "bca")
##
## Intervals :
## Level BCa
## 95% (28.88, 37.28 )
## Calculations and Intervals on Original Scale
time.boot = boot(uma_amostra, function(x, i) median(x[i]), 2000)
boot.ci(time.boot, type = "bca")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = time.boot, type = "bca")
##
## Intervals :
## Level BCa
## 95% (20.5, 25.0 )
## Calculations and Intervals on Original Scale
library(simpleboot)
## Simple Bootstrap Routines (1.1-3 2008-04-30)
sboot <- one.boot(uma_amostra, median, 1000)
hist(sboot)
boot.ci(sboot, type = "bca")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = sboot, type = "bca")
##
## Intervals :
## Level BCa
## 95% (20.43, 25.50 )
## Calculations and Intervals on Original Scale
# IC da mediana, sem bootstrap
wilcox.test(uma_amostra, conf.int = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: uma_amostra
## V = 20100, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 23.50000 29.00002
## sample estimates:
## (pseudo)median
## 26.49997