Ejemplo práctico de experimentos en encuestas

rm(list=ls()) #Limpiar entorno de trabajo
#install.packages("dplyr") #Remover primer # para instalar la primera vez
library(dplyr) #paquete con funciones de procesamiento

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#install.packages("car") #Remover primer # para instalar la primera vez
library(car) #paquete con funciones de análisis estadístico

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

#install.packages("stargazer") #Remover primer # para instalar la primera vez
library(stargazer) #paquete con funciones de reporte en LaTeX

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

#install.packages("ggplot2") #Remover primer # para instalar la primera vez
library(ggplot2) #paquete con funciones para generar gráficos
#install.packages("effsize") #Remover primer # para instalar la primera vez
library(effsize) #paquete para calcular el tamaño del efecto
#install.packages("outliers") #Remover primer # para instalar la primera vez
library(outliers) #paquete para identificar y tratar valores atípicos

setwd("/home/daniel/Dropbox/U CENTRAL/02 2020/01 Primer semestre 2020/Métodos mixtos, experimentales y cuasiexperimentales/Clases/05") #Fijar carpeta de trabajo

Procesamiento de la base de datos.

Exportar base de datos.

data <- read.csv("data.csv", skip=2) #Exportar datos

Limpiar un poco los datos.

head(data) #ver primeros datos

##   Interview.number..ongoing. Serial.number..if.provided.
## 1                         NA                          NA
## 2                         31                          NA
## 3                         NA                          NA
## 4                         32                          NA
## 5                         NA                          NA
## 6                         34                          NA
##   Reference..if.provided.in.link.
## 1                              NA
## 2                              NA
## 3                              NA
## 4                              NA
## 5                              NA
## 6                              NA
##   Questionnaire.that.has.been.used.in.the.interview Interview.mode
## 1                                                                 
## 2                                              base      interview
## 3                                                                 
## 4                                              base      interview
## 5                                                                 
## 6                                              base      interview
##   Time.the.interview.has.started..Europe.Berlin. acepta genero edad...01.
## 1                                                    NA     NA         NA
## 2                            2020-04-29 19:53:35      1      2         42
## 3                                                    NA     NA         NA
## 4                            2020-04-29 19:53:36      1     NA         NA
## 5                                                    NA     NA         NA
## 6                            2020-04-29 19:54:17      1      2         37
##   region random1..Complete.clearances.of.the.ballot..yet random1..Code.drawn
## 1     NA                                              NA                  NA
## 2      4                                               0                   2
## 3     NA                                              NA                  NA
## 4     NA                                              NA                  NA
## 5     NA                                              NA                  NA
## 6     13                                               1                   2
##   sueldo...01. random2..Complete.clearances.of.the.ballot..yet
## 1                                                           NA
## 2       500000                                               0
## 3                                                           NA
## 4                                                           NA
## 5                                                           NA
## 6       650000                                               1
##   random2..Code.drawn contagiados...01. Time.spent.on.page.1
## 1                  NA                                     NA
## 2                   2           1300000                   22
## 3                  NA                                     NA
## 4                  NA                                     17
## 5                  NA                                     NA
## 6                   2           3000000                   26
##   Time.spent.on.page.2 Time.spent.on.page.3 Time.spent.on.page.4
## 1                   NA                   NA                   NA
## 2                   12                   27                   20
## 3                   NA                   NA                   NA
## 4                   NA                   NA                   NA
## 5                   NA                   NA                   NA
## 6                   26                    8                   24
##   Time.spent.on.page.5 Time.spent.on.page.6
## 1                   NA                   NA
## 2                   29                   11
## 3                   NA                   NA
## 4                   NA                   NA
## 5                   NA                   NA
## 6                    7                   86
##   Time.spent.overall..except.outliers.
## 1                                   NA
## 2                                  121
## 3                                   NA
## 4                                   17
## 5                                   NA
## 6                                  111
##   Time.when.the.invitation.mailing.was.sent..personally.identifiable.recipients..only.
## 1                                                                                   NA
## 2                                                                                   NA
## 3                                                                                   NA
## 4                                                                                   NA
## 5                                                                                   NA
## 6                                                                                   NA
##   Time.when.the.data.was.most.recently.updated
## 1                                             
## 2                          2020-04-29 19:55:36
## 3                                             
## 4                          2020-04-29 19:53:53
## 5                                             
## 6                          2020-04-29 19:57:14
##   Has.the.interview.been.finished..reached.last.page..
## 1                                                   NA
## 2                                                    1
## 3                                                   NA
## 4                                                    0
## 5                                                   NA
## 6                                                    1
##   Did.the.respondent.only.view.the.questionnaire..omitting.mandatory.questions.
## 1                                                                            NA
## 2                                                                             0
## 3                                                                            NA
## 4                                                                             0
## 5                                                                            NA
## 6                                                                             0
##   Last.page.that.the.participant.has.handled.in.the.questionnaire
## 1                                                              NA
## 2                                                               6
## 3                                                              NA
## 4                                                               1
## 5                                                              NA
## 6                                                               6
##   Hindmost.page.handled.by.the.participant Missing.answers.in.percent
## 1                                       NA                         NA
## 2                                        6                          0
## 3                                       NA                         NA
## 4                                        1                          0
## 5                                       NA                         NA
## 6                                        6                          0
##   Missing.answers..weighted.by.relevance.
## 1                                      NA
## 2                                       0
## 3                                      NA
## 4                                       0
## 5                                      NA
## 6                                       0
##   Degradation.points.for.being.very.fast
## 1                                     NA
## 2                                   1.06
## 3                                     NA
## 4                                   1.24
## 5                                     NA
## 6                                   1.09
##   Degradation.points.for.being.very.fast.1
## 1                                       NA
## 2                                       11
## 3                                       NA
## 4                                       12
## 5                                       NA
## 6                                       17

data <- data[!is.na(data$Interview.number..ongoing.),] #Eliminar filas sin información
names(data) #ver nombres de las variables

##  [1] "Interview.number..ongoing."                                                          
##  [2] "Serial.number..if.provided."                                                         
##  [3] "Reference..if.provided.in.link."                                                     
##  [4] "Questionnaire.that.has.been.used.in.the.interview"                                   
##  [5] "Interview.mode"                                                                      
##  [6] "Time.the.interview.has.started..Europe.Berlin."                                      
##  [7] "acepta"                                                                              
##  [8] "genero"                                                                              
##  [9] "edad...01."                                                                          
## [10] "region"                                                                              
## [11] "random1..Complete.clearances.of.the.ballot..yet"                                     
## [12] "random1..Code.drawn"                                                                 
## [13] "sueldo...01."                                                                        
## [14] "random2..Complete.clearances.of.the.ballot..yet"                                     
## [15] "random2..Code.drawn"                                                                 
## [16] "contagiados...01."                                                                   
## [17] "Time.spent.on.page.1"                                                                
## [18] "Time.spent.on.page.2"                                                                
## [19] "Time.spent.on.page.3"                                                                
## [20] "Time.spent.on.page.4"                                                                
## [21] "Time.spent.on.page.5"                                                                
## [22] "Time.spent.on.page.6"                                                                
## [23] "Time.spent.overall..except.outliers."                                                
## [24] "Time.when.the.invitation.mailing.was.sent..personally.identifiable.recipients..only."
## [25] "Time.when.the.data.was.most.recently.updated"                                        
## [26] "Has.the.interview.been.finished..reached.last.page.."                                
## [27] "Did.the.respondent.only.view.the.questionnaire..omitting.mandatory.questions."       
## [28] "Last.page.that.the.participant.has.handled.in.the.questionnaire"                     
## [29] "Hindmost.page.handled.by.the.participant"                                            
## [30] "Missing.answers.in.percent"                                                          
## [31] "Missing.answers..weighted.by.relevance."                                             
## [32] "Degradation.points.for.being.very.fast"                                              
## [33] "Degradation.points.for.being.very.fast.1"

data <- dplyr::rename(data, folio=Interview.number..ongoing.,
               fecha=Time.the.interview.has.started..Europe.Berlin., 
               edad=edad...01.,
               tratamiento1=random1..Code.drawn,
               sueldo=sueldo...01.,
               tratamiento2=random2..Code.drawn,
               contagios=contagiados...01.,
               tiempopagina1=Time.spent.on.page.1,
               tiempopagina2=Time.spent.on.page.2,
               tiempopagina3=Time.spent.on.page.3,
               tiempopagina4=Time.spent.on.page.4,
               tiempopagina5=Time.spent.on.page.5,
               tiempopagina6=Time.spent.on.page.6,
               tiempototal=Time.spent.overall..except.outliers.,
               ultimapagina=Last.page.that.the.participant.has.handled.in.the.questionnaire) #renombrar variables
#eliminar variables (columnas) que no se utilizarán
eliminar <- c(2,3,4,5,11,14,24:27,29:33)
data <- data[,-eliminar]
rm(eliminar)

Exportar etiquetas de valores.

valores <- read.csv("values.csv") #Exportar
valores <- valores[!is.na(valores$RESPONSE),] #Eliminar filas sin información

#acepta
data$acepta <- factor(data$acepta, levels = 1:2, labels = valores$MEANING[1:2])

#genero
data$genero[data$genero==-9] <- NA #Definir casos perdidos
data$genero <- factor(data$genero, levels= 1:4, labels = valores$MEANING[4:7])

#region
data$region[data$region==-9] <- NA #Definir casos perdidos
data$region <- factor(data$region, levels = 1:16, labels = valores$MEANING[9:24])

#tratamiento1
data$tratamiento1 <- factor(data$tratamiento1, levels = 1:2, 
                       labels = valores$MEANING[27:26])

#sueldo
data$sueldo <- as.numeric(data$sueldo)

#tratamiento2
data$tratamiento2 <- factor(data$tratamiento2, levels = 1:2, 
                       labels = valores$MEANING[29:28])

#contagios
data$contagios <- as.numeric(data$contagios)

#tiempos
data$tiempopagina1 <-as.numeric(data$tiempopagina1)
data$tiempopagina2 <-as.numeric(data$tiempopagina2)
data$tiempopagina3 <-as.numeric(data$tiempopagina3)
data$tiempopagina4 <-as.numeric(data$tiempopagina4)
data$tiempopagina5 <-as.numeric(data$tiempopagina5)
data$tiempopagina6 <-as.numeric(data$tiempopagina6)
data$tiempototal <- as.numeric(data$tiempototal)

#última página
data$ultimapagina <- as.numeric(data$ultimapagina)

Análisis del experimento 1

Primero, comprobamos que los casos fueron aleatoriamente asignados a los grupos tratamiento y control.

#genero
prop.table(table(data$genero,data$tratamiento1),1)

##                    
##                     Grupo tratado Grupo control
##   Masculino             0.5135135     0.4864865
##   Femenino              0.4400000     0.5600000
##   Otro                                         
##   Prefiero no decir     0.6666667     0.3333333

chisq.test(data$genero, data$tratamiento1, simulate.p.value = TRUE) #Chi-cuadrado? No hay dif. sig.

## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  data$genero and data$tratamiento1
## X-squared = 0.70514, df = NA, p-value = 0.7286

#edad
leveneTest(data$edad, data$tratamiento1, center = "median") #hay igualdad de varianzas en ambos grupos? Si

## Levene's Test for Homogeneity of Variance (center = "median")
##       Df F value Pr(>F)
## group  1  0.5968 0.4427
##       63

by(data$edad, data$tratamiento1, shapiro.test) #hay normalidad en ambos grupos? No

## data$tratamiento1: Grupo tratado
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.88823, p-value = 0.003149
## 
## ------------------------------------------------------------ 
## data$tratamiento1: Grupo control
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.91063, p-value = 0.01017

t.test(edad ~ tratamiento1, var.equal=TRUE, data) #prueba t muestras independientes? No hay dif. sig.

## 
##  Two Sample t-test
## 
## data:  edad by tratamiento1
## t = -1.2815, df = 63, p-value = 0.2047
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.414594   2.276336
## sample estimates:
## mean in group Grupo tratado mean in group Grupo control 
##                    34.71875                    38.78788

wilcox.test(data$edad[data$tratamiento1=='Grupo control'],
            data$edad[data$tratamiento1=='Grupo tratado'], 
            paired = FALSE) #test de Mann–Whitney–Wilcoxon?No hay dif. sig.

## Warning in wilcox.test.default(data$edad[data$tratamiento1 == "Grupo
## control"], : cannot compute exact p-value with ties

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data$edad[data$tratamiento1 == "Grupo control"] and data$edad[data$tratamiento1 == "Grupo tratado"]
## W = 627.5, p-value = 0.1931
## alternative hypothesis: true location shift is not equal to 0

#Gráfico
g1 <- data.frame(data$tratamiento1, data$edad)
g1 <- na.omit(g1)
G1 <- ggplot(g1, aes(x=data.tratamiento1, y=data.edad)) + 
  geom_boxplot(notch=FALSE) + 
  labs(x = "Tratamiento", y = "Edad", caption = "t = -1.28, df = 63, p = 0.21")
ggsave("G1.png")

## Saving 7 x 5 in image

G1

#region
prop.table(table(data$region,data$tratamiento1),1)

##                                                          
##                                                           Grupo tratado
##   I Regi?n de Tarapac?                                        0.3333333
##   II Regi?n de Antofagasta                                             
##   III Regi?n de Atacama                                                
##   IV Regi?n de Coquimbo                                       0.5217391
##   V Regi?n de Valpara?so                                      0.0000000
##   VI Regi?n del Libertador General Bernardo O\031 Higgins              
##   VII Regi?n del Maule                                                 
##   VIII Regi?n del Biob?o                                      1.0000000
##   IX Regi?n de La Araucan?a                                   0.0000000
##   X Regi?n de Los Lagos                                                
##   XI Regi?n Ays?n del General Carlos Ib??ez del Campo                  
##   XII Regi?n de Magallanes y Ant?rtica Chilena                         
##   XIII Regi?n Metropolitana de Santiago                       0.5666667
##   XIV Regi?n de Los R?os                                               
##   XV Regi?n de Arica y Parinacota                                      
##   XVI Regi?n de ?uble                                                  
##                                                          
##                                                           Grupo control
##   I Regi?n de Tarapac?                                        0.6666667
##   II Regi?n de Antofagasta                                             
##   III Regi?n de Atacama                                                
##   IV Regi?n de Coquimbo                                       0.4782609
##   V Regi?n de Valpara?so                                      1.0000000
##   VI Regi?n del Libertador General Bernardo O\031 Higgins              
##   VII Regi?n del Maule                                                 
##   VIII Regi?n del Biob?o                                      0.0000000
##   IX Regi?n de La Araucan?a                                   1.0000000
##   X Regi?n de Los Lagos                                                
##   XI Regi?n Ays?n del General Carlos Ib??ez del Campo                  
##   XII Regi?n de Magallanes y Ant?rtica Chilena                         
##   XIII Regi?n Metropolitana de Santiago                       0.4333333
##   XIV Regi?n de Los R?os                                               
##   XV Regi?n de Arica y Parinacota                                      
##   XVI Regi?n de ?uble

chisq.test(data$region, data$tratamiento1, simulate.p.value = TRUE) #Chi-cuadrado? No hay dif. sig.

## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  data$region and data$tratamiento1
## X-squared = 6.2435, df = NA, p-value = 0.2394

Resultados del experimento 1

Descriptivos de variable resultado.

summary(data$sueldo) #se decide sustituir 500 por 500.000

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     500  500000  700000  812139  900000 2000000       8

data$sueldo[data$sueldo==500] <- 500000

exp1 <- data[,7:8]
exp1$tratamiento1 <- as.numeric(exp1$tratamiento1)-1
names(exp1) <- c("Tratamiento", "Sueldo promedio sociólogo/a")
stargazer(exp1, digits = 2)

## 
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: mar, may 26, 2020 - 19:17:15
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccccccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Pctl(25)} & \multicolumn{1}{c}{Pctl(75)} & \multicolumn{1}{c}{Max} \\ 
## \hline \\[-1.8ex] 
## Tratamiento & 66 & 0.50 & 0.50 & 0.00 & 0.00 & 1.00 & 1.00 \\ 
## Sueldo promedio sociólogo/a & 61 & 820,327.90 & 421,809.50 & 350,000.00 & 500,000.00 & 900,000.00 & 2,000,000.00 \\ 
## \hline \\[-1.8ex] 
## \end{tabular} 
## \end{table}

Estimación del ATE.

exp1 <- data.frame(data$tratamiento1, data$sueldo)
exp1 <- na.omit(exp1)# sólo nos quedamos con los casos completos

T1 <- mean(exp1$data.sueldo[exp1$data.tratamiento1=="Grupo tratado"]) #promedio grupo tratado
T1

## [1] 994666.7

C1 <- mean(exp1$data.sueldo[exp1$data.tratamiento1=="Grupo control"]) #promedio grupo control
C1

## [1] 651612.9

ATE1 <- T1-C1 #ATE
ATE1

## [1] 343053.8

#tamaño del efecto
cohen.d(exp1$data.sueldo ~ exp1$data.tratamiento1) # cohen's d 0.88 (efecto grande)

## 
## Cohen's d
## 
## d estimate: 0.8842055 (large)
## 95 percent confidence interval:
##    lower    upper 
## 0.347283 1.421128

t.test(data.sueldo ~ data.tratamiento1, exp1) #hay diferencias significativas

## 
##  Welch Two Sample t-test
## 
## data:  data.sueldo by data.tratamiento1
## t = 3.4151, df = 40.392, p-value = 0.001464
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  140095.0 546012.6
## sample estimates:
## mean in group Grupo tratado mean in group Grupo control 
##                    994666.7                    651612.9

#Gráfico
g2 <- data.frame(data$tratamiento1, data$sueldo)
g2 <- na.omit(g2)
G2a <- ggplot(g2, aes(x=data.tratamiento1, y=data.sueldo)) + 
  geom_boxplot(notch=TRUE) + 
  labs(x = "Tratamiento", y = "Sueldo", caption = "t = 3.42, df = 40.39, p = 0.00")
ggsave("G2a.png")

## Saving 7 x 5 in image

G2a

 medias1 <- aggregate(g2$data.sueldo, list(tratamiento = g2$data.tratamiento1), mean)
G2b <- ggplot(g2, aes(x=data.sueldo, colour=data.tratamiento1)) +
    geom_density() +
    geom_vline(data=medias1, aes(xintercept=x,  colour=tratamiento),
               linetype="dashed", size=1) +
  labs(x = "Sueldo", 
       y = "Densidad", 
       caption = "t = 3.42, df = 40.39, p = 0.00", 
       color = NULL)
ggsave("G2b.png")

## Saving 7 x 5 in image

G2b

Análisis del experimento 2

Primero, comprobamos que los casos fueron aleatoriamente asignados a los grupos tratamiento y control.

#genero
prop.table(table(data$genero,data$tratamiento2),1)

##                    
##                     Grupo tratado Grupo control
##   Masculino             0.5588235     0.4411765
##   Femenino              0.3913043     0.6086957
##   Otro                                         
##   Prefiero no decir     0.6666667     0.3333333

chisq.test(data$genero, data$tratamiento2, simulate.p.value = TRUE) #Chi-cuadrado? No hay dif. sig.

## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  data$genero and data$tratamiento2
## X-squared = 1.8909, df = NA, p-value = 0.4463

#edad
leveneTest(data$edad, data$tratamiento2, center = "median") #hay igualdad de varianzas en ambos grupos? Si

## Levene's Test for Homogeneity of Variance (center = "median")
##       Df F value Pr(>F)
## group  1  0.0284 0.8667
##       58

by(data$edad, data$tratamiento2, shapiro.test) #hay normalidad en ambos grupos? No

## data$tratamiento2: Grupo tratado
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.91312, p-value = 0.01785
## 
## ------------------------------------------------------------ 
## data$tratamiento2: Grupo control
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.82219, p-value = 0.000171

t.test(edad ~ tratamiento2, var.equal=TRUE, data) #prueba t muestras independientes? No hay dif. sig.

## 
##  Two Sample t-test
## 
## data:  edad by tratamiento2
## t = 1.6818, df = 58, p-value = 0.09799
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.06544 12.26544
## sample estimates:
## mean in group Grupo tratado mean in group Grupo control 
##                    39.33333                    33.73333

wilcox.test(data$edad[data$tratamiento2=='Grupo control'],
            data$edad[data$tratamiento2=='Grupo tratado'], 
            paired = FALSE) #test de Mann–Whitney–Wilcoxon?No hay dif. sig.

## Warning in wilcox.test.default(data$edad[data$tratamiento2 == "Grupo
## control"], : cannot compute exact p-value with ties

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data$edad[data$tratamiento2 == "Grupo control"] and data$edad[data$tratamiento2 == "Grupo tratado"]
## W = 317, p-value = 0.04961
## alternative hypothesis: true location shift is not equal to 0

#Gráfico
g3 <- data.frame(data$tratamiento2, data$edad)
g3 <- na.omit(g3)
G3 <- ggplot(g3, aes(x=data.tratamiento2, y=data.edad)) + 
  geom_boxplot(notch=FALSE) + 
  labs(x = "Tratamiento", y = "Edad", caption = "t = 1.68, df = 6358, p = 0.10")
ggsave("G3.png")

## Saving 7 x 5 in image

G3

#region
prop.table(table(data$region,data$tratamiento2),1)

##                                                          
##                                                           Grupo tratado
##   I Regi?n de Tarapac?                                        0.5000000
##   II Regi?n de Antofagasta                                             
##   III Regi?n de Atacama                                                
##   IV Regi?n de Coquimbo                                       0.4500000
##   V Regi?n de Valpara?so                                      0.6666667
##   VI Regi?n del Libertador General Bernardo O\031 Higgins              
##   VII Regi?n del Maule                                                 
##   VIII Regi?n del Biob?o                                      0.0000000
##   IX Regi?n de La Araucan?a                                   1.0000000
##   X Regi?n de Los Lagos                                                
##   XI Regi?n Ays?n del General Carlos Ib??ez del Campo                  
##   XII Regi?n de Magallanes y Ant?rtica Chilena                         
##   XIII Regi?n Metropolitana de Santiago                       0.5172414
##   XIV Regi?n de Los R?os                                               
##   XV Regi?n de Arica y Parinacota                                      
##   XVI Regi?n de ?uble                                                  
##                                                          
##                                                           Grupo control
##   I Regi?n de Tarapac?                                        0.5000000
##   II Regi?n de Antofagasta                                             
##   III Regi?n de Atacama                                                
##   IV Regi?n de Coquimbo                                       0.5500000
##   V Regi?n de Valpara?so                                      0.3333333
##   VI Regi?n del Libertador General Bernardo O\031 Higgins              
##   VII Regi?n del Maule                                                 
##   VIII Regi?n del Biob?o                                      1.0000000
##   IX Regi?n de La Araucan?a                                   0.0000000
##   X Regi?n de Los Lagos                                                
##   XI Regi?n Ays?n del General Carlos Ib??ez del Campo                  
##   XII Regi?n de Magallanes y Ant?rtica Chilena                         
##   XIII Regi?n Metropolitana de Santiago                       0.4827586
##   XIV Regi?n de Los R?os                                               
##   XV Regi?n de Arica y Parinacota                                      
##   XVI Regi?n de ?uble

chisq.test(data$region, data$tratamiento2, simulate.p.value = TRUE) #Chi-cuadrado? No hay dif. sig.

## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  data$region and data$tratamiento2
## X-squared = 2.5678, df = NA, p-value = 0.9565

Resultados del experimento 2

Descriptivos de variable resultado.

exp2 <- data[,9:10]
exp2 <- na.omit(exp2) # sólo nos quedamos con los casos completos
boxplot(exp2$contagios) #hay outliers

#procedimiento para identificar y remover valores atípicos (truncar)
outlier_scores <- scores(exp2$contagios)
is_outlier <- outlier_scores > 3 | outlier_scores < -3
exp2$is_outlier <- is_outlier
exp2 <- exp2[exp2$is_outlier == F, ]
exp2$is_outlier <- NULL
boxplot(exp2$contagios) #outliers removidos

exp2$tratamiento2 <- as.numeric(exp2$tratamiento2)-1
names(exp2) <- c("Tratamiento", "Contagios")
stargazer(exp2, digits = 2)

## 
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: mar, may 26, 2020 - 19:17:16
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccccccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Pctl(25)} & \multicolumn{1}{c}{Pctl(75)} & \multicolumn{1}{c}{Max} \\ 
## \hline \\[-1.8ex] 
## Tratamiento & 54 & 0.48 & 0.50 & 0 & 0 & 1 & 1 \\ 
## Contagios & 54 & 22,976,667.00 & 56,584,731.00 & 20,000 & 1,000,000 & 10,000,000 & 300,000,000 \\ 
## \hline \\[-1.8ex] 
## \end{tabular} 
## \end{table}

Estimación del ATE.

exp2 <- data.frame(data$tratamiento2, data$contagios)
exp2 <- na.omit(exp2)# sólo nos quedamos con los casos completos
outlier_scores <- scores(exp2$data.contagios)
is_outlier <- outlier_scores > 3 | outlier_scores < -3
exp2$is_outlier <- is_outlier
exp2 <- exp2[exp2$is_outlier == F, ]
exp2$is_outlier <- NULL

T2 <- mean(exp2$data.contagios[exp2$data.tratamiento2=="Grupo tratado"]) #promedio grupo tratado
T2

## [1] 27427500

C2 <- mean(exp2$data.contagios[exp2$data.tratamiento2=="Grupo control"]) #promedio grupo control
C2

## [1] 18183462

ATE2<- T2-C2 #ATE
ATE2

## [1] 9244038

#tamaño del efecto
cohen.d(exp2$data.contagios ~ exp2$data.tratamiento2) # cohen's d 0.16 (efecto trivial)

## 
## Cohen's d
## 
## d estimate: 0.1623699 (negligible)
## 95 percent confidence interval:
##      lower      upper 
## -0.3850437  0.7097835

t.test(data.contagios ~ data.tratamiento2, exp2) #no hay diferencias significativas

## 
##  Welch Two Sample t-test
## 
## data:  data.contagios by data.tratamiento2
## t = 0.59321, df = 49.866, p-value = 0.5557
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -22057527  40545604
## sample estimates:
## mean in group Grupo tratado mean in group Grupo control 
##                    27427500                    18183462

#Gráfico
g4 <- data.frame(data$tratamiento2, data$contagios)
g4 <- na.omit(g4)
outlier_scores <- scores(g4$data.contagios)
is_outlier <- outlier_scores > 3 | outlier_scores < -3
g4$is_outlier <- is_outlier
g4 <- g4[g4$is_outlier == F, ]
g4$is_outlier <- NULL

G4a <- ggplot(g4, aes(x=data.tratamiento2, y=data.contagios)) + 
  geom_boxplot(notch=TRUE) + 
  labs(x = "Tratamiento", y = "Contagios", caption = "t = 0.59, df = 48.87, p = 0.56")
ggsave("G4a.png")

## Saving 7 x 5 in image

G4a

 medias2 <- aggregate(g4$data.contagios, list(tratamiento = g4$data.tratamiento2), mean)
G4b <- ggplot(g4, aes(x=data.contagios, colour=data.tratamiento2)) +
    geom_density() +
    geom_vline(data=medias2, aes(xintercept=x,  colour=tratamiento),
               linetype="dashed", size=1) +
  labs(x = "Contagios", 
       y = "Densidad", 
       caption = "t = 0.59, df = 48.87, p = 0.56", 
       color = NULL)
ggsave("G4b.png")

## Saving 7 x 5 in image

G4b

Ejemplo práctico de experimentos en encuestas

Daniel Fredes

25/5/2020