Exemples contrastos

Una mitjana

# Distance meter: a sample of eight readings (the variable name suggests a
# pool length -- TODO confirm context and units).
mostrapiscina <- c(49.9958, 49.9971, 49.9978, 49.9992,
                   49.9995, 50.0001, 50.0004, 50.0012)
mean(mostrapiscina)
## [1] 49.99889
# Standard deviation of a single reading, treated as known.
sigma <- 0.002
# Standard error of the mean. The sample size is taken from the data rather
# than hard-coded, so the line stays correct if readings are added or removed.
(se <- sigma / sqrt(length(mostrapiscina)))
## [1] 0.0007071068
# Lower-tail p-value for H0: mu = 50, computed on the scale of the sample mean...
pnorm(mean(mostrapiscina), mean = 50, sd = se)
## [1] 0.05782323
# ...and, equivalently, from the standardized z statistic.
(z <- (mean(mostrapiscina) - 50) / se)
## [1] -1.573313
pnorm(z)
## [1] 0.05782323

Una proporció

# Read the survey microdata (semicolon-delimited, declared as UTF-8).
dades <- read.csv(
  file = "~/DADES/pere/upf/estadistica_politiques/seminari3/Microdades anonimitzades -985.csv",
  sep = ";",
  encoding = "UTF-8"
)

# Frequency table of column P39.
table(dades$P39)
## 
##          Altres partits                     C's Catalunya en Comú Podem 
##                      34                      46                      95 
##                     CUP                En blanc                     ERC 
##                      93                      56                     419 
##                ICV-EUiA     Junts per Catalunya             No contesta 
##                       7                     162                      92 
##               No ho sap         No podria votar              No votaria 
##                     454                       4                     185 
##                     Nul                   PACMA                  PDeCAT 
##                      13                      15                      21 
##                 Podemos                     PPC                     PSC 
##                       9                      43                     218 
##                     Vox 
##                      34
# Counts copied from the P39 table above:
# CUP (93) + ERC (419) + Junts per Catalunya (162) + PDeCAT (21).
indepe <- 93 + 419 + 162 + 21
# Denominator: all 2000 answers minus En blanc (56), Nul (13), No contesta (92),
# No ho sap (454), No podria votar (4) and No votaria (185).
total <- 2000 - 56 - 13 - 92 - 454 - 4 - 185
(p <- indepe / total)
## [1] 0.5811037
# Standard error estimated from the sample proportion.
se <- sqrt(p * (1 - p) / total)

# Was it significantly greater than 50%?
# Upper-tail p-value from the normal approximation.
pnorm(p, mean = 0.5, sd = se, lower.tail = FALSE)
## [1] 6.542843e-09
# Same hypothesis with prop.test; it applies a continuity correction (see the
# output), so its p-value differs slightly from the hand computation.
prop.test(indepe, total, p = 0.5, alternative = "greater")
## 
##  1-sample proportions test with continuity correction
## 
## data:  indepe out of total, null probability 0.5
## X-squared = 31.145, df = 1, p-value = 1.197e-08
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.5570599 1.0000000
## sample estimates:
##         p 
## 0.5811037
# Is it significantly different from the actual result?
# 51.2%
pnorm(p, mean = 0.512, sd = se, lower.tail = FALSE)
## [1] 6.36879e-07
# Two-sided p-value: double the tail beyond |z|, so the value is valid (and
# never exceeds 1) whichever side of 0.512 the estimate falls on.
2 * pnorm(abs(p - 0.512) / se, lower.tail = FALSE)
## [1] 1.273758e-06
prop.test(indepe, total, p = 0.512, alternative = "two.sided")
## 
##  1-sample proportions test with continuity correction
## 
## data:  indepe out of total, null probability 0.512
## X-squared = 22.583, df = 1, p-value = 2.013e-06
## alternative hypothesis: true p is not equal to 0.512
## 95 percent confidence interval:
##  0.5525054 0.6091749
## sample estimates:
##         p 
## 0.5811037

dues mitjanes, mostres independents

library(readxl)
# Load the ECP_2021 spreadsheet and give its 4th column a short name.
ECP_2021 <- read_excel("~/DADES/pere/upf/estadistica_politiques/seminari1/ECP_2021.xlsx")
names(ECP_2021)[4] <- "notacces"
# Mean of notacces within each Sexe group. The argument forwarded to mean() is
# spelled na.rm (the former "rm.na" was silently ignored); the formula method
# already drops incomplete rows by default, so the printed means are unchanged.
aggregate(notacces ~ Sexe, data = ECP_2021, FUN = mean, na.rm = TRUE)
##   Sexe notacces
## 1 Dona 10.65694
## 2 Home 10.49325
# Two-sample t-test of notacces between the two Sexe groups (two-sided).
t.test(notacces ~ Sexe, data = ECP_2021)
## 
##  Welch Two Sample t-test
## 
## data:  notacces by Sexe
## t = 0.67693, df = 124.9, p-value = 0.4997
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3148815  0.6422546
## sample estimates:
## mean in group Dona mean in group Home 
##           10.65694           10.49325
# Print the column names so columns can be picked by position below.
names(ECP_2021)
##  [1] "Sexe"                                                                                     
##  [2] "Grup"                                                                                     
##  [3] "On vas estudiar el batxillerat"                                                           
##  [4] "notacces"                                                                                 
##  [5] "On vius durant el curs?"                                                                  
##  [6] "Amb quin transport sols venir a la Uni quan no estem confinats i hi ha classe presencial?"
##  [7] "Quants minuts trigues de casa a la universitat?"                                          
##  [8] "Quina és la teva despesa total en un mes tipus? (euros)"                                  
##  [9] "Quina era el curs passat la teva despesa total en un mes tipus? (euros)"                  
## [10] "Quants euros vas gastar a les vacances d'estiu?"
# Column 7 is the minutes from home to the university (see the listing above).
names(ECP_2021)[7] <- "minuts"
# Mean of minuts within each Sexe group. The argument forwarded to mean() is
# spelled na.rm (the former "rm.na" was silently ignored); the formula method
# already drops incomplete rows by default, so the printed means are unchanged.
aggregate(minuts ~ Sexe, data = ECP_2021, FUN = mean, na.rm = TRUE)
##   Sexe   minuts
## 1 Dona 39.07031
## 2 Home 35.60448
# Two-sample t-test of minuts between the two Sexe groups (two-sided).
t.test(minuts ~ Sexe, data = ECP_2021)
## 
##  Welch Two Sample t-test
## 
## data:  minuts by Sexe
## t = 0.78562, df = 125.46, p-value = 0.4336
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.26492 12.19659
## sample estimates:
## mean in group Dona mean in group Home 
##           39.07031           35.60448
# Load the EES 2014 workspace; the statements below expect it to provide `ees`.
load(file = "~/DADES/pere/upf/estadistica_politiques/EES_2014.RData")
# Side-by-side boxplots of SALBRUTO for each SEXO code.
boxplot(SALBRUTO ~ SEXO, data = ees)

# Mean SALBRUTO within each SEXO code (the data use codes 1 and 6).
aggregate(SALBRUTO ~ SEXO, data = ees, FUN = mean)
##   SEXO SALBRUTO
## 1    1 27955.46
## 2    6 20530.37
# Two-sided two-sample t-test of the difference in means.
t.test(SALBRUTO ~ SEXO, data = ees)
## 
##  Welch Two Sample t-test
## 
## data:  SALBRUTO by SEXO
## t = 82.47, df = 206398, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  7248.626 7601.554
## sample estimates:
## mean in group 1 mean in group 6 
##        27955.46        20530.37
# One-sided version: H1 is that the mean of group 1 exceeds that of group 6.
t.test(SALBRUTO ~ SEXO, data = ees, alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  SALBRUTO by SEXO
## t = 82.47, df = 206398, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  7276.997      Inf
## sample estimates:
## mean in group 1 mean in group 6 
##        27955.46        20530.37

dues mitjanes, mostres aparellades

## Two means, paired samples
# stringsAsFactors = FALSE keeps the rating columns as character on every R
# version, so as.numeric() below parses the digits instead of returning factor
# level codes (FALSE only became the default in R 4.0.0).
dades <- read.csv("~/DADES/pere/upf/estadistica_politiques/seminari3/Microdades anonimitzades -985.csv", encoding="UTF-8", sep=";", stringsAsFactors = FALSE)
table(dades$P43B_M_ICETA)
## 
##                       0           1          10           2           3 
##         339         275          56          20         158         178 
##           4           5           6           7           8           9 
##         189         319         150         111          61          17 
## No contesta   No ho sap 
##          35          92
# Non-numeric answers (blank, "No contesta", "No ho sap") become NA, hence the
# coercion warning: 339 + 35 + 92 = 466 NAs.
mi <- as.numeric(dades$P43B_M_ICETA)
## Warning: NAs introduced by coercion
summary(mi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   2.000   4.000   3.765   5.000  10.000     466
# Same conversion for the second rating column.
si <- as.numeric(dades$P43B_S_ILLA)
## Warning: NAs introduced by coercion
summary(si)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   3.000   5.000   4.758   6.000  10.000     533
# Boxplots of both ratings, si first and mi second.
boxplot(si, mi)

# Paired one-sided t-test: H1 is that the mean of (si - mi) is greater than 0.
# Only respondents with both ratings present contribute to the test.
t.test(si, mi, paired = TRUE, alternative = "greater")
## 
##  Paired t-test
## 
## data:  si and mi
## t = 16.047, df = 1351, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.9166799       Inf
## sample estimates:
## mean of the differences 
##                 1.02145

dues proporcions

# Cross-tabulation of SEXO (rows) by CONTROL (columns), with row and column totals.
addmargins(table(ees$SEXO, ees$CONTROL))
##      
##            1      2    Sum
##   1    15407 104536 119943
##   6    17485  72008  89493
##   Sum  32892 176544 209436
# Row proportions (margin 1): share of each CONTROL value within each SEXO code.
proportions(table(ees$SEXO, ees$CONTROL),1)
##    
##             1         2
##   1 0.1284527 0.8715473
##   6 0.1953784 0.8046216
# Two-sample test of equal proportions. Given a two-column table, prop.test
# takes the first column as the "successes", so this compares the share of
# CONTROL == 1 between the two SEXO codes (prop 1 and prop 2 in the output).
prop.test(table(ees$SEXO, ees$CONTROL))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  table(ees$SEXO, ees$CONTROL)
## X-squared = 1733.5, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.07015007 -0.06370139
## sample estimates:
##    prop 1    prop 2 
## 0.1284527 0.1953784
# Same test from counts copied out of the table above: successes (CONTROL == 1)
# first, group sizes (row totals) second. The result is identical.
prop.test(c(15407, 17485), c(119943, 89493))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(15407, 17485) out of c(119943, 89493)
## X-squared = 1733.5, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.07015007 -0.06370139
## sample estimates:
##    prop 1    prop 2 
## 0.1284527 0.1953784
# Give column 6 (the usual means of transport to the university) a short name.
names(ECP_2021)[6] <- "transport"
# Counts of Sexe by transport, with totals.
with(ECP_2021, addmargins(table(Sexe, transport)))
##       transport
## Sexe   A peu Bicicleta Bicing Cotxe Cotxe + Transport públic Moto
##   Dona     5         5      0     0                        0    1
##   Home     7         6      1     1                        1    1
##   Sum     12        11      1     1                        1    2
##       transport
## Sexe   Transport públic Transport públic i bicicleta Sum
##   Dona               52                            1  64
##   Home               50                            0  67
##   Sum               102                            1 131
# Row proportions: distribution of transport within each Sexe group.
with(ECP_2021, proportions(table(Sexe, transport),1))
##       transport
## Sexe        A peu  Bicicleta     Bicing      Cotxe Cotxe + Transport públic
##   Dona 0.07812500 0.07812500 0.00000000 0.00000000               0.00000000
##   Home 0.10447761 0.08955224 0.01492537 0.01492537               0.01492537
##       transport
## Sexe         Moto Transport públic Transport públic i bicicleta
##   Dona 0.01562500       0.81250000                   0.01562500
##   Home 0.01492537       0.74626866                   0.00000000
# Test of equal shares of "Transport públic" users, from counts copied out of
# the table above: 52 of 64 (Dona) versus 50 of 67 (Home).
prop.test(c(52, 50), c(64, 67))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(52, 50) out of c(64, 67)
## X-squared = 0.49308, df = 1, p-value = 0.4826
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.09046739  0.22293007
## sample estimates:
##    prop 1    prop 2 
## 0.8125000 0.7462687
# Same test built directly from the data. The logical column sorts FALSE before
# TRUE and prop.test uses the first column as "successes", so the reported
# proportions are the shares NOT using "Transport públic" and the confidence
# interval is mirrored; the statistic and p-value are the same.
with(ECP_2021, prop.test(table(Sexe, transport=="Transport públic")))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  table(Sexe, transport == "Transport públic")
## X-squared = 0.49308, df = 1, p-value = 0.4826
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.22293007  0.09046739
## sample estimates:
##    prop 1    prop 2 
## 0.1875000 0.2537313

independència

# Means of transport and type of school where the batxillerat was studied
# Short names for columns 6 and 3 (the first rename repeats an earlier one and
# has no further effect if it was already applied).
names(ECP_2021)[6] <- "transport"
names(ECP_2021)[3] <- "centre.batx"
# Contingency table: transport (rows) by centre.batx (columns).
table(ECP_2021$transport, ECP_2021$centre.batx)
##                               
##                                Centre concertat Centre privat Centre públic
##   A peu                                       1             1            10
##   Bicicleta                                   1             4             6
##   Bicing                                      1             0             0
##   Cotxe                                       0             1             0
##   Cotxe + Transport públic                    0             0             1
##   Moto                                        1             0             1
##   Transport públic                           28            24            50
##   Transport públic i bicicleta                0             0             1
# Mosaic plot of the same table.
mosaicplot(table(ECP_2021$transport, ECP_2021$centre.batx))

# Chi-squared test of independence. R warns that the approximation may be
# incorrect here (see the output): most cells hold very small counts.
# NOTE(review): an exact or simulated p-value (fisher.test, or
# simulate.p.value = TRUE) may be more appropriate -- confirm with the author.
chisq.test(table(ECP_2021$transport, ECP_2021$centre.batx))
## Warning in chisq.test(table(ECP_2021$transport, ECP_2021$centre.batx)): Chi-
## squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(ECP_2021$transport, ECP_2021$centre.batx)
## X-squared = 16.375, df = 14, p-value = 0.291
# Contingency table: Sexe (rows) by centre.batx (columns).
table(ECP_2021$Sexe, ECP_2021$centre.batx)
##       
##        Centre concertat Centre privat Centre públic
##   Dona               14            15            35
##   Home               18            15            34
# Mosaic plot of the same table.
mosaicplot(table(ECP_2021$Sexe, ECP_2021$centre.batx))

# Chi-squared test of independence between Sexe and centre.batx.
chisq.test(table(ECP_2021$Sexe, ECP_2021$centre.batx))
## 
##  Pearson's Chi-squared test
## 
## data:  table(ECP_2021$Sexe, ECP_2021$centre.batx)
## X-squared = 0.44602, df = 2, p-value = 0.8001
# Sex and means of transport
# Repeats the earlier rename of column 6; no further effect if already applied.
names(ECP_2021)[6] <- "transport"
# Contingency table: Sexe (rows) by transport (columns).
table(ECP_2021$Sexe, ECP_2021$transport)
##       
##        A peu Bicicleta Bicing Cotxe Cotxe + Transport públic Moto
##   Dona     5         5      0     0                        0    1
##   Home     7         6      1     1                        1    1
##       
##        Transport públic Transport públic i bicicleta
##   Dona               52                            1
##   Home               50                            0
# Mosaic plots of the table in both orientations.
mosaicplot(table(ECP_2021$Sexe, ECP_2021$transport))

mosaicplot(table(ECP_2021$transport, ECP_2021$Sexe))

# Row proportions: split between Dona and Home within each means of transport.
proportions(table(ECP_2021$transport, ECP_2021$Sexe),1)
##                               
##                                     Dona      Home
##   A peu                        0.4166667 0.5833333
##   Bicicleta                    0.4545455 0.5454545
##   Bicing                       0.0000000 1.0000000
##   Cotxe                        0.0000000 1.0000000
##   Cotxe + Transport públic     0.0000000 1.0000000
##   Moto                         0.5000000 0.5000000
##   Transport públic             0.5098039 0.4901961
##   Transport públic i bicicleta 1.0000000 0.0000000
# Chi-squared test of independence. R warns that the approximation may be
# incorrect (see the output): several categories have only one or two cases.
chisq.test(table(ECP_2021$transport, ECP_2021$Sexe))
## Warning in chisq.test(table(ECP_2021$transport, ECP_2021$Sexe)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(ECP_2021$transport, ECP_2021$Sexe)
## X-squared = 4.3971, df = 7, p-value = 0.7331
# EES (sex and sector)
# Contingency table: SEXO (rows) by CNACE (columns).
table(ees$SEXO, ees$CNACE)
##    
##        B0    C1    C2    C3    C4    C5    C6    C7    C8    D0    E0    F0
##   1  1613  6145  2848  1389  5826  2266  4584  4688  8431  1495  5000 11146
##   6   236  4459   693   529  2700   412   661  1310  2376   361  1200  1754
##    
##        G1    G2    H1    H2    I0    J0    K0    L0    M0    N0    O0    P0
##   1  4822  3577  4315  3454  2924  6996  4326   778  7394  8653  4762  2526
##   6  2090  7204  1119  2004  4147  4406  4767   893  8257  9937  4765  4443
##    
##        Q0    R0    S0
##   1  4215  3335  2435
##   6 12985  2802  2983
# Mosaic plots of the table in both orientations.
mosaicplot(table(ees$SEXO, ees$CNACE))

mosaicplot(table(ees$CNACE, ees$SEXO))

# Chi-squared test of independence between CNACE and SEXO.
chisq.test(table(ees$CNACE, ees$SEXO))
## 
##  Pearson's Chi-squared test
## 
## data:  table(ees$CNACE, ees$SEXO)
## X-squared = 30355, df = 26, p-value < 2.2e-16

pendent

# Short names for the three spending columns (8 to 10): monthly spending this
# year, monthly spending last year, and spending on the summer holidays.
names(ECP_2021)[8] <- "enguany"
names(ECP_2021)[9] <- "passat"
names(ECP_2021)[10] <- "vacances"
# Keep only those three columns.
despeses <- ECP_2021[, 8:10]
summary(despeses)
##     enguany           passat          vacances     
##  Min.   :   3.0   Min.   :  22.5   Min.   :   0.0  
##  1st Qu.:  40.0   1st Qu.:  60.0   1st Qu.: 200.0  
##  Median : 100.0   Median : 120.0   Median : 300.0  
##  Mean   : 230.4   Mean   : 340.2   Mean   : 425.6  
##  3rd Qu.: 400.0   3rd Qu.: 405.0   3rd Qu.: 500.0  
##  Max.   :1500.0   Max.   :8540.0   Max.   :3000.0  
##  NA's   :10       NA's   :11       NA's   :17
# Drop row 39 by position. Comparing the two summaries, that row held the
# extreme values: the maximum of passat falls from 8540 to 1500 and the
# maximum of vacances from 3000 to 1500.
# NOTE(review): the index is hard-coded and depends on the row order of the file.
despeses <- despeses[-39,]
summary(despeses)
##     enguany            passat          vacances     
##  Min.   :   3.00   Min.   :  22.5   Min.   :   0.0  
##  1st Qu.:  40.00   1st Qu.:  60.0   1st Qu.: 200.0  
##  Median :  93.75   Median : 120.0   Median : 300.0  
##  Mean   : 226.40   Mean   : 271.3   Mean   : 402.9  
##  3rd Qu.: 400.00   3rd Qu.: 400.0   3rd Qu.: 500.0  
##  Max.   :1500.00   Max.   :1500.0   Max.   :1500.0  
##  NA's   :10        NA's   :11       NA's   :17
# Scatterplot matrix of the three spending variables.
pairs(despeses)

# Simple linear regression of this year's spending on last year's, with the
# fitted line drawn in red over the scatterplot.
plot(enguany~passat, data=despeses)
model <- lm(enguany~passat, data=despeses)
abline(model, col="red")

# Coefficient table, residual summary and R-squared for the fit.
summary(model)
## 
## Call:
## lm(formula = enguany ~ passat, data = despeses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -477.45  -25.77    1.99   33.78  279.12 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.64898   12.47841  -0.373     0.71    
## passat       0.83684    0.03094  27.048   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 100.7 on 117 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.8621, Adjusted R-squared:  0.8609 
## F-statistic: 731.6 on 1 and 117 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the intercept and slope of enguany ~ passat.
confint(model)
##                   2.5 %     97.5 %
## (Intercept) -29.3618217 20.0638706
## passat        0.7755666  0.8981124
# Second regression: holiday spending on this year's monthly spending.
# `model` is overwritten, so the previous fit is no longer available after this.
plot(vacances~enguany, data=despeses)
model <- lm(vacances~enguany, data=despeses)
abline(model, col="red")

# Coefficient table, residual summary and R-squared for the new fit.
summary(model)
## 
## Call:
## lm(formula = vacances ~ enguany, data = despeses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -506.38 -204.48  -61.43   99.52 1143.21 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 347.5282    40.6132   8.557 8.65e-14 ***
## enguany       0.2648     0.1263   2.097   0.0384 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 319.4 on 108 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.03911,    Adjusted R-squared:  0.03021 
## F-statistic: 4.396 on 1 and 108 DF,  p-value: 0.03837
# 95% confidence intervals for the intercept and slope of vacances ~ enguany
# (`model` now holds the second fit).
confint(model)
##                    2.5 %      97.5 %
## (Intercept) 267.02571343 428.0307172
## enguany       0.01444301   0.5150604