Exemples de contrastos
Una mitjana
# Distance meter: repeated readings of a pool whose nominal length is 50
# (presumably metres -- TODO confirm the unit).
mostrapiscina <- c(49.9958, 49.9971, 49.9978, 49.9992,
                   49.9995, 50.0001, 50.0004, 50.0012)
mean(mostrapiscina)
## [1] 49.99889
# Standard deviation of a single reading, treated as known (z-test).
sigma <- .002
# Standard error of the sample mean. The sample size is taken from the data
# so the computation stays correct if readings are added or removed.
(se <- sigma/sqrt(length(mostrapiscina)))
## [1] 0.0007071068
# Lower-tail p-value: probability of a sample mean at most this small when
# the true mean is 50.
pnorm(mean(mostrapiscina), 50, se)
## [1] 0.05782323
# Same test through the standardised statistic.
(z <- (mean(mostrapiscina)-50)/se)
## [1] -1.573313
pnorm(z)
## [1] 0.05782323
Una proporció
# Read the survey microdata: a semicolon-separated file whose strings are
# flagged as UTF-8.
dades <- read.csv(
  file = "~/DADES/pere/upf/estadistica_politiques/seminari3/Microdades anonimitzades -985.csv",
  sep = ";",
  encoding = "UTF-8"
)
# Frequency table of the answers to question P39.
table(dades$P39)
##
## Altres partits C's Catalunya en Comú Podem
## 34 46 95
## CUP En blanc ERC
## 93 56 419
## ICV-EUiA Junts per Catalunya No contesta
## 7 162 92
## No ho sap No podria votar No votaria
## 454 4 185
## Nul PACMA PDeCAT
## 13 15 21
## Podemos PPC PSC
## 9 43 218
## Vox
## 34
# Counts copied from the P39 table: CUP, ERC, Junts per Catalunya and PDeCAT
# (presumably the pro-independence parties -- the grouping is the author's).
indepe <- sum(93, 419, 162, 21)
# Valid answers: the 2000 respondents minus En blanc, Nul, No contesta,
# No ho sap, No podria votar and No votaria.
total <- 2000 - sum(56, 13, 92, 454, 4, 185)
p <- indepe / total
p
## [1] 0.5811037
# Standard error built from the sample proportion.
# NOTE(review): prop.test() below uses the null proportion (plus a continuity
# correction), which is why its p-value differs slightly from the pnorm() one.
se <- sqrt(p * (1 - p) / total)
# Was it significantly greater than 50%?
pnorm(q = p, mean = 0.5, sd = se, lower.tail = FALSE)
## [1] 6.542843e-09
prop.test(x = indepe, n = total, p = 0.5, alternative = "greater")
##
## 1-sample proportions test with continuity correction
##
## data: indepe out of total, null probability 0.5
## X-squared = 31.145, df = 1, p-value = 1.197e-08
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.5570599 1.0000000
## sample estimates:
## p
## 0.5811037
# Is the estimate significantly different from the actual result (51.2%)?
# Upper-tail probability under a null proportion of 0.512.
pnorm(q = p, mean = 0.512, sd = se, lower.tail = FALSE)
## [1] 6.36879e-07
# Two-sided p-value: twice the upper-tail probability.
2 * pnorm(q = p, mean = 0.512, sd = se, lower.tail = FALSE)
## [1] 1.273758e-06
prop.test(x = indepe, n = total, p = 0.512, alternative = "two.sided")
##
## 1-sample proportions test with continuity correction
##
## data: indepe out of total, null probability 0.512
## X-squared = 22.583, df = 1, p-value = 2.013e-06
## alternative hypothesis: true p is not equal to 0.512
## 95 percent confidence interval:
## 0.5525054 0.6091749
## sample estimates:
## p
## 0.5811037
dues mitjanes, mostres independents
library(readxl)
# Read the 2021 class survey from its Excel workbook.
ECP_2021 <- read_excel("~/DADES/pere/upf/estadistica_politiques/seminari1/ECP_2021.xlsx")
# Give the fourth column a short name so it can be used in formulas.
names(ECP_2021)[4] <- "notacces"
# Mean of notacces within each Sexe group. The extra argument is forwarded to
# mean(), which only understands `na.rm`; a misspelling such as `rm.na` is
# absorbed by `...` and silently ignored. (The formula interface of
# aggregate() also drops incomplete rows by default via na.action.)
aggregate(notacces ~ Sexe, data = ECP_2021, FUN = mean, na.rm = TRUE)
## Sexe notacces
## 1 Dona 10.65694
## 2 Home 10.49325
# Two-sample t-test of notacces between the two Sexe groups (Welch version,
# i.e. variances are not assumed equal).
t.test(notacces ~ Sexe, data = ECP_2021)
##
## Welch Two Sample t-test
##
## data: notacces by Sexe
## t = 0.67693, df = 124.9, p-value = 0.4997
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3148815 0.6422546
## sample estimates:
## mean in group Dona mean in group Home
## 10.65694 10.49325
# Print the column names to look up the position of each survey question.
names(ECP_2021)
## [1] "Sexe"
## [2] "Grup"
## [3] "On vas estudiar el batxillerat"
## [4] "notacces"
## [5] "On vius durant el curs?"
## [6] "Amb quin transport sols venir a la Uni quan no estem confinats i hi ha classe presencial?"
## [7] "Quants minuts trigues de casa a la universitat?"
## [8] "Quina és la teva despesa total en un mes tipus? (euros)"
## [9] "Quina era el curs passat la teva despesa total en un mes tipus? (euros)"
## [10] "Quants euros vas gastar a les vacances d'estiu?"
# Short name for column 7 (minutes from home to university).
names(ECP_2021)[7] <- "minuts"
# Mean travel time within each Sexe group. `na.rm` is the argument mean()
# understands; a misspelling such as `rm.na` is absorbed by `...` and
# silently ignored.
aggregate(minuts ~ Sexe, data = ECP_2021, FUN = mean, na.rm = TRUE)
## Sexe minuts
## 1 Dona 39.07031
## 2 Home 35.60448
# Two-sample (Welch) t-test of minuts between the two Sexe groups.
t.test(minuts ~ Sexe, data = ECP_2021)
##
## Welch Two Sample t-test
##
## data: minuts by Sexe
## t = 0.78562, df = 125.46, p-value = 0.4336
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.26492 12.19659
## sample estimates:
## mean in group Dona mean in group Home
## 39.07031 35.60448
# Restore the saved workspace (presumably the one that defines `ees`, which is
# used from here on).
load(file = "~/DADES/pere/upf/estadistica_politiques/EES_2014.RData")
# Distribution of SALBRUTO within each SEXO group.
boxplot(SALBRUTO ~ SEXO, data = ees)

# Group means of SALBRUTO by SEXO.
aggregate(SALBRUTO ~ SEXO, data = ees, FUN = mean)
## SEXO SALBRUTO
## 1 1 27955.46
## 2 6 20530.37
# Two-sided Welch t-test for the difference between the two group means.
t.test(SALBRUTO ~ SEXO, data = ees)
##
## Welch Two Sample t-test
##
## data: SALBRUTO by SEXO
## t = 82.47, df = 206398, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7248.626 7601.554
## sample estimates:
## mean in group 1 mean in group 6
## 27955.46 20530.37
# One-sided version: the alternative is that the mean of the first group
# (SEXO == 1) exceeds the mean of the second (SEXO == 6).
t.test(SALBRUTO ~ SEXO, data = ees, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: SALBRUTO by SEXO
## t = 82.47, df = 206398, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 7276.997 Inf
## sample estimates:
## mean in group 1 mean in group 6
## 27955.46 20530.37
dues mitjanes, mostres aparellades
## two means, paired samples
# Read the survey microdata again (semicolon-separated, strings flagged UTF-8).
dades <- read.csv(
  file = "~/DADES/pere/upf/estadistica_politiques/seminari3/Microdades anonimitzades -985.csv",
  sep = ";",
  encoding = "UTF-8"
)
# Raw answers to P43B_M_ICETA, including the non-numeric categories.
table(dades$P43B_M_ICETA)
##
## 0 1 10 2 3
## 339 275 56 20 158 178
## 4 5 6 7 8 9
## 189 319 150 111 61 17
## No contesta No ho sap
## 35 92
# Convert the P43B_M_ICETA answers to numbers; answers that are not numbers
# (e.g. "No contesta", "No ho sap") become NA, which triggers the warning.
mi <- as.numeric( dades$P43B_M_ICETA )
## Warning: NAs introduced by coercion
summary( mi )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 4.000 3.765 5.000 10.000 466
# Same conversion for the P43B_S_ILLA answers.
si <- as.numeric( dades$P43B_S_ILLA )
## Warning: NAs introduced by coercion
summary( si )
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 3.000 5.000 4.758 6.000 10.000 533
# Side-by-side boxplots of the two sets of scores.
boxplot(si, mi)

# Paired t-test (each respondent scores both), one-sided: the alternative is
# that the mean of si - mi is greater than 0.
t.test(x = si, y = mi, paired = TRUE, alternative = "greater")
##
## Paired t-test
##
## data: si and mi
## t = 16.047, df = 1351, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.9166799 Inf
## sample estimates:
## mean of the differences
## 1.02145
dues proporcions
# Cross-tabulation of SEXO (rows) by CONTROL (columns) with row/column totals.
addmargins(table(ees$SEXO, ees$CONTROL))
##
## 1 2 Sum
## 1 15407 104536 119943
## 6 17485 72008 89493
## Sum 32892 176544 209436
# Row proportions (margin 1): share of each CONTROL value within each SEXO.
proportions(table(ees$SEXO, ees$CONTROL),1)
##
## 1 2
## 1 0.1284527 0.8715473
## 6 0.1953784 0.8046216
# Two-sample test of equal proportions. Given a two-column table, the first
# column (CONTROL == 1) supplies the success counts for each row.
prop.test(table(ees$SEXO, ees$CONTROL))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: table(ees$SEXO, ees$CONTROL)
## X-squared = 1733.5, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.07015007 -0.06370139
## sample estimates:
## prop 1 prop 2
## 0.1284527 0.1953784
# Same test from raw counts: successes (CONTROL == 1) and group sizes, both
# copied from the table with margins.
prop.test(c(15407, 17485), c(119943, 89493))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(15407, 17485) out of c(119943, 89493)
## X-squared = 1733.5, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.07015007 -0.06370139
## sample estimates:
## prop 1 prop 2
## 0.1284527 0.1953784
# Short name for column 6 (usual means of transport to university).
names(ECP_2021)[6] <- "transport"
# Counts of transport by Sexe, with row and column totals.
with(ECP_2021, addmargins(table(Sexe, transport)))
## transport
## Sexe A peu Bicicleta Bicing Cotxe Cotxe + Transport públic Moto
## Dona 5 5 0 0 0 1
## Home 7 6 1 1 1 1
## Sum 12 11 1 1 1 2
## transport
## Sexe Transport públic Transport públic i bicicleta Sum
## Dona 52 1 64
## Home 50 0 67
## Sum 102 1 131
# Row proportions (margin 1): distribution of transport within each Sexe.
with(ECP_2021, proportions(table(Sexe, transport),1))
## transport
## Sexe A peu Bicicleta Bicing Cotxe Cotxe + Transport públic
## Dona 0.07812500 0.07812500 0.00000000 0.00000000 0.00000000
## Home 0.10447761 0.08955224 0.01492537 0.01492537 0.01492537
## transport
## Sexe Moto Transport públic Transport públic i bicicleta
## Dona 0.01562500 0.81250000 0.01562500
## Home 0.01492537 0.74626866 0.00000000
# Public-transport users out of each group total: 52 of 64 (Dona) and
# 50 of 67 (Home), counts copied from the table with margins.
prop.test(c(52, 50), c(64, 67))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(52, 50) out of c(64, 67)
## X-squared = 0.49308, df = 1, p-value = 0.4826
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.09046739 0.22293007
## sample estimates:
## prop 1 prop 2
## 0.8125000 0.7462687
# Same test computed directly from the data. prop.test() takes the FIRST
# column of a two-column table as the successes, and table() orders a logical
# as FALSE, TRUE; the levels are therefore fixed with TRUE first so that the
# reported proportions are those of public-transport users, in agreement with
# the count-based call.
with(ECP_2021, prop.test(table(Sexe, factor(transport == "Transport públic", levels = c(TRUE, FALSE)))))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: table(Sexe, factor(transport == "Transport públic", levels = c(TRUE, FALSE)))
## X-squared = 0.49308, df = 1, p-value = 0.4826
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.09046739 0.22293007
## sample estimates:
## prop 1 prop 2
## 0.8125000 0.7462687
independència
# Means of transport vs. type of school where batxillerat was studied
# Short names for columns 6 and 3 so they can be referenced with `$`.
names(ECP_2021)[6] <- "transport"
names(ECP_2021)[3] <- "centre.batx"
# Contingency table: transport (rows) by school type (columns).
table(ECP_2021$transport, ECP_2021$centre.batx)
##
## Centre concertat Centre privat Centre públic
## A peu 1 1 10
## Bicicleta 1 4 6
## Bicing 1 0 0
## Cotxe 0 1 0
## Cotxe + Transport públic 0 0 1
## Moto 1 0 1
## Transport públic 28 24 50
## Transport públic i bicicleta 0 0 1
# Mosaic plot of the same table.
mosaicplot(table(ECP_2021$transport, ECP_2021$centre.batx))

# Pearson chi-squared test of independence. Several cells of the table are
# very sparse, which is why R warns that the approximation may be incorrect.
chisq.test(table(ECP_2021$transport, ECP_2021$centre.batx))
## Warning in chisq.test(table(ECP_2021$transport, ECP_2021$centre.batx)): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(ECP_2021$transport, ECP_2021$centre.batx)
## X-squared = 16.375, df = 14, p-value = 0.291
# Contingency table: Sexe (rows) by school type (columns).
table(ECP_2021$Sexe, ECP_2021$centre.batx)
##
## Centre concertat Centre privat Centre públic
## Dona 14 15 35
## Home 18 15 34
# Mosaic plot of the same table.
mosaicplot(table(ECP_2021$Sexe, ECP_2021$centre.batx))

# Pearson chi-squared test of independence between Sexe and school type.
chisq.test(table(ECP_2021$Sexe, ECP_2021$centre.batx))
##
## Pearson's Chi-squared test
##
## data: table(ECP_2021$Sexe, ECP_2021$centre.batx)
## X-squared = 0.44602, df = 2, p-value = 0.8001
# Sex and means of transport
# Short name for column 6 (repeated so this section can be run on its own).
names(ECP_2021)[6] <- "transport"
# Contingency table: Sexe (rows) by transport (columns).
table(ECP_2021$Sexe, ECP_2021$transport)
##
## A peu Bicicleta Bicing Cotxe Cotxe + Transport públic Moto
## Dona 5 5 0 0 0 1
## Home 7 6 1 1 1 1
##
## Transport públic Transport públic i bicicleta
## Dona 52 1
## Home 50 0
# Mosaic plots with the two variables in either order.
mosaicplot(table(ECP_2021$Sexe, ECP_2021$transport))

mosaicplot(table(ECP_2021$transport, ECP_2021$Sexe))

# Row proportions (margin 1): share of each Sexe within each transport mode.
proportions(table(ECP_2021$transport, ECP_2021$Sexe),1)
##
## Dona Home
## A peu 0.4166667 0.5833333
## Bicicleta 0.4545455 0.5454545
## Bicing 0.0000000 1.0000000
## Cotxe 0.0000000 1.0000000
## Cotxe + Transport públic 0.0000000 1.0000000
## Moto 0.5000000 0.5000000
## Transport públic 0.5098039 0.4901961
## Transport públic i bicicleta 1.0000000 0.0000000
# Pearson chi-squared test of independence between transport and Sexe.
chisq.test(table(ECP_2021$transport, ECP_2021$Sexe))
## Warning in chisq.test(table(ECP_2021$transport, ECP_2021$Sexe)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(ECP_2021$transport, ECP_2021$Sexe)
## X-squared = 4.3971, df = 7, p-value = 0.7331
# EES (sex and sector)
# Contingency table: SEXO (rows) by CNACE code (columns).
table(ees$SEXO, ees$CNACE)
##
## B0 C1 C2 C3 C4 C5 C6 C7 C8 D0 E0 F0
## 1 1613 6145 2848 1389 5826 2266 4584 4688 8431 1495 5000 11146
## 6 236 4459 693 529 2700 412 661 1310 2376 361 1200 1754
##
## G1 G2 H1 H2 I0 J0 K0 L0 M0 N0 O0 P0
## 1 4822 3577 4315 3454 2924 6996 4326 778 7394 8653 4762 2526
## 6 2090 7204 1119 2004 4147 4406 4767 893 8257 9937 4765 4443
##
## Q0 R0 S0
## 1 4215 3335 2435
## 6 12985 2802 2983
# Mosaic plots with the two variables in either order.
mosaicplot(table(ees$SEXO, ees$CNACE))

mosaicplot(table(ees$CNACE, ees$SEXO))

# Pearson chi-squared test of independence between CNACE and SEXO.
chisq.test(table(ees$CNACE, ees$SEXO))
##
## Pearson's Chi-squared test
##
## data: table(ees$CNACE, ees$SEXO)
## X-squared = 30355, df = 26, p-value < 2.2e-16
pendent
# Short names for the three spending columns (positions 8 to 10): monthly
# spending this year, monthly spending last year, and summer-holiday spending.
names(ECP_2021)[8:10] <- c("enguany", "passat", "vacances")
# Keep only those three variables, selected by their new names.
despeses <- ECP_2021[, c("enguany", "passat", "vacances")]
summary(despeses)
## enguany passat vacances
## Min. : 3.0 Min. : 22.5 Min. : 0.0
## 1st Qu.: 40.0 1st Qu.: 60.0 1st Qu.: 200.0
## Median : 100.0 Median : 120.0 Median : 300.0
## Mean : 230.4 Mean : 340.2 Mean : 425.6
## 3rd Qu.: 400.0 3rd Qu.: 405.0 3rd Qu.: 500.0
## Max. :1500.0 Max. :8540.0 Max. :3000.0
## NA's :10 NA's :11 NA's :17
# Drop row 39. Judging by the summaries, it holds the extreme values (the
# maxima of passat and vacances fall from 8540 and 3000 to 1500 afterwards).
despeses <- despeses[-39, ]
summary(despeses)
## enguany passat vacances
## Min. : 3.00 Min. : 22.5 Min. : 0.0
## 1st Qu.: 40.00 1st Qu.: 60.0 1st Qu.: 200.0
## Median : 93.75 Median : 120.0 Median : 300.0
## Mean : 226.40 Mean : 271.3 Mean : 402.9
## 3rd Qu.: 400.00 3rd Qu.: 400.0 3rd Qu.: 500.0
## Max. :1500.00 Max. :1500.0 Max. :1500.0
## NA's :10 NA's :11 NA's :17
# Scatterplot matrix of the three spending variables.
pairs(despeses)

# Simple linear regression of this year's spending on last year's; the
# scatterplot is drawn with the fitted line overlaid in red.
model <- lm(enguany ~ passat, data = despeses)
plot(enguany ~ passat, data = despeses)
abline(model, col = "red")

# Coefficients, standard errors and fit statistics.
summary(model)
##
## Call:
## lm(formula = enguany ~ passat, data = despeses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -477.45 -25.77 1.99 33.78 279.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.64898 12.47841 -0.373 0.71
## passat 0.83684 0.03094 27.048 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 100.7 on 117 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.8621, Adjusted R-squared: 0.8609
## F-statistic: 731.6 on 1 and 117 DF, p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the slope.
confint(model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -29.3618217 20.0638706
## passat 0.7755666 0.8981124
# Second regression: holiday spending on this year's monthly spending.
# `model` is reused, so the previous fit is overwritten.
model <- lm(vacances ~ enguany, data = despeses)
plot(vacances ~ enguany, data = despeses)
abline(model, col = "red")

# Coefficients, standard errors and fit statistics.
summary(model)
##
## Call:
## lm(formula = vacances ~ enguany, data = despeses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -506.38 -204.48 -61.43 99.52 1143.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 347.5282 40.6132 8.557 8.65e-14 ***
## enguany 0.2648 0.1263 2.097 0.0384 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 319.4 on 108 degrees of freedom
## (20 observations deleted due to missingness)
## Multiple R-squared: 0.03911, Adjusted R-squared: 0.03021
## F-statistic: 4.396 on 1 and 108 DF, p-value: 0.03837
# 95% confidence intervals for the coefficients of the model currently stored
# in `model`.
confint(model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 267.02571343 428.0307172
## enguany 0.01444301 0.5150604