library(rio)
linkGIT="https://github.com/JoseManuelMagallanes/Estadistica_Para_AnalisisPolitico/raw/master/hsb.sav"
data=import(linkGIT)
que tenemos?
str(data,strict.width="cut",width=50)
## 'data.frame': 600 obs. of 15 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "format.spss")= chr "F5.0"
## $ SEX : num 2 1 2 2 2 1 1 2 1 2 ...
## ..- attr(*, "format.spss")= chr "F5.0"
## $ RACE : num 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "format.spss")= chr "F5.0"
## $ SES : num 1 1 1 2 2 2 1 1 2 1 ...
## ..- attr(*, "format.spss")= chr "F5.0"
## $ SCTYP : num 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "format.spss")= chr "F5.0"
## $ HSP : num 3 2 2 3 3 2 1 1 1 1 ...
## ..- attr(*, "format.spss")= chr "F5.0"
## $ LOCUS : num 0.29 -0.42 0.71 0.06 0.22 0.46 0..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ CONCPT: num 0.88 0.03 0.03 0.03 -0.28 0.03 -..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ MOT : num 0.67 0.33 0.67 0 0 0 0.33 1 0.33..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ CAR : num 10 2 9 15 1 11 10 9 9 11 ...
## ..- attr(*, "format.spss")= chr "F5.0"
## $ RDG : num 33.6 46.9 41.6 38.9 36.3 49.5 62..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ WRTG : num 43.7 35.9 59.3 41.1 48.9 46.3 64..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ MATH : num 40.2 41.9 41.9 32.7 39.5 46.2 48..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ SCI : num 39 36.3 44.4 41.7 41.7 41.7 63.4..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ CIV : num 40.6 45.6 45.6 40.6 45.6 35.6 55..
## ..- attr(*, "format.spss")= chr "F5.2"
Lea la metadata,y de formato:
data$ID=as.character(data$ID)
data[,c(2,3,5,6,10)]=lapply(data[,c(2,3,5,6,10)],as.factor)
data$SES=as.ordered(data$SES)
Ahora:
str(data,strict.width="cut",width=50)
## 'data.frame': 600 obs. of 15 variables:
## $ ID : chr "1" "2" "3" "4" ...
## $ SEX : Factor w/ 2 levels "1","2": 2 1 2 2 2..
## $ RACE : Factor w/ 4 levels "1","2","3","4": 2..
## $ SES : Ord.factor w/ 3 levels "1"<"2"<"3": 1..
## $ SCTYP : Factor w/ 2 levels "1","2": 1 1 1 1 1..
## $ HSP : Factor w/ 3 levels "1","2","3": 3 2 2..
## $ LOCUS : num 0.29 -0.42 0.71 0.06 0.22 0.46 0..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ CONCPT: num 0.88 0.03 0.03 0.03 -0.28 0.03 -..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ MOT : num 0.67 0.33 0.67 0 0 0 0.33 1 0.33..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ CAR : Factor w/ 17 levels "1","2","3","4",...
## $ RDG : num 33.6 46.9 41.6 38.9 36.3 49.5 62..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ WRTG : num 43.7 35.9 59.3 41.1 48.9 46.3 64..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ MATH : num 40.2 41.9 41.9 32.7 39.5 46.2 48..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ SCI : num 39 36.3 44.4 41.7 41.7 41.7 63.4..
## ..- attr(*, "format.spss")= chr "F5.2"
## $ CIV : num 40.6 45.6 45.6 40.6 45.6 35.6 55..
## ..- attr(*, "format.spss")= chr "F5.2"
I. READING
I.1 RDG y SEX
f1=formula(RDG ~ SEX)
aggregate(f1, data,mean)
## SEX RDG
## 1 1 52.36300
## 2 2 51.51682
library(knitr)
tablag= aggregate(f1, data,
FUN = function(x) {y <- shapiro.test(x); c(y$statistic, y$p.value)})
# para que se vea mejor:
library(knitr)
shapiroTest=as.data.frame(tablag[,2])
names(shapiroTest)=c("W","Prob")
kable(cbind(tablag[1],shapiroTest))
| SEX | W | Prob |
|---|---|---|
| 1 | 0.9784886 | 0.0003866 |
| 2 | 0.9824176 | 0.0004953 |
Aqui lo puedes graficar:
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
ggqqplot(data=data,x="RDG") + facet_grid(. ~ SEX)
Como no hay normalidad, vamos por camino NO parametrico:
wilcox.test(f1,data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: RDG by SEX
## W = 46426, p-value = 0.3962
## alternative hypothesis: true location shift is not equal to 0
Visualmente:
ggplot(data=data, aes(x=SEX, y=RDG)) + geom_boxplot(notch = T)
Concluyendo que NO hay diferencia de medias.
Si hubieramos necesitado una pruab parametrica, debimos haber usaod la prueba T:
t.test(f1,data)
##
## Welch Two Sample t-test
##
## data: RDG by SEX
## t = 1.019, df = 572.67, p-value = 0.3087
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.784896 2.477264
## sample estimates:
## mean in group 1 mean in group 2
## 52.36300 51.51682
I.2 RDG y SES
f2=formula(RDG ~ SES)
aggregate(f2, data,mean)
## SES RDG
## 1 1 47.33597
## 2 2 51.92542
## 3 3 55.77593
library(knitr)
tablag=aggregate(f2, data,
FUN = function(x) {y <- shapiro.test(x); c(y$statistic, y$p.value)})
# para que se vea mejor:
shapiroTest=as.data.frame(tablag[,2])
names(shapiroTest)=c("W","Prob")
kable(cbind(tablag[1],shapiroTest))
| SES | W | Prob |
|---|---|---|
| 1 | 0.9533579 | 0.0001203 |
| 2 | 0.9841816 | 0.0022243 |
| 3 | 0.9814021 | 0.0282263 |
Aqui lo puedes graficar:
library(ggpubr)
ggqqplot(data=data,x="RDG") + facet_grid(. ~ SES)
Ninguno se acepta como normal. Vamos por no parametrico:
kruskal.test(f2,data)
##
## Kruskal-Wallis rank sum test
##
## data: RDG by SES
## Kruskal-Wallis chi-squared = 52.583, df = 2, p-value = 3.818e-12
Mismo resultado.
Visualmente:
ggplot(data=data, aes(x=SES, y=RDG)) + geom_boxplot(notch = T)
El valor medio que se alejaria seria el SES=1 de todos los demas.
Si se quiere una prureba parametrica, debemos utilizar la prueba F (anova de un factor):
summary(aov(f2, data))
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 5329 2664.7 28.5 1.5e-12 ***
## Residuals 597 55811 93.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I. SCIENCE
I.1 SCI y SEX
f1=formula(SCI ~ SEX)
aggregate(f1, data,mean)
## SEX SCI
## 1 1 53.23004
## 2 2 50.53884
library(knitr)
tablag=aggregate(f1, data,
FUN = function(x) {y <- shapiro.test(x); c(y$statistic, y$p.value)})
# para que se vea mejor:
shapiroTest=as.data.frame(tablag[,2])
names(shapiroTest)=c("W","Prob")
kable(cbind(tablag[1],shapiroTest))
| SEX | W | Prob |
|---|---|---|
| 1 | 0.9787394 | 0.0004268 |
| 2 | 0.9798800 | 0.0001525 |
Aqui lo puedes graficar:
library(ggpubr)
ggqqplot(data=data,x="SCI") + facet_grid(. ~ SEX)
Como no hay normalidad, vamos por camino NO parametrico:
wilcox.test(f1,data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: SCI by SEX
## W = 51631, p-value = 0.0009173
## alternative hypothesis: true location shift is not equal to 0
Visualmente:
ggplot(data=data, aes(x=SEX, y=SCI)) + geom_boxplot(notch = T)
Concluyendo que SI hay diferencia de medias.
I.2 SCI y SES
f2=formula(SCI ~ SES)
aggregate(f2, data,mean)
## SES SCI
## 1 1 47.30576
## 2 2 51.93010
## 3 3 55.28025
library(knitr)
tablag=aggregate(f2, data,
FUN = function(x) {y <- shapiro.test(x); c(y$statistic, y$p.value)})
# para que se vea mejor:
shapiroTest=as.data.frame(tablag[,2])
names(shapiroTest)=c("W","Prob")
kable(cbind(tablag[1],shapiroTest))
| SES | W | Prob |
|---|---|---|
| 1 | 0.9773277 | 0.0205115 |
| 2 | 0.9833695 | 0.0015284 |
| 3 | 0.9574177 | 0.0000744 |
Aqui lo puedes graficar:
library(ggpubr)
ggqqplot(data=data,x="SCI") + facet_grid(. ~ SES)
Ningun grupo es normal. Vamos por no parametrico:
kruskal.test(f2,data)
##
## Kruskal-Wallis rank sum test
##
## data: SCI by SES
## Kruskal-Wallis chi-squared = 52.806, df = 2, p-value = 3.415e-12
Hay diferencias!
Visualmente:
ggplot(data=data, aes(x=SES, y=SCI)) + geom_boxplot(notch = T)
Los tres grupos se diferencian
Volver al indice