library(htmltab)
# Scrape the happiness-index table from the Spanish Wikipedia article.
LINK6 <- "https://es.wikipedia.org/wiki/%C3%8Dndice_global_de_felicidad"
# XPath pointing at the table inside the article body.
GMT <- '//*[@id="mw-content-text"]/div/table'
feliz <- htmltab(doc = LINK6, which = GMT)
# Show the column headers exactly as scraped.
names(feliz)
## [1] "№"
## [2] "País"
## [3] "Puntuación"
## [4] "PIB per cápita"
## [5] "Apoyo social"
## [6] "Esperanza de años de vida saludable"
## [7] "Libertad para tomar decisiones vitales"
## [8] "Generosidad"
## [9] "Percepción de la corrupción"
# Inspect the structure of the scraped table before any cleaning.
str(feliz)
## 'data.frame': 156 obs. of 9 variables:
## $ № : chr "1" "2" "3" "4" ...
## $ País : chr "Finlandia" "Colombia" "Noruega" "Dinamarca" ...
## $ Puntuación : chr "7.633" "7.594" "7.560" "7.555" ...
## $ PIB per cápita : chr "1.305" "1.456" "1.372" "1.351" ...
## $ Apoyo social : chr "1.592" "1.582" "1.595" "1.590" ...
## $ Esperanza de años de vida saludable : chr "0.874" "0.873" "0.870" "0.868" ...
## $ Libertad para tomar decisiones vitales: chr "0.681" "0.686" "0.685" "0.683" ...
## $ Generosidad : chr "0.192" "0.286" "0.285" "0.284" ...
## $ Percepción de la corrupción : chr "0.393" "0.130" "0.410" "0.408" ...
library(stringr)
# Keep only the first word of every column name
# (e.g. "PIB per cápita" becomes "PIB") so the names are formula-friendly.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
names(feliz) <- str_split(names(feliz), " ", simplify = TRUE)[, 1]
# Columns 3 to 9 were scraped as character; convert them to numeric.
feliz[, 3:9] <- lapply(feliz[, 3:9], as.numeric)
# Confirm the new names and types.
str(feliz)
## 'data.frame': 156 obs. of 9 variables:
## $ № : chr "1" "2" "3" "4" ...
## $ País : chr "Finlandia" "Colombia" "Noruega" "Dinamarca" ...
## $ Puntuación : num 7.63 7.59 7.56 7.55 7.5 ...
## $ PIB : num 1.3 1.46 1.37 1.35 1.34 ...
## $ Apoyo : num 1.59 1.58 1.59 1.59 1.64 ...
## $ Esperanza : num 0.874 0.873 0.87 0.868 0.914 0.927 0.878 0.896 0.876 0.913 ...
## $ Libertad : num 0.681 0.686 0.685 0.683 0.677 0.66 0.638 0.653 0.669 0.659 ...
## $ Generosidad: num 0.192 0.286 0.285 0.284 0.353 0.256 0.333 0.321 0.365 0.285 ...
## $ Percepción : num 0.393 0.13 0.41 0.408 0.138 0.357 0.295 0.291 0.389 0.383 ...
Pregunta uno: la variable dependiente es la esperanza de años de vida saludable. Veamos la normalidad de todas las variables:
library(dlookr)
## Loading required package: mice
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
# Run dlookr's normality test on feliz; the printed result reports a
# statistic, p-value and sample size for each numeric variable.
normality(feliz)
## Warning: `cols` is now required.
## Please use `cols = c(statistic)`
## # A tibble: 7 x 4
## vars statistic p_value sample
## <chr> <dbl> <dbl> <dbl>
## 1 Puntuación 0.984 6.48e- 2 156
## 2 PIB 0.977 1.18e- 2 156
## 3 Apoyo 0.917 8.71e- 8 156
## 4 Esperanza 0.954 5.15e- 5 156
## 5 Libertad 0.946 1.13e- 5 156
## 6 Generosidad 0.960 1.79e- 4 156
## 7 Percepción 0.814 8.49e-13 156
Salvo Puntuación (p = 0.065), las variables no siguen una distribución normal (p < 0.05). Veamos la regresión:
# Linear model with Esperanza as the response and the other five
# components as predictors.
todas <- lm(
  Esperanza ~ PIB + Apoyo + Libertad + Generosidad + Percepción,
  data = feliz
)
summary(todas)
##
## Call:
## lm(formula = Esperanza ~ PIB + Apoyo + Libertad + Generosidad +
## Percepción, data = feliz)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44391 -0.06484 0.01431 0.07939 0.24825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.003605 0.047854 -0.075 0.94004
## PIB 0.445871 0.036908 12.081 < 2e-16 ***
## Apoyo 0.139466 0.048666 2.866 0.00476 **
## Libertad 0.046434 0.078767 0.590 0.55641
## Generosidad -0.002610 0.116482 -0.022 0.98216
## Percepción 0.122647 0.128416 0.955 0.34108
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1295 on 150 degrees of freedom
## Multiple R-squared: 0.7373, Adjusted R-squared: 0.7285
## F-statistic: 84.18 on 5 and 150 DF, p-value: < 2.2e-16
PIB y Generosidad: ambas variables son no normales.
# Correlation test between PIB and Generosidad.
# NOTE(review): both variables were flagged as non-normal and the scatter
# plot for this pair reports Spearman, while this test uses Pearson.
# `exact` only affects Kendall/Spearman, so it has no effect here.
# Confirm which method is intended before changing it, since the recorded
# output depends on it.
num1 <- formula(~ PIB + Generosidad)
cor.test(num1, data = feliz, method = "pearson", exact = FALSE)
##
## Pearson's product-moment correlation
##
## data: PIB and Generosidad
## t = -0.060844, df = 154, p-value = 0.9516
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1619189 0.1523552
## sample estimates:
## cor
## -0.004902909
Ahora en un gráfico:
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
# Scatter plot of PIB against Generosidad, annotated with the Spearman
# correlation coefficient.
s2 <- ggscatter(
  feliz,
  x = "Generosidad",
  y = "PIB",
  cor.coef = TRUE,
  cor.method = "spearman"
)
s2
La correlación es mínima.
## Entre años de vida saludable y percepción de la corrupción
# Correlation test between Esperanza and Percepción.
# NOTE(review): both variables were flagged as non-normal and the scatter
# plot for this pair reports Spearman, while this test uses Pearson.
# `exact` only affects Kendall/Spearman, so it has no effect here.
# Confirm which method is intended before changing it, since the recorded
# output depends on it.
num2 <- formula(~ Esperanza + Percepción)
cor.test(num2, data = feliz, method = "pearson", exact = FALSE)
##
## Pearson's product-moment correlation
##
## data: Esperanza and Percepción
## t = 4.0953, df = 154, p-value = 6.791e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1643339 0.4484393
## sample estimates:
## cor
## 0.313382
# Scatter plot of Esperanza against Percepción, annotated with the
# Spearman correlation coefficient.
s3 <- ggscatter(
  feliz,
  x = "Percepción",
  y = "Esperanza",
  cor.coef = TRUE,
  cor.method = "spearman"
)
s3
## Tomando a Percepción como dependiente:
# Same set of variables, now with Percepción as the response.
# This assignment overwrites the model previously stored in `todas`.
todas <- lm(
  Percepción ~ PIB + Apoyo + Libertad + Generosidad + Esperanza,
  data = feliz
)
summary(todas)
##
## Call:
## lm(formula = Percepción ~ PIB + Apoyo + Libertad + Generosidad +
## Esperanza, data = feliz)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18413 -0.05413 -0.01276 0.04097 0.31134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.04356 0.03013 -1.446 0.150266
## PIB 0.04141 0.03269 1.267 0.207190
## Apoyo -0.03841 0.03153 -1.218 0.225052
## Libertad 0.19913 0.04727 4.213 4.34e-05 ***
## Generosidad 0.25261 0.07090 3.563 0.000492 ***
## Esperanza 0.04928 0.05160 0.955 0.341075
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08212 on 150 degrees of freedom
## Multiple R-squared: 0.3098, Adjusted R-squared: 0.2868
## F-statistic: 13.47 on 5 and 150 DF, p-value: 7.529e-11
# Load the provincial IDE dataset from the published Google Sheets CSV.
link <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vTgL5kS-g3Q5TuAhkIAsnnz-nqPg-WREgkWA9y2sWPRSUPKIHWknh9hJY7UcXRFWA/pub?output=csv"
# Keep text columns as character; FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
IDE <- read.csv(link, stringsAsFactors = FALSE)
str(IDE)
## 'data.frame': 195 obs. of 13 variables:
## $ regionUbigeo : int 10000 10000 10000 10000 10000 10000 10000 20000 20000 20000 ...
## $ provinciaUbigeo: int 10100 10200 10300 10400 10500 10600 10700 20100 20200 20300 ...
## $ PROVINCIA : chr "CHACHAPOYAS" "BAGUA" "BONGARA" "CONDORCANQUI" ...
## $ IDE : num 0.774 0.662 0.632 0.46 0.605 ...
## $ identidad : num 98.6 94.6 97.5 86.2 96.2 ...
## $ salud : num 25.45 14.61 9.01 8.56 12.42 ...
## $ educacion : num 91.5 79.8 76.4 52.2 74.7 ...
## $ saneamiento : num 70.3 64.5 54.8 37.7 43.3 ...
## $ electrificacion: num 84 67.9 72.2 39.5 67.4 ...
## $ poblacion : int 54783 77438 32317 51802 52185 30236 118747 161003 7974 16879 ...
## $ costa : chr "NO" "NO" "NO" "NO" ...
## $ capital : chr "SI" "NO" "NO" "NO" ...
## $ tamano : chr "Pequena" "Pequena" "Muy pequena" "Pequena" ...
# Keep only the first word of every column name.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
names(IDE) <- str_split(names(IDE), " ", simplify = TRUE)[, 1]
names(IDE)
## [1] "regionUbigeo" "provinciaUbigeo" "PROVINCIA"
## [4] "IDE" "identidad" "salud"
## [7] "educacion" "saneamiento" "electrificacion"
## [10] "poblacion" "costa" "capital"
## [13] "tamano"
# Columns 11 to 13 (costa, capital, tamano) are categorical: store them
# as factors so lm() builds dummy variables for them.
IDE[, 11:13] <- lapply(IDE[, 11:13], as.factor)
# Linear model with salud as the response.
todas1 <- lm(
  salud ~ identidad + educacion + saneamiento + electrificacion +
    poblacion + costa + capital + tamano,
  data = IDE
)
summary(todas1)
##
## Call:
## lm(formula = salud ~ identidad + educacion + saneamiento + electrificacion +
## poblacion + costa + capital + tamano, data = IDE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3402 -3.2121 -0.2517 2.6801 19.4999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.176e+00 1.662e+01 0.251 0.8019
## identidad -2.388e-02 1.886e-01 -0.127 0.8994
## educacion 1.140e-02 4.858e-02 0.235 0.8147
## saneamiento 1.351e-01 2.594e-02 5.209 5.07e-07 ***
## electrificacion 2.556e-02 3.751e-02 0.681 0.4965
## poblacion 1.399e-06 4.946e-06 0.283 0.7776
## costaSI 2.171e+00 1.218e+00 1.783 0.0763 .
## capitalSI 7.036e+00 1.546e+00 4.551 9.73e-06 ***
## tamanoMediana -3.462e+00 2.507e+00 -1.381 0.1690
## tamanoMuy grande -1.258e+00 3.940e+01 -0.032 0.9746
## tamanoMuy pequena -3.255e-01 3.043e+00 -0.107 0.9149
## tamanoPequena -2.534e+00 2.859e+00 -0.887 0.3765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.86 on 183 degrees of freedom
## Multiple R-squared: 0.5837, Adjusted R-squared: 0.5587
## F-statistic: 23.33 on 11 and 183 DF, p-value: < 2.2e-16
# Linear model with identidad as the response and salud moved to the
# predictor side.
todas2 <- lm(
  identidad ~ salud + educacion + saneamiento + electrificacion +
    poblacion + costa + capital + tamano,
  data = IDE
)
summary(todas2)
##
## Call:
## lm(formula = identidad ~ salud + educacion + saneamiento + electrificacion +
## poblacion + costa + capital + tamano, data = IDE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1774 -0.8996 0.2286 1.1466 4.0155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.530e+01 1.642e+00 51.935 < 2e-16 ***
## salud -3.668e-03 2.897e-02 -0.127 0.89938
## educacion 1.190e-01 1.689e-02 7.045 3.64e-11 ***
## saneamiento 2.971e-02 1.067e-02 2.785 0.00592 **
## electrificacion -8.407e-03 1.470e-02 -0.572 0.56819
## poblacion 9.183e-07 1.937e-06 0.474 0.63607
## costaSI 1.979e-01 4.812e-01 0.411 0.68139
## capitalSI 2.562e-01 6.391e-01 0.401 0.68898
## tamanoMediana 1.178e+00 9.838e-01 1.198 0.23259
## tamanoMuy grande -6.758e+00 1.543e+01 -0.438 0.66192
## tamanoMuy pequena 1.767e+00 1.185e+00 1.491 0.13777
## tamanoPequena 1.030e+00 1.120e+00 0.920 0.35884
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.905 on 183 degrees of freedom
## Multiple R-squared: 0.4343, Adjusted R-squared: 0.4003
## F-statistic: 12.77 on 11 and 183 DF, p-value: < 2.2e-16
# Formula reused below: saneamiento grouped by costa.
f1 <- formula(saneamiento ~ costa)
# Mean saneamiento for each level of costa.
aggregate(f1, data = IDE, FUN = mean)
## costa saneamiento
## 1 NO 50.97556
## 2 SI 83.20243
Hasta aquí podemos decir que el saneamiento es mejor en las provincias de la costa que en las que no lo son. Veamos si corresponde o no una prueba paramétrica.
# Normal Q-Q plot of saneamiento, one panel per level of costa.
ggqqplot(data = IDE, x = "saneamiento") +
  facet_grid(. ~ costa)
# Ad-hoc helper: run the Shapiro-Wilk test on a numeric vector `x` and
# return a length-2 numeric vector holding the W statistic followed by
# the p-value.
normalidadTest <- function(x) {
  sw <- shapiro.test(x)
  c(sw[["statistic"]], sw[["p.value"]])
}
# Apply the Shapiro-Wilk helper to saneamiento within each level of costa.
resultado <- aggregate(f1, data = IDE, FUN = normalidadTest)
# Display the result as a table.
library(knitr)
# The second column of `resultado` carries both values returned by the
# helper; turn it into a two-column data frame and label the columns.
shapiroTest <- as.data.frame(resultado[, 2])
names(shapiroTest) <- c("SW_Statistic", "Probabilidad")
kable(cbind(resultado[1], shapiroTest))
| costa | SW_Statistic | Probabilidad |
|---|---|---|
| NO | 0.9949525 | 0.8529651 |
| SI | 0.9580179 | 0.2422331 |
Ambos grupos siguen una distribución normal (p > 0.05 en Shapiro-Wilk), por lo que corresponde una prueba paramétrica.
En este caso se usa la prueba t.
# Two-sample t-test comparing mean saneamiento between the levels of costa.
t.test(f1, data = IDE)
##
## Welch Two Sample t-test
##
## data: saneamiento by costa
## t = -14.131, df = 73.273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -36.77164 -27.68210
## sample estimates:
## mean in group NO mean in group SI
## 50.97556 83.20243
La H0 (los promedios de la variable numérica en cada grupo de la dicotómica no se diferencian) se rechaza, ya que el p-valor es inferior a 0.05.