library(htmltab)
# Scrape the happiness-index table from the Spanish Wikipedia article.
LINK6 <- "https://es.wikipedia.org/wiki/%C3%8Dndice_global_de_felicidad"
# XPath pointing at the table inside the article body.
GMT <- '//*[@id="mw-content-text"]/div/table'
feliz <- htmltab(doc = LINK6, which = GMT)
# Show the raw column names as scraped.
names(feliz)
## [1] "№"                                     
## [2] "País"                                  
## [3] "Puntuación"                            
## [4] "PIB per cápita"                        
## [5] "Apoyo social"                          
## [6] "Esperanza de años de vida saludable"   
## [7] "Libertad para tomar decisiones vitales"
## [8] "Generosidad"                           
## [9] "Percepción de la corrupción"
# Inspect the structure of the scraped table (every column arrives as text).
str(feliz)
## 'data.frame':    156 obs. of  9 variables:
##  $ №                                     : chr  "1" "2" "3" "4" ...
##  $ País                                  : chr  "Finlandia" "Colombia" "Noruega" "Dinamarca" ...
##  $ Puntuación                            : chr  "7.633" "7.594" "7.560" "7.555" ...
##  $ PIB per cápita                        : chr  "1.305" "1.456" "1.372" "1.351" ...
##  $ Apoyo social                          : chr  "1.592" "1.582" "1.595" "1.590" ...
##  $ Esperanza de años de vida saludable   : chr  "0.874" "0.873" "0.870" "0.868" ...
##  $ Libertad para tomar decisiones vitales: chr  "0.681" "0.686" "0.685" "0.683" ...
##  $ Generosidad                           : chr  "0.192" "0.286" "0.285" "0.284" ...
##  $ Percepción de la corrupción           : chr  "0.393" "0.130" "0.410" "0.408" ...
library(stringr)
# Keep only the first word of each column name so the columns can be used
# directly in formulas (e.g. "PIB per cápita" becomes "PIB").
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
names(feliz) <- str_split(names(feliz), " ", simplify = TRUE)[, 1]
# Columns 3 to 9 hold the scores, scraped as character; convert to numeric.
feliz[, 3:9] <- lapply(feliz[, 3:9], as.numeric)
# Confirm the new names and column types.
str(feliz)
## 'data.frame':    156 obs. of  9 variables:
##  $ №          : chr  "1" "2" "3" "4" ...
##  $ País       : chr  "Finlandia" "Colombia" "Noruega" "Dinamarca" ...
##  $ Puntuación : num  7.63 7.59 7.56 7.55 7.5 ...
##  $ PIB        : num  1.3 1.46 1.37 1.35 1.34 ...
##  $ Apoyo      : num  1.59 1.58 1.59 1.59 1.64 ...
##  $ Esperanza  : num  0.874 0.873 0.87 0.868 0.914 0.927 0.878 0.896 0.876 0.913 ...
##  $ Libertad   : num  0.681 0.686 0.685 0.683 0.677 0.66 0.638 0.653 0.669 0.659 ...
##  $ Generosidad: num  0.192 0.286 0.285 0.284 0.353 0.256 0.333 0.321 0.365 0.285 ...
##  $ Percepción : num  0.393 0.13 0.41 0.408 0.138 0.357 0.295 0.291 0.389 0.383 ...

Pregunta uno: la variable dependiente es Esperanza (esperanza de años de vida saludable). Veamos la normalidad de todas las variables:

library(dlookr)
## Loading required package: mice
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
# Normality check with dlookr: reports a test statistic and p-value for each
# numeric column of feliz (character columns are not listed in the result).
normality(feliz)
## Warning: `cols` is now required.
## Please use `cols = c(statistic)`
## # A tibble: 7 x 4
##   vars        statistic  p_value sample
##   <chr>           <dbl>    <dbl>  <dbl>
## 1 Puntuación      0.984 6.48e- 2    156
## 2 PIB             0.977 1.18e- 2    156
## 3 Apoyo           0.917 8.71e- 8    156
## 4 Esperanza       0.954 5.15e- 5    156
## 5 Libertad        0.946 1.13e- 5    156
## 6 Generosidad     0.960 1.79e- 4    156
## 7 Percepción      0.814 8.49e-13    156

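Casi todas las variables se apartan de la normalidad (p < 0.05); solo en Puntuación no se rechaza la normalidad (p = 0.065). Veamos la regresión: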

# Linear model with healthy life expectancy (Esperanza) as the response and
# the remaining component scores as predictors.
todas <- lm(
  formula = Esperanza ~ PIB + Apoyo + Libertad + Generosidad + Percepción,
  data = feliz
)
summary(todas)
## 
## Call:
## lm(formula = Esperanza ~ PIB + Apoyo + Libertad + Generosidad + 
##     Percepción, data = feliz)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44391 -0.06484  0.01431  0.07939  0.24825 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.003605   0.047854  -0.075  0.94004    
## PIB          0.445871   0.036908  12.081  < 2e-16 ***
## Apoyo        0.139466   0.048666   2.866  0.00476 ** 
## Libertad     0.046434   0.078767   0.590  0.55641    
## Generosidad -0.002610   0.116482  -0.022  0.98216    
## Percepción   0.122647   0.128416   0.955  0.34108    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1295 on 150 degrees of freedom
## Multiple R-squared:  0.7373, Adjusted R-squared:  0.7285 
## F-statistic: 84.18 on 5 and 150 DF,  p-value: < 2.2e-16

PIB y Generosidad: ambas se apartan de la normalidad.

# Correlation test between PIB and Generosidad.
num1 <- formula(~ PIB + Generosidad)

# NOTE(review): both variables fail the normality check reported earlier in
# this document, and the scatter plot for this pair reports Spearman's rho.
# `exact` is only used by the Kendall and Spearman tests, so it has no effect
# here. Consider method = "spearman"; the method is left as "pearson" so the
# recorded output stays valid until the document is re-knitted.
cor.test(num1, data = feliz, method = "pearson", exact = FALSE)
## 
##  Pearson's product-moment correlation
## 
## data:  PIB and Generosidad
## t = -0.060844, df = 154, p-value = 0.9516
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1619189  0.1523552
## sample estimates:
##          cor 
## -0.004902909

Ahora en un gráfico:

library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
# Scatter plot of PIB against Generosidad, annotated with the Spearman
# correlation coefficient.
s2 <- ggscatter(
  data = feliz,
  x = "Generosidad",
  y = "PIB",
  cor.coef = TRUE,
  cor.method = "spearman"
)
s2

La correlación es mínima.

## Entre años de vida saludable y percepción de corrupción

# Correlation test between Esperanza and Percepción.
num2 <- formula(~ Esperanza + Percepción)

# NOTE(review): both variables fail the normality check reported earlier in
# this document, and the scatter plot for this pair reports Spearman's rho.
# `exact` is only used by the Kendall and Spearman tests, so it has no effect
# here. Consider method = "spearman"; the method is left as "pearson" so the
# recorded output stays valid until the document is re-knitted.
cor.test(num2, data = feliz, method = "pearson", exact = FALSE)
## 
##  Pearson's product-moment correlation
## 
## data:  Esperanza and Percepción
## t = 4.0953, df = 154, p-value = 6.791e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1643339 0.4484393
## sample estimates:
##      cor 
## 0.313382
# Scatter plot of Esperanza against Percepción, annotated with the Spearman
# correlation coefficient.
s3 <- ggscatter(
  data = feliz,
  x = "Percepción",
  y = "Esperanza",
  cor.coef = TRUE,
  cor.method = "spearman"
)
s3

## Tomando a Percepción como dependiente:

# Linear model with perceived corruption (Percepción) as the response.
# Note: this assignment replaces the model previously stored in `todas`.
todas <- lm(
  formula = Percepción ~ PIB + Apoyo + Libertad + Generosidad + Esperanza,
  data = feliz
)
summary(todas)
## 
## Call:
## lm(formula = Percepción ~ PIB + Apoyo + Libertad + Generosidad + 
##     Esperanza, data = feliz)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18413 -0.05413 -0.01276  0.04097  0.31134 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.04356    0.03013  -1.446 0.150266    
## PIB          0.04141    0.03269   1.267 0.207190    
## Apoyo       -0.03841    0.03153  -1.218 0.225052    
## Libertad     0.19913    0.04727   4.213 4.34e-05 ***
## Generosidad  0.25261    0.07090   3.563 0.000492 ***
## Esperanza    0.04928    0.05160   0.955 0.341075    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08212 on 150 degrees of freedom
## Multiple R-squared:  0.3098, Adjusted R-squared:  0.2868 
## F-statistic: 13.47 on 5 and 150 DF,  p-value: 7.529e-11

segunda parte

# Load the provincial IDE data set from its published Google Sheets CSV.
link <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vTgL5kS-g3Q5TuAhkIAsnnz-nqPg-WREgkWA9y2sWPRSUPKIHWknh9hJY7UcXRFWA/pub?output=csv"
# Keep text columns as character; FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
IDE <- read.csv(link, stringsAsFactors = FALSE)
str(IDE)
## 'data.frame':    195 obs. of  13 variables:
##  $ regionUbigeo   : int  10000 10000 10000 10000 10000 10000 10000 20000 20000 20000 ...
##  $ provinciaUbigeo: int  10100 10200 10300 10400 10500 10600 10700 20100 20200 20300 ...
##  $ PROVINCIA      : chr  "CHACHAPOYAS" "BAGUA" "BONGARA" "CONDORCANQUI" ...
##  $ IDE            : num  0.774 0.662 0.632 0.46 0.605 ...
##  $ identidad      : num  98.6 94.6 97.5 86.2 96.2 ...
##  $ salud          : num  25.45 14.61 9.01 8.56 12.42 ...
##  $ educacion      : num  91.5 79.8 76.4 52.2 74.7 ...
##  $ saneamiento    : num  70.3 64.5 54.8 37.7 43.3 ...
##  $ electrificacion: num  84 67.9 72.2 39.5 67.4 ...
##  $ poblacion      : int  54783 77438 32317 51802 52185 30236 118747 161003 7974 16879 ...
##  $ costa          : chr  "NO" "NO" "NO" "NO" ...
##  $ capital        : chr  "SI" "NO" "NO" "NO" ...
##  $ tamano         : chr  "Pequena" "Pequena" "Muy pequena" "Pequena" ...
# Keep only the first word of each column name (names without spaces are
# left as they are). TRUE is spelled out because T can be reassigned.
names(IDE) <- str_split(names(IDE), " ", simplify = TRUE)[, 1]
names(IDE)
##  [1] "regionUbigeo"    "provinciaUbigeo" "PROVINCIA"      
##  [4] "IDE"             "identidad"       "salud"          
##  [7] "educacion"       "saneamiento"     "electrificacion"
## [10] "poblacion"       "costa"           "capital"        
## [13] "tamano"
# Convert the categorical columns to factors. They are selected by name
# (positions 11 to 13 in the listing above) so the conversion does not
# silently hit the wrong columns if the sheet's column order changes.
categoricas <- c("costa", "capital", "tamano")
IDE[, categoricas] <- lapply(IDE[, categoricas], as.factor)

# Linear model with salud as the response and every other indicator,
# numeric and categorical, as a predictor.
todas1 <- lm(
  formula = salud ~ identidad + educacion + saneamiento + electrificacion +
    poblacion + costa + capital + tamano,
  data = IDE
)
summary(todas1)
## 
## Call:
## lm(formula = salud ~ identidad + educacion + saneamiento + electrificacion + 
##     poblacion + costa + capital + tamano, data = IDE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3402  -3.2121  -0.2517   2.6801  19.4999 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.176e+00  1.662e+01   0.251   0.8019    
## identidad         -2.388e-02  1.886e-01  -0.127   0.8994    
## educacion          1.140e-02  4.858e-02   0.235   0.8147    
## saneamiento        1.351e-01  2.594e-02   5.209 5.07e-07 ***
## electrificacion    2.556e-02  3.751e-02   0.681   0.4965    
## poblacion          1.399e-06  4.946e-06   0.283   0.7776    
## costaSI            2.171e+00  1.218e+00   1.783   0.0763 .  
## capitalSI          7.036e+00  1.546e+00   4.551 9.73e-06 ***
## tamanoMediana     -3.462e+00  2.507e+00  -1.381   0.1690    
## tamanoMuy grande  -1.258e+00  3.940e+01  -0.032   0.9746    
## tamanoMuy pequena -3.255e-01  3.043e+00  -0.107   0.9149    
## tamanoPequena     -2.534e+00  2.859e+00  -0.887   0.3765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.86 on 183 degrees of freedom
## Multiple R-squared:  0.5837, Adjusted R-squared:  0.5587 
## F-statistic: 23.33 on 11 and 183 DF,  p-value: < 2.2e-16

SI TOMAMOS A IDENTIDAD COMO LA DEPENDIENTE

# Same set of variables, now with identidad as the response and salud moved
# to the predictor side.
todas2 <- lm(
  formula = identidad ~ salud + educacion + saneamiento + electrificacion +
    poblacion + costa + capital + tamano,
  data = IDE
)
summary(todas2)
## 
## Call:
## lm(formula = identidad ~ salud + educacion + saneamiento + electrificacion + 
##     poblacion + costa + capital + tamano, data = IDE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1774  -0.8996   0.2286   1.1466   4.0155 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.530e+01  1.642e+00  51.935  < 2e-16 ***
## salud             -3.668e-03  2.897e-02  -0.127  0.89938    
## educacion          1.190e-01  1.689e-02   7.045 3.64e-11 ***
## saneamiento        2.971e-02  1.067e-02   2.785  0.00592 ** 
## electrificacion   -8.407e-03  1.470e-02  -0.572  0.56819    
## poblacion          9.183e-07  1.937e-06   0.474  0.63607    
## costaSI            1.979e-01  4.812e-01   0.411  0.68139    
## capitalSI          2.562e-01  6.391e-01   0.401  0.68898    
## tamanoMediana      1.178e+00  9.838e-01   1.198  0.23259    
## tamanoMuy grande  -6.758e+00  1.543e+01  -0.438  0.66192    
## tamanoMuy pequena  1.767e+00  1.185e+00   1.491  0.13777    
## tamanoPequena      1.030e+00  1.120e+00   0.920  0.35884    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.905 on 183 degrees of freedom
## Multiple R-squared:  0.4343, Adjusted R-squared:  0.4003 
## F-statistic: 12.77 on 11 and 183 DF,  p-value: < 2.2e-16

SANEAMIENTO Y COSTA:

# saneamiento explained by the costa grouping; reused by the tests below.
f1 <- formula(saneamiento ~ costa)
# Mean saneamiento within each level of costa.
aggregate(f1, data = IDE, FUN = mean)
##   costa saneamiento
## 1    NO    50.97556
## 2    SI    83.20243

Hasta aquí podemos decir que el saneamiento es mejor en las provincias de la costa que en las que no lo son. Veamos si corresponde o no una prueba paramétrica.

# Q-Q plot of saneamiento, one panel per level of costa.
ggqqplot(data = IDE, x = "saneamiento") +
  facet_grid(. ~ costa)

# Ad-hoc helper: run shapiro.test() on a numeric vector and return the test
# statistic followed by the p-value.
normalidadTest <- function(x) {
  sw <- shapiro.test(x)
  c(sw$statistic, sw$p.value)
}
# Apply the helper to saneamiento within each level of costa.
resultado <- aggregate(f1, data = IDE, FUN = normalidadTest)


# Display the result as a table.
library(knitr)

# The second column of `resultado` holds the statistic / p-value pairs;
# turn it into its own data frame with readable column names.
shapiroTest <- as.data.frame(resultado[[2]])
names(shapiroTest) <- c("SW_Statistic", "Probabilidad")
# Put the costa labels next to the test results and render the table.
kable(cbind(resultado[1], shapiroTest))
costa SW_Statistic Probabilidad
NO 0.9949525 0.8529651
SI 0.9580179 0.2422331

Sí: en ambos grupos no se rechaza la normalidad (p > 0.05), por lo que corresponde una prueba paramétrica.

EN ESTE CASO SE USA LA PRUEBA T

# Two-sample t test comparing mean saneamiento between the costa groups.
t.test(f1, data = IDE)
## 
##  Welch Two Sample t-test
## 
## data:  saneamiento by costa
## t = -14.131, df = 73.273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -36.77164 -27.68210
## sample estimates:
## mean in group NO mean in group SI 
##         50.97556         83.20243

La H0 (los promedios de la variable numérica en cada grupo de la dicotómica no se diferencian) se rechaza, al comprobar que el p-valor es inferior a 0.05.