Prueba final

Recodificación

# Scrape the happiness-index table from the Spanish Wikipedia article.
library(htmltab)
IF <- htmltab(
  doc = "https://es.wikipedia.org/wiki/%C3%8Dndice_global_de_felicidad",
  which = '//*[@id="mw-content-text"]/div/table'
)

# Inspect the structure of the scraped table.
str(IF)

## 'data.frame':    156 obs. of  9 variables:
##  $ №                                     : chr  "1" "2" "3" "4" ...
##  $ País                                  : chr  "Finlandia" "Colombia" "Noruega" "Dinamarca" ...
##  $ Puntuación                            : chr  "7.633" "7.594" "7.560" "7.555" ...
##  $ PIB per cápita                        : chr  "1.305" "1.456" "1.372" "1.351" ...
##  $ Apoyo social                          : chr  "1.592" "1.582" "1.595" "1.590" ...
##  $ Esperanza de años de vida saludable   : chr  "0.874" "0.873" "0.870" "0.868" ...
##  $ Libertad para tomar decisiones vitales: chr  "0.681" "0.686" "0.685" "0.683" ...
##  $ Generosidad                           : chr  "0.192" "0.286" "0.285" "0.284" ...
##  $ Percepción de la corrupción           : chr  "0.393" "0.130" "0.410" "0.408" ...

Me quedo con lo que necesito

# Keep columns 3 to 9 only (drops the rank and country columns).
IF <- IF[, 3:9]

Son numéricas

# Convert the seven remaining columns from character to numeric.
IF[, 1:7] <- lapply(IF[, 1:7], as.numeric)

Se eliminan los espacios: cada nombre de columna se reduce a su primera palabra

library(stringr)
# Shorten every column name to its first space-delimited word.
names(IF) <- str_split(names(IF), " ", simplify = TRUE)[, 1]

# Confirm the new names and the numeric types.
str(IF)

## 'data.frame':    156 obs. of  7 variables:
##  $ Puntuación : num  7.63 7.59 7.56 7.55 7.5 ...
##  $ PIB        : num  1.3 1.46 1.37 1.35 1.34 ...
##  $ Apoyo      : num  1.59 1.58 1.59 1.59 1.64 ...
##  $ Esperanza  : num  0.874 0.873 0.87 0.868 0.914 0.927 0.878 0.896 0.876 0.913 ...
##  $ Libertad   : num  0.681 0.686 0.685 0.683 0.677 0.66 0.638 0.653 0.669 0.659 ...
##  $ Generosidad: num  0.192 0.286 0.285 0.284 0.353 0.256 0.333 0.321 0.365 0.285 ...
##  $ Percepción : num  0.393 0.13 0.41 0.408 0.138 0.357 0.295 0.291 0.389 0.383 ...

Pregunta 1

# Linear model: Percepción explained by every other column of IF.
regresion1 <- lm(formula = Percepción ~ ., data = IF)

summary(regresion1)

## 
## Call:
## lm(formula = Percepción ~ ., data = IF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17316 -0.05857 -0.01548  0.04287  0.33224 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.07672    0.03802  -2.018  0.04538 * 
## Puntuación   0.01812    0.01274   1.422  0.15718   
## PIB          0.02067    0.03569   0.579  0.56330   
## Apoyo       -0.05587    0.03374  -1.656  0.09980 . 
## Esperanza    0.03400    0.05254   0.647  0.51855   
## Libertad     0.17098    0.05110   3.346  0.00104 **
## Generosidad  0.23796    0.07141   3.332  0.00109 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08184 on 149 degrees of freedom
## Multiple R-squared:  0.3191, Adjusted R-squared:  0.2917 
## F-statistic: 11.64 on 6 and 149 DF,  p-value: 1.145e-10

Pregunta 2

Análisis bivariado

Ver su normalidad

library(dlookr)

## Loading required package: mice

## Loading required package: lattice

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

## Warning in fun(libname, pkgname): couldn't connect to display ":0"

## 
## Attaching package: 'dlookr'

## The following object is masked from 'package:base':
## 
##     transform

# Normality test for the two variables compared in this question.
# Columns are selected by name rather than by position so the check keeps
# targeting PIB and Generosidad even if the scraped table changes its layout.
normality(IF[, c("PIB", "Generosidad")])

## Warning: `cols` is now required.
## Please use `cols = c(statistic)`

## # A tibble: 2 x 4
##   vars        statistic  p_value sample
##   <chr>           <dbl>    <dbl>  <dbl>
## 1 PIB             0.977 0.0118      156
## 2 Generosidad     0.960 0.000179    156

Ningún p-valor es mayor a 0.05 (ninguna de las dos variables es normal); por ende, se va por el camino no paramétrico (Spearman).

Corroborar hipótesis (H0 = No hay correlación)

# One-sided formula naming the two variables under study.
# It is not passed to the plotting call below.
frog <- ~ PIB + Generosidad

library(ggplot2)
library(magrittr)
library(ggpubr)

# Scatter plot of PIB against Generosidad with a fitted regression line,
# its confidence band and the Spearman correlation coefficient.
NOOOO <- ggscatter(
  IF,
  x = "Generosidad",
  y = "PIB",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman"
)
NOOOO

Conclusión: No existe correlación ni significatividad (R = 0.00059, está muy cerca de 0).

Pregunta 3

Ver su normalidad

library(dlookr)
# Normality test for the two variables compared in this question.
# Columns are selected by name rather than by position so the check keeps
# targeting Esperanza and Percepción even if the scraped table changes layout.
normality(IF[, c("Esperanza", "Percepción")])

## Warning: `cols` is now required.
## Please use `cols = c(statistic)`

## # A tibble: 2 x 4
##   vars       statistic  p_value sample
##   <chr>          <dbl>    <dbl>  <dbl>
## 1 Esperanza      0.954 5.15e- 5    156
## 2 Percepción     0.814 8.49e-13    156

Ninguna de las dos variables es normal (ambos p-valores son menores a 0.05), por lo que se usa la correlación de Spearman.

Corroborar hipótesis (H0 = No hay correlación)

# One-sided formula naming the two variables under study.
# It is not passed to the plotting call below.
nahnah <- ~ Esperanza + Percepción

library(ggplot2)
library(magrittr)
library(ggpubr)

# Scatter plot of Esperanza against Percepción with a fitted regression line,
# its confidence band and the Spearman correlation coefficient.
Grafica0 <- ggscatter(
  IF,
  x = "Percepción",
  y = "Esperanza",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman"
)
Grafica0

Conclusión: Hay correlación, pero esta no es significativa (R = 0.21).

Pregunta 4

# Linear model: Percepción explained by every other column of IF.
regresion2 <- lm(formula = Percepción ~ ., data = IF)

summary(regresion2)

## 
## Call:
## lm(formula = Percepción ~ ., data = IF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17316 -0.05857 -0.01548  0.04287  0.33224 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.07672    0.03802  -2.018  0.04538 * 
## Puntuación   0.01812    0.01274   1.422  0.15718   
## PIB          0.02067    0.03569   0.579  0.56330   
## Apoyo       -0.05587    0.03374  -1.656  0.09980 . 
## Esperanza    0.03400    0.05254   0.647  0.51855   
## Libertad     0.17098    0.05110   3.346  0.00104 **
## Generosidad  0.23796    0.07141   3.332  0.00109 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08184 on 149 degrees of freedom
## Multiple R-squared:  0.3191, Adjusted R-squared:  0.2917 
## F-statistic: 11.64 on 6 and 149 DF,  p-value: 1.145e-10

Una variable (Apoyo) tiene efecto inverso: su coeficiente es negativo.

Recodificación

# Published Google Sheets CSV that holds the IDE data set.
linkA <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vQAgq73wRBfVslN7zarqk50rY3a3WjfVKCoNFhF7rZdvpktMIDNHAnAI8YXL1ZMnkJFLiYnnYYOfeDk/pub?output=csv"

# Read text columns as plain character vectors and treat empty cells as NA.
IDE <- read.csv(linkA, stringsAsFactors = FALSE, na.strings = "")

str(IDE)

## 'data.frame':    195 obs. of  13 variables:
##  $ regionUbigeo   : int  10000 10000 10000 10000 10000 10000 10000 20000 20000 20000 ...
##  $ provinciaUbigeo: int  10100 10200 10300 10400 10500 10600 10700 20100 20200 20300 ...
##  $ PROVINCIA      : chr  "CHACHAPOYAS" "BAGUA" "BONGARA" "CONDORCANQUI" ...
##  $ IDE            : num  0.774 0.662 0.632 0.46 0.605 ...
##  $ identidad      : num  98.6 94.6 97.5 86.2 96.2 ...
##  $ salud          : num  25.45 14.61 9.01 8.56 12.42 ...
##  $ educacion      : num  91.5 79.8 76.4 52.2 74.7 ...
##  $ saneamiento    : num  70.3 64.5 54.8 37.7 43.3 ...
##  $ electrificacion: num  84 67.9 72.2 39.5 67.4 ...
##  $ poblacion      : int  54783 77438 32317 51802 52185 30236 118747 161003 7974 16879 ...
##  $ costa          : chr  "NO" "NO" "NO" "NO" ...
##  $ capital        : chr  "SI" "NO" "NO" "NO" ...
##  $ tamano         : chr  "Pequena" "Pequena" "Muy pequena" "Pequena" ...

# Keep columns 5 to 13 only (identidad through tamano).
IDE <- IDE[, 5:13]

str(IDE)

## 'data.frame':    195 obs. of  9 variables:
##  $ identidad      : num  98.6 94.6 97.5 86.2 96.2 ...
##  $ salud          : num  25.45 14.61 9.01 8.56 12.42 ...
##  $ educacion      : num  91.5 79.8 76.4 52.2 74.7 ...
##  $ saneamiento    : num  70.3 64.5 54.8 37.7 43.3 ...
##  $ electrificacion: num  84 67.9 72.2 39.5 67.4 ...
##  $ poblacion      : int  54783 77438 32317 51802 52185 30236 118747 161003 7974 16879 ...
##  $ costa          : chr  "NO" "NO" "NO" "NO" ...
##  $ capital        : chr  "SI" "NO" "NO" "NO" ...
##  $ tamano         : chr  "Pequena" "Pequena" "Muy pequena" "Pequena" ...

Modificar variable tamaño

str(IDE$tamano)

##  chr [1:195] "Pequena" "Pequena" "Muy pequena" "Pequena" "Pequena" ...

# Recode tamano as an ordered factor, listing the categories from the
# smallest to the largest.
# NOTE(review): any value not listed in W becomes NA; possibly related to the
# observation dropped for missingness in the regressions -- confirm.
W <- c("Muy pequena", "Pequena", "Mediana", "Grande")
IDE$tamano <- factor(IDE$tamano, levels = W, ordered = TRUE)

str(IDE)

## 'data.frame':    195 obs. of  9 variables:
##  $ identidad      : num  98.6 94.6 97.5 86.2 96.2 ...
##  $ salud          : num  25.45 14.61 9.01 8.56 12.42 ...
##  $ educacion      : num  91.5 79.8 76.4 52.2 74.7 ...
##  $ saneamiento    : num  70.3 64.5 54.8 37.7 43.3 ...
##  $ electrificacion: num  84 67.9 72.2 39.5 67.4 ...
##  $ poblacion      : int  54783 77438 32317 51802 52185 30236 118747 161003 7974 16879 ...
##  $ costa          : chr  "NO" "NO" "NO" "NO" ...
##  $ capital        : chr  "SI" "NO" "NO" "NO" ...
##  $ tamano         : Ord.factor w/ 4 levels "Muy pequena"<..: 2 2 1 2 2 1 3 3 1 1 ...

Pregunta 1

# Linear model: identidad explained by every other column of IDE.
regresion3 <- lm(formula = identidad ~ ., data = IDE)

summary(regresion3)

## 
## Call:
## lm(formula = identidad ~ ., data = IDE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1774  -0.9021   0.2405   1.1481   4.0155 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.629e+01  1.165e+00  74.085  < 2e-16 ***
## salud           -3.668e-03  2.897e-02  -0.127  0.89938    
## educacion        1.190e-01  1.689e-02   7.045 3.64e-11 ***
## saneamiento      2.971e-02  1.067e-02   2.785  0.00592 ** 
## electrificacion -8.407e-03  1.470e-02  -0.572  0.56819    
## poblacion        9.183e-07  1.937e-06   0.474  0.63607    
## costaSI          1.979e-01  4.812e-01   0.411  0.68139    
## capitalSI        2.562e-01  6.391e-01   0.401  0.68898    
## tamano.L        -1.152e+00  8.347e-01  -1.380  0.16918    
## tamano.Q        -2.210e-01  4.948e-01  -0.447  0.65567    
## tamano.C        -4.943e-01  3.030e-01  -1.632  0.10450    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.905 on 183 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4323, Adjusted R-squared:  0.4012 
## F-statistic: 13.93 on 10 and 183 DF,  p-value: < 2.2e-16

Educación posee un efecto más significativo (nivel 0, '***') que saneamiento (nivel 0.001, '**').

Pregunta 2

# Linear model: salud explained by every other column of IDE.
regresion4 <- lm(formula = salud ~ ., data = IDE)

summary(regresion4)

## 
## Call:
## lm(formula = salud ~ ., data = IDE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3402  -3.2217  -0.2625   2.6839  19.4999 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.596e+00  1.654e+01   0.157   0.8755    
## identidad       -2.388e-02  1.886e-01  -0.127   0.8994    
## educacion        1.140e-02  4.858e-02   0.235   0.8147    
## saneamiento      1.351e-01  2.594e-02   5.209 5.07e-07 ***
## electrificacion  2.556e-02  3.751e-02   0.681   0.4965    
## poblacion        1.399e-06  4.946e-06   0.283   0.7776    
## costaSI          2.171e+00  1.218e+00   1.783   0.0763 .  
## capitalSI        7.036e+00  1.546e+00   4.551 9.73e-06 ***
## tamano.L         1.093e-02  2.141e+00   0.005   0.9959    
## tamano.Q         2.836e+00  1.246e+00   2.276   0.0240 *  
## tamano.C         6.952e-01  7.770e-01   0.895   0.3721    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.86 on 183 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5535, Adjusted R-squared:  0.5291 
## F-statistic: 22.68 on 10 and 183 DF,  p-value: < 2.2e-16

Saneamiento y capital tienen un efecto significativo al nivel 0 ('***').

Pregunta 3

# saneamiento as the response, grouped by the costa indicator.
joooo <- saneamiento ~ costa

# Mean saneamiento within each costa group.
aggregate(joooo, data = IDE, FUN = mean)

##   costa saneamiento
## 1    NO    50.97556
## 2    SI    83.20243

# Run the Shapiro-Wilk normality test on a numeric vector.
# Returns a length-2 numeric vector: the test statistic, then the p-value.
dadadad <- function(x) {
  sw <- shapiro.test(x)
  c(sw[["statistic"]], sw[["p.value"]])
}


library(knitr)

# Apply the Shapiro-Wilk helper to saneamiento within each costa group.
resultado <- aggregate(joooo, data = IDE, FUN = dadadad)

# The second column of the result holds statistic and p-value side by side;
# turn it into a data frame with readable column names.
shapiroTest <- setNames(
  as.data.frame(resultado[[2]]),
  c("SW_Statistic", "Probabilidad")
)

# Render the group label next to its test results.
kable(cbind(resultado[1], shapiroTest))

costa	SW_Statistic	Probabilidad
NO	0.9949525	0.8529651
SI	0.9580179	0.2422331

Como ambas probabilidades son mayores a 0.05 (no se rechaza la normalidad en ninguno de los dos grupos), corresponde usar la prueba t para comparar las medias.