1. Exploración de la variable dependiente

# Published Google Sheets CSV export used as the data source for this analysis.
linkLL <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vQUWlFgjXdo-Tptb1fhFyYjDV8bS-ZC-RBV_vNPdC80ORIFBz7zt4c1nI6bizAeKogdZ9e2y22uu-1n/pub?gid=962341748&single=true&output=csv"

# Read the sheet into a data frame, keeping any text columns as character.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
Depart <- read.csv(linkLL, stringsAsFactors = FALSE)

# Inspect what we have: number of rows, column names and column types.
str(Depart)

## 'data.frame':    83 obs. of  3 variables:
##  $ Analfabetismo: num  1.5 5.3 4.4 3.9 3.7 5.7 3.5 9.8 2.8 11.1 ...
##  $ Agua         : num  3.1 16.1 5.1 42.8 15.7 7.9 18 6.3 12.5 11.6 ...
##  $ Energia      : num  0.7 6.3 1.6 6.6 4.3 3.3 4 6.7 5 10 ...

1.1 Exploración gráfica

library(ggplot2)

# Base layer mapping Analfabetismo to the x axis; reused by later plots.
base1 <- ggplot(Depart, aes(x = Analfabetismo))

# Histogram of Analfabetismo using 7 bins.
histNum <- base1 + geom_histogram(bins = 7)

# Print the plot.
histNum

Atípicos

# Base layer mapping Analfabetismo to the y axis.
base2 <- ggplot(Depart, aes(y = Analfabetismo))

# Boxplot, flipped so the box is drawn horizontally.
box <- base2 +
  geom_boxplot() +
  coord_flip()

# Print the plot.
box

1.2 Exploración con estadígrafos

# Summary statistics (min, quartiles, median, mean, max) of the column.
summary(Depart[["Analfabetismo"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.50    6.40   11.70   12.56   17.20   30.30

Podríamos graficarlos en el boxplot:

# Summary statistics as a plain numeric vector, rounded to two decimals,
# so they can be used as axis breaks.
estadigrafos <- round(as.numeric(summary(Depart$Analfabetismo)), digits = 2)

# Redraw the boxplot with axis ticks placed exactly at those statistics.
box +
  scale_y_continuous(breaks = estadigrafos)

library(DescTools)

# Sample skewness with a 95% confidence interval.
# `conf.level` is the confidence level of the interval, so 0.95 requests a
# 95% interval. This call previously passed 0.05, which requests a 5%
# interval: one so narrow that it can exclude the point estimate itself.
# NOTE(review): the console output recorded below this call was produced with
# conf.level = 0.05 and should be regenerated after this change.
Skew(Depart$Analfabetismo, conf.level = 0.95)

##      skew    lwr.ci    upr.ci 
## 0.4418950 0.4498141 0.4725494

# Interquartile range (Q3 - Q1) of the column.
IQR(Depart[["Analfabetismo"]])

## [1] 10.8

# Third quartile, taken from the "3rd Qu." entry of summary().
q3 <- as.numeric(summary(Depart$Analfabetismo)["3rd Qu."])

# Upper threshold for outliers: 1.5 interquartile ranges above the
# third quartile.
umbral <- q3 + 1.5 * IQR(Depart$Analfabetismo)

# Show the threshold.
umbral

## [1] 33.4

# Rows whose Analfabetismo value lies above the upper outlier threshold.
Depart[Depart[["Analfabetismo"]] > umbral, ]

## [1] Analfabetismo Agua          Energia      
## <0 rows> (or 0-length row.names)

# Gini coefficient of the column with a 95% confidence interval.
Gini(Depart[["Analfabetismo"]], conf.level = 0.95)

##      gini    lwr.ci    upr.ci 
## 0.3194691 0.2846361 0.3642236

# Lorenz curve of Analfabetismo (red) drawn on the base layer, together with
# a dashed diagonal reference line, on a fixed 1:1 aspect ratio.
base1 +
  gglorenz::stat_lorenz(color = "red") +
  geom_abline(linetype = "dashed") +
  coord_fixed() +
  labs(
    title = "Relación Distrito / Tasa de analfabetismo",
    x = "% Distritos",
    y = "% Tasa de analfabetismo",
    caption = "Fuente: INEI"
  )

## Registered S3 methods overwritten by 'ineq':
##   method   from     
##   plot.Lc  DescTools
##   lines.Lc DescTools

2. Exploración bivariada

Test de normalidad

library(dlookr)

## Loading required package: mice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Warning in fun(libname, pkgname): couldn't connect to display ":0"

## 
## Attaching package: 'dlookr'

## The following object is masked from 'package:base':
## 
##     transform

# Normality test for each of the first three columns of the data frame.
normality(Depart[, 1:3])

## # A tibble: 3 x 4
##   vars          statistic    p_value sample
##   <chr>             <dbl>      <dbl>  <dbl>
## 1 Analfabetismo     0.959 0.0100         83
## 2 Agua              0.888 0.00000267     83
## 3 Energia           0.879 0.00000118     83

HN 1

library(ggpubr)

# Scatter plot of Analfabetismo against Agua, annotated with the Spearman
# correlation coefficient.
# NOTE(review): the original author left a "spearman?" question mark on the
# choice of method.
HN1 <- ggscatter(
  Depart,
  x = "Agua",
  y = "Analfabetismo",
  cor.coef = TRUE,
  cor.method = "spearman"
)

# Print the plot.
HN1

HN 2

library(ggpubr)

# Scatter plot of Analfabetismo against Energia, annotated with the Spearman
# correlation coefficient.
# NOTE(review): the original author left a "spearman?" question mark on the
# choice of method.
HN2 <- ggscatter(
  Depart,
  x = "Energia",
  y = "Analfabetismo",
  cor.coef = TRUE,
  cor.method = "spearman"
)

# Print the plot.
HN2

3. Regresión

3.1 Análisis de la variable dependiente

3.1.1 Verificar normalidad

# Shapiro-Wilk normality test on the dependent variable.
# The argument expression is echoed verbatim in the printed "data:" line,
# so rewriting it would change the reported output.
shapiro.test(Depart$Analfabetismo)

## 
##  Shapiro-Wilk normality test
## 
## data:  Depart$Analfabetismo
## W = 0.95925, p-value = 0.01004

3.1.2 Verificar asimetría y si hay atípicos

# Sample skewness of the dependent variable (point estimate only).
Skew(Depart[["Analfabetismo"]])

## [1] 0.441895

Histograma

# Base layer mapping Analfabetismo to the x axis.
base <- ggplot(Depart, aes(x = Analfabetismo))

# Histogram of Analfabetismo using 20 bins.
base +
  geom_histogram(bins = 20)

Diagrama de cajas

# Base layer mapping Analfabetismo to the y axis.
base <- ggplot(Depart, aes(y = Analfabetismo))

# Boxplot, flipped so the box is drawn horizontally.
base +
  geom_boxplot() +
  coord_flip()

3.2 Análisis bivariado

3.2.1 Hipótesis 1

# One-sided formula naming the two variables to correlate.
HS1 <- formula(~ Analfabetismo + Agua)

# Spearman rank correlation test between Analfabetismo and Agua.
# The method name is spelled out in full rather than relying on partial
# matching of "spearm", and FALSE replaces the reassignable shorthand `F`.
# exact = FALSE requests the approximate (non-exact) p-value.
cor.test(HS1, data = Depart, method = "spearman", exact = FALSE)

## 
##  Spearman's rank correlation rho
## 
## data:  Analfabetismo and Agua
## S = 43400, p-value = 1.03e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5445147

# Scatter plot of Analfabetismo against Agua with a fitted regression line
# (blue) and its confidence band (light gray), annotated with the Spearman
# correlation coefficient.
ggscatter(
  Depart,
  x = "Agua",
  y = "Analfabetismo",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman"
)

## `geom_smooth()` using formula 'y ~ x'

3.2.2 Hipótesis 2

# One-sided formula naming the two variables to correlate.
HS2 <- formula(~ Analfabetismo + Energia)

# Spearman rank correlation test between Analfabetismo and Energia.
# The method name is spelled out in full rather than relying on partial
# matching of "spearm", and FALSE replaces the reassignable shorthand `F`.
# exact = FALSE requests the approximate (non-exact) p-value.
cor.test(HS2, data = Depart, method = "spearman", exact = FALSE)

## 
##  Spearman's rank correlation rho
## 
## data:  Analfabetismo and Energia
## S = 26014, p-value = 7.183e-15
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7269798

# Scatter plot of Analfabetismo against Energia with a fitted regression line
# (blue) and its confidence band (light gray), annotated with the Spearman
# correlation coefficient.
ggscatter(
  Depart,
  x = "Energia",
  y = "Analfabetismo",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman"
)

## `geom_smooth()` using formula 'y ~ x'

3.3 Análisis de regresión

# List the column names available for the model.
colnames(Depart)

## [1] "Analfabetismo" "Agua"          "Energia"

# Linear regression of Analfabetismo on every other column of Depart.
# The `.` on the right-hand side stands for all remaining columns.
DepartReg <- lm(Analfabetismo ~ ., data = Depart)

# Coefficient table, residual summary and goodness-of-fit statistics.
summary(DepartReg)

## 
## Call:
## lm(formula = Analfabetismo ~ ., data = Depart)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9272  -3.3010  -0.9079   3.7909  12.1031 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.79245    1.05765   5.477 4.86e-07 ***
## Agua         0.05972    0.02067   2.890  0.00496 ** 
## Energia      0.14035    0.02632   5.332 8.79e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.24 on 80 degrees of freedom
## Multiple R-squared:  0.4557, Adjusted R-squared:  0.4421 
## F-statistic: 33.49 on 2 and 80 DF,  p-value: 2.717e-11