#1 Exploración de la dependiente

# Published Google Sheets CSV export that is read into DataSanMartin below.
linkSM <- 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQIRb3zqxKyK005A99409jwVU5D2kLNKfWuhac_7xrBwUW8cX0HUISfbgZTJZZy6CSgkTd9Kgw08GBk/pub?gid=465018582&single=true&output=csv'

# Keep text columns as character vectors. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
DataSanMartin <- read.csv(linkSM, stringsAsFactors = FALSE)

# Inspect the structure of what was loaded
str(DataSanMartin)
## 'data.frame':    77 obs. of  3 variables:
##  $ Esperanza: num  72.5 74.7 74.6 73.9 73.4 ...
##  $ Años     : num  7.76 6.54 5.78 4.58 5.55 5.34 7.59 4.71 4.7 5.1 ...
##  $ Ingreso  : num  953 1011 990 458 689 ...

##1.1 Exploracion gráfica

library(ggplot2)

# Histogram of Esperanza using 7 bins; base1 is reused later in the script.
base1 <- ggplot(data = DataSanMartin, mapping = aes(x = Esperanza))
histNum <- base1 + geom_histogram(bins = 7)
histNum

# Atípicos

# Boxplot of Esperanza, flipped so the box lies horizontally.
# The plot object is stored in `box` because it is extended further down.
base2 <- ggplot(data = DataSanMartin, mapping = aes(y = Esperanza))
box <- base2 +
  geom_boxplot() +
  coord_flip()

box

#1.2 Exploración con estadígrafos

# Min, quartiles, median, mean and max of Esperanza
summary(DataSanMartin[["Esperanza"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   67.10   71.61   73.09   73.27   74.71   79.07

# Podríamos graficarlos en el boxplot:

library(ggplot2)

# Summary statistics of Esperanza as a plain numeric vector, rounded to
# two decimals
estadigrafos <- round(
  as.vector(summary(DataSanMartin[["Esperanza"]])),
  digits = 2
)

# Redraw the boxplot with axis breaks placed at those statistics
box + scale_y_continuous(breaks = estadigrafos)

library(DescTools)
# Skewness of Esperanza with a 95% confidence interval. conf.level is the
# confidence level of the interval, so the former value of 0.05 requested a
# 5% interval rather than a 95% one (the Gini() call in this script uses 0.95).
Skew(DataSanMartin$Esperanza, conf.level = 0.95)
## skew = 0.10648855 (point estimate; it does not depend on conf.level)
## The bounds previously recorded here (lwr.ci = 0.08515691,
## upr.ci = 0.11197181) were produced with conf.level = 0.05;
## re-run this call to record the bounds of the 95% interval.
# Interquartile range of Esperanza
IQR(DataSanMartin[["Esperanza"]])
## [1] 3.1
# Third quartile, picked out of summary() by its name
q3 <- as.numeric(summary(DataSanMartin[["Esperanza"]])["3rd Qu."])

# Upper threshold: 1.5 interquartile ranges above the third quartile
umbral <- 1.5 * IQR(DataSanMartin[["Esperanza"]]) + q3
umbral
## [1] 79.36
# Rows whose Esperanza lies above the threshold
DataSanMartin[DataSanMartin[["Esperanza"]] > umbral, ]
## [1] Esperanza Años      Ingreso  
## <0 rows> (or 0-length row.names)
# Gini coefficient of Esperanza with a 95% confidence interval
Gini(DataSanMartin[["Esperanza"]], conf.level = 0.95)
##       gini     lwr.ci     upr.ci 
## 0.01864240 0.01607253 0.02221314
library(gglorenz) # must be installed beforehand
## Registered S3 methods overwritten by 'ineq':
##   method   from     
##   plot.Lc  DescTools
##   lines.Lc DescTools
# Lorenz curve of Esperanza in red over a dashed reference line, drawn with
# a fixed aspect ratio
base1 +
  gglorenz::stat_lorenz(color = "red") +
  geom_abline(linetype = "dashed") +
  coord_fixed() +
  labs(
    title = "Relación Distrito / Esperanza de vida",
    x = "% Distrito",
    y = "% Esperanza de vida",
    caption = "Fuente: PNUD"
  )

#2. Exploracion bivariada

library(ggpubr)

# Scatter plot of Esperanza against Años, annotated with the Pearson
# correlation coefficient
E1 <- ggscatter(
  DataSanMartin,
  x = "Años",
  y = "Esperanza",
  cor.method = "pearson", # spearman?
  cor.coef = TRUE
)

E1

# Same plot with Ingreso on the horizontal axis
E2 <- ggscatter(
  DataSanMartin,
  x = "Ingreso",
  y = "Esperanza",
  cor.method = "pearson", # spearman?
  cor.coef = TRUE
)

E2

#3. Regresión

##3.1 Analisis de la dependiente

###3.1.1 Verificar normalidad

# Shapiro-Wilk normality test on Esperanza. The "data:" line of the printed
# result echoes the argument expression exactly as it is written here.
shapiro.test(DataSanMartin$Esperanza)
## 
##  Shapiro-Wilk normality test
## 
## data:  DataSanMartin$Esperanza
## W = 0.99311, p-value = 0.956

###3.1.2 Verificar asimetria y si hay atipicos

# Dependent variable: life expectancy ('Esperanza')

library(DescTools) # must be installed beforehand

# Skewness coefficient of Esperanza (point estimate only, no interval)
Skew(DataSanMartin[["Esperanza"]])
## [1] 0.1064886

# Histograma

library(ggplot2)

# Histogram of Esperanza with 20 bins (original note: always use 20)
base <- ggplot(DataSanMartin, aes(x = Esperanza))
base + geom_histogram(bins = 20)

# Diagrama de cajas

# Boxplot of Esperanza, flipped so the box lies horizontally
base <- ggplot(DataSanMartin, aes(y = Esperanza))
base +
  geom_boxplot() +
  coord_flip()

##3.2 Analisis bivariado

###3.2.1 Hipotesis 1

# Hypothesis 1: Spearman rank correlation between Esperanza and Años.
# The method name is written in full instead of relying on partial matching,
# and FALSE replaces the reassignable shorthand F; exact = FALSE asks for the
# approximate rather than the exact p-value.
HT1 <- formula(~ Esperanza + Años)
cor.test(HT1, data = DataSanMartin, method = "spearman", exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  Esperanza and Años
## S = 79084, p-value = 0.7328
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.03953699
library(ggpubr)

# Scatter plot of Esperanza against Años with the Spearman coefficient, a
# blue regression line and its confidence band shaded in light gray
ggscatter(
  DataSanMartin,
  x = "Años",
  y = "Esperanza",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman"
)
## `geom_smooth()` using formula 'y ~ x'

###3.2.2 Hipotesis 2

# Hypothesis 2: Spearman rank correlation between Esperanza and Ingreso.
# The method name is written in full instead of relying on partial matching,
# and FALSE replaces the reassignable shorthand F; exact = FALSE asks for the
# approximate rather than the exact p-value.
HT2 <- formula(~ Esperanza + Ingreso)
cor.test(HT2, data = DataSanMartin, method = "spearman", exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  Esperanza and Ingreso
## S = 75904, p-value = 0.9844
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## 0.002260942
library(ggpubr)

# Scatter plot of Esperanza against Ingreso with the Spearman coefficient, a
# blue regression line and its confidence band shaded in light gray
ggscatter(
  DataSanMartin,
  x = "Ingreso",
  y = "Esperanza",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman"
)
## `geom_smooth()` using formula 'y ~ x'

##3.3 Análisis de regresión

# Columns available for the model
colnames(DataSanMartin)
## [1] "Esperanza" "Años"      "Ingreso"
# Linear model of Esperanza on every other column of the data frame
RegresionSanMartin <- lm(Esperanza ~ ., data = DataSanMartin)
summary(RegresionSanMartin)
## 
## Call:
## lm(formula = Esperanza ~ ., data = DataSanMartin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2331 -1.7170 -0.2224  1.5375  5.9838 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 74.0609948  1.4609424  50.694   <2e-16 ***
## Años        -0.1477404  0.3885143  -0.380    0.705    
## Ingreso      0.0001361  0.0018599   0.073    0.942    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.44 on 74 degrees of freedom
## Multiple R-squared:  0.004473,   Adjusted R-squared:  -0.02243 
## F-statistic: 0.1663 on 2 and 74 DF,  p-value: 0.8471