# 1. Exploration of the dependent variable
# URL of the published Google Sheets CSV export that is read into DataSanMartin.
linkSM <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vQIRb3zqxKyK005A99409jwVU5D2kLNKfWuhac_7xrBwUW8cX0HUISfbgZTJZZy6CSgkTd9Kgw08GBk/pub?gid=465018582&single=true&output=csv"
# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned, whereas FALSE is a reserved word.
DataSanMartin <- read.csv(linkSM, stringsAsFactors = FALSE)
# What do we have?
str(DataSanMartin)
## 'data.frame': 77 obs. of 3 variables:
## $ Esperanza: num 72.5 74.7 74.6 73.9 73.4 ...
## $ Años : num 7.76 6.54 5.78 4.58 5.55 5.34 7.59 4.71 4.7 5.1 ...
## $ Ingreso : num 953 1011 990 458 689 ...
## 1.1 Graphical exploration
library(ggplot2)
# Histogram of Esperanza with 7 bins; typing the object name prints the plot.
base1 <- ggplot(DataSanMartin, aes(x = Esperanza))
histNum <- base1 + geom_histogram(bins = 7)
histNum
# Outliers
# Boxplot of Esperanza; coord_flip() lays the box out horizontally.
base2 <- ggplot(DataSanMartin, aes(y = Esperanza))
box <- base2 + geom_boxplot() + coord_flip()
box
# 1.2 Exploration with summary statistics
summary(DataSanMartin$Esperanza)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 67.10 71.61 73.09 73.27 74.71 79.07
# We could plot them on the boxplot:
library(ggplot2)
# The six summary values, rounded to 2 decimals, become the axis breaks.
estadigrafos <- round(as.vector(summary(DataSanMartin$Esperanza)), 2)
# `box` is the flipped boxplot built earlier with Esperanza mapped to y, so
# the breaks are set on the y scale.
box + scale_y_continuous(breaks = estadigrafos)
library(DescTools)
# Skewness of Esperanza with a 95% confidence interval, the same level used
# for the Gini() call in this script. The previous value, conf.level = 0.05,
# requested a 5% interval, which is far too narrow to be informative.
Skew(DataSanMartin$Esperanza, conf.level = 0.95)
## NOTE: the output below was recorded with conf.level = 0.05. The skew
## estimate is unaffected, but lwr.ci / upr.ci are not 95% bounds; re-run to
## refresh them.
## skew lwr.ci upr.ci
## 0.10648855 0.08515691 0.11197181
# Interquartile range of Esperanza.
IQR(DataSanMartin$Esperanza)
## [1] 3.1
# Third quartile, picked from summary() by its label.
q3 <- as.numeric(summary(DataSanMartin$Esperanza)[["3rd Qu."]])
# Threshold: 1.5 interquartile ranges above the third quartile.
umbral <- 1.5 * IQR(DataSanMartin$Esperanza) + q3
umbral
## [1] 79.36
# Rows whose Esperanza exceeds the threshold (none in the recorded output).
DataSanMartin[DataSanMartin$Esperanza > umbral, ]
## [1] Esperanza Años Ingreso
## <0 rows> (or 0-length row.names)
# Gini coefficient of Esperanza with a 95% confidence interval.
Gini(DataSanMartin$Esperanza, conf.level = 0.95)
## gini lwr.ci upr.ci
## 0.01864240 0.01607253 0.02221314
library(gglorenz) # install first
## Registered S3 methods overwritten by 'ineq':
## method from
## plot.Lc DescTools
## lines.Lc DescTools
# Lorenz curve of Esperanza (red) drawn on top of `base1`, with a dashed
# reference line from geom_abline() and a fixed aspect ratio.
base1 +
  gglorenz::stat_lorenz(color = "red") +
  geom_abline(linetype = "dashed") +
  coord_fixed() +
  labs(
    title = "Relación Distrito / Esperanza de vida",
    x = "% Distrito",
    y = "% Esperanza de vida",
    caption = "Fuente: PNUD"
  )
# 2. Bivariate exploration
library(ggpubr)
# Esperanza against Años, annotated with the Pearson correlation coefficient.
E1 <- ggscatter(
  data = DataSanMartin,
  x = "Años",
  y = "Esperanza",
  cor.method = "pearson", # spearman?
  cor.coef = TRUE
)
E1
library(ggpubr)
# Esperanza against Ingreso, annotated the same way.
E2 <- ggscatter(
  data = DataSanMartin,
  x = "Ingreso",
  y = "Esperanza",
  cor.method = "pearson", # spearman?
  cor.coef = TRUE
)
E2
# 3. Regression
## 3.1 Analysis of the dependent variable
### 3.1.1 Check normality
# Shapiro-Wilk test on Esperanza. The argument expression is echoed verbatim
# in the "data:" line of the output, so it is kept exactly as written.
# The recorded p-value (0.956) does not reject normality at the usual levels.
shapiro.test(DataSanMartin$Esperanza)
##
## Shapiro-Wilk normality test
##
## data: DataSanMartin$Esperanza
## W = 0.99311, p-value = 0.956
### 3.1.2 Check skewness and whether there are outliers
# Dependent variable: life expectancy ('Esperanza')
library(DescTools) # install beforehand
# Point estimate of the skewness (no confidence interval requested).
Skew(DataSanMartin$Esperanza)
## [1] 0.1064886
# Histogram
library(ggplot2)
base <- ggplot(data = DataSanMartin, aes(x = Esperanza))
base + geom_histogram(bins = 20) # use 20 always
# Boxplot
# `base` is rebuilt with Esperanza on y; coord_flip() lays the box out
# horizontally.
base <- ggplot(data = DataSanMartin, aes(y = Esperanza))
base + geom_boxplot() + coord_flip()
## 3.2 Bivariate analysis
### 3.2.1 Hypothesis 1
# One-sided formula naming the two variables to correlate.
HT1 <- formula(~ Esperanza + Años)
# Spearman rank correlation. The method name and FALSE are spelled out in
# full instead of relying on partial matching ("spearm") and on the
# reassignable shorthand `F`. exact = FALSE: no exact p-value is computed.
cor.test(HT1, data = DataSanMartin, method = "spearman", exact = FALSE)
##
## Spearman's rank correlation rho
##
## data: Esperanza and Años
## S = 79084, p-value = 0.7328
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.03953699
library(ggpubr)
# Scatter plot with the Spearman coefficient, a blue regression line and its
# confidence band in light gray.
ggscatter(DataSanMartin,
  x = "Años", y = "Esperanza",
  cor.coef = TRUE,
  cor.method = "spearman",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE
)
## `geom_smooth()` using formula 'y ~ x'
### 3.2.2 Hypothesis 2
# One-sided formula naming the two variables to correlate.
HT2 <- formula(~ Esperanza + Ingreso)
# Spearman rank correlation. The method name and FALSE are spelled out in
# full instead of relying on partial matching ("spearm") and on the
# reassignable shorthand `F`. exact = FALSE: no exact p-value is computed.
cor.test(HT2, data = DataSanMartin, method = "spearman", exact = FALSE)
##
## Spearman's rank correlation rho
##
## data: Esperanza and Ingreso
## S = 75904, p-value = 0.9844
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.002260942
library(ggpubr)
# Scatter plot with the Spearman coefficient, a blue regression line and its
# confidence band in light gray.
ggscatter(DataSanMartin,
  x = "Ingreso", y = "Esperanza",
  cor.coef = TRUE,
  cor.method = "spearman",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE
)
## `geom_smooth()` using formula 'y ~ x'
## 3.3 Regression analysis
# Columns available for the model.
colnames(DataSanMartin)
## [1] "Esperanza" "Años" "Ingreso"
# Linear model of Esperanza on every other column (`.` on the right-hand side).
RegresionSanMartin <- lm(formula = Esperanza ~ ., data = DataSanMartin)
summary(RegresionSanMartin)
##
## Call:
## lm(formula = Esperanza ~ ., data = DataSanMartin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2331 -1.7170 -0.2224 1.5375 5.9838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.0609948 1.4609424 50.694 <2e-16 ***
## Años -0.1477404 0.3885143 -0.380 0.705
## Ingreso 0.0001361 0.0018599 0.073 0.942
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.44 on 74 degrees of freedom
## Multiple R-squared: 0.004473, Adjusted R-squared: -0.02243
## F-statistic: 0.1663 on 2 and 74 DF, p-value: 0.8471