# Librerías
if (!require("dplyr")) install.packages("dplyr")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("rio")) install.packages("rio")
if (!require("ggpubr")) install.packages("ggpubr")
library(dplyr)
library(ggplot2)
library(rio)
library(ggpubr)

# Cargar base de datos
base <- import("https://github.com/azula89/bases/raw/main/iiee_rur_con.xlsx")

# Crear variables
base <- base %>%
  mutate(
    pct_rural = (rural / n) * 100,
    pct_internet = (internet / n) * 100
  )

# Análisis descriptivo
summary(base[, c("pct_rural", "pct_internet")])
##    pct_rural       pct_internet   
##  Min.   :  0.00   Min.   : 4.467  
##  1st Qu.: 64.52   1st Qu.:11.905  
##  Median : 87.57   Median :21.721  
##  Mean   : 75.29   Mean   :25.939  
##  3rd Qu.: 94.77   3rd Qu.:34.222  
##  Max.   :100.00   Max.   :87.850
ggplot(base, aes(x = pct_rural)) + geom_histogram(fill = "skyblue", bins = 25)

ggplot(base, aes(x = pct_internet)) + geom_histogram(fill = "salmon", bins = 25)

# Correlación nacional
cor.test(base$pct_rural, base$pct_internet)
## 
##  Pearson's product-moment correlation
## 
## data:  base$pct_rural and base$pct_internet
## t = -21.403, df = 194, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8754658 -0.7909086
## sample estimates:
##        cor 
## -0.8381536
# Gráfico de correlación
ggscatter(base, x = "pct_rural", y = "pct_internet",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "pearson")

# Modelo nacional
modelo <- lm(pct_internet ~ pct_rural, data = base)
summary(modelo)
## 
## Call:
## lm(formula = pct_internet ~ pct_rural, data = base)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.457  -6.622  -1.015   5.318  42.913 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 65.97668    1.98697    33.2   <2e-16 ***
## pct_rural   -0.53175    0.02484   -21.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.38 on 194 degrees of freedom
## Multiple R-squared:  0.7025, Adjusted R-squared:  0.701 
## F-statistic: 458.1 on 1 and 194 DF,  p-value: < 2.2e-16
predict(modelo, data.frame(pct_rural = 60), interval = "confidence")
##       fit      lwr      upr
## 1 34.0717 32.55258 35.59081
ggplot(base, aes(x = pct_rural, y = pct_internet)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "red") +
  geom_vline(xintercept = 60, linetype = "dashed")

# Filtrado Amazonía
amazonia <- c("AMAZONAS", "LORETO", "MADRE DE DIOS", "SAN MARTIN", "UCAYALI")
base_amz <- base %>% filter(DPTO %in% amazonia)

# Correlación y modelo Amazonía
cor.test(base_amz$pct_rural, base_amz$pct_internet)
## 
##  Pearson's product-moment correlation
## 
## data:  base_amz$pct_rural and base_amz$pct_internet
## t = -4.3915, df = 30, p-value = 0.000129
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7997807 -0.3540713
## sample estimates:
##        cor 
## -0.6255351
modelo_amz <- lm(pct_internet ~ pct_rural, data = base_amz)
summary(modelo_amz)
## 
## Call:
## lm(formula = pct_internet ~ pct_rural, data = base_amz)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.211 -4.651 -0.742  2.363 35.590 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  58.8206     9.2450   6.362 5.09e-07 ***
## pct_rural    -0.4760     0.1084  -4.391 0.000129 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.888 on 30 degrees of freedom
## Multiple R-squared:  0.3913, Adjusted R-squared:  0.371 
## F-statistic: 19.28 on 1 and 30 DF,  p-value: 0.000129
predict(modelo_amz, data.frame(pct_rural = 85), interval = "confidence")
##        fit      lwr      upr
## 1 18.36083 15.50909 21.21256
# Visualización modelo Amazonía
ggplot(base_amz, aes(x = pct_rural, y = pct_internet)) +
  geom_point(color = "darkgreen") +
  geom_smooth(method = "lm", color = "orange") +
  geom_vline(xintercept = 85, linetype = "dashed")