Cargar librerías

library(tidyr)
library(tidyverse)
library(sf)
library(sp)
library(dplyr)
library(exactextractr)
library(units)
library(spatstat)
library(writexl)
library(dplyr)
library(readxl)
library(car)
library(ggplot2)
library(ggfortify)
library(patchwork)
library(ggrepel)
library(RColorBrewer)
library(ggrepel)

1. Escalonamiento 1: Departamento a provincia

Cálculo del IE provincial
Escalonamiento mediante el método de Geary y Stark (2002).

# Paso 0 — calcular POB total por provincia desde CCPP
pob_provincial2 <- POB_cp %>%
  st_drop_geometry() %>%
  mutate(idprov_6 = paste0(
    substr(str_pad(as.character(idcp), 8, pad = "0"), 1, 4),  # primeros 4 = dpto+prov
    "00"  # completar a 6 dígitos como en idprov
  )) %>%
  group_by(idprov_6) %>%
  summarise(POB_total = sum(POB, na.rm = TRUE))
# Paso 1: POB total por departamento (suma de provincias)
pob_departamental <- pob_provincial2 %>%
  mutate(iddep_2 = substr(idprov_6, 1, 2)) %>%
  group_by(iddep_2) %>%
  summarise(POB_dep = sum(POB_total, na.rm = TRUE))
# Paso 2: unir todo a Ingresos_prov y calcular ingreso ponderado
Ingresos_prov <- Ingresos_prov %>%
  mutate(
    idprov_6 = substr(str_pad(as.character(idprov), 6, pad = "0"), 1, 6),
    iddep_2  = substr(idprov_6, 1, 2)
  ) %>%
  left_join(pob_provincial2,    by = "idprov_6") %>%
  left_join(pob_departamental, by = "iddep_2") %>%
  mutate(
    # Ingreso ponderado = Ingresos * (POB provincia / POB departamento)
    Ingreso_pond = Ingresos * (POB_total / POB_dep)
  )
# Paso 3: calcular total de Ingreso_pond por departamento
ingresos_total_dep <- Ingresos_prov %>%
  st_drop_geometry() %>%
  group_by(iddep_2) %>%
  summarise(total_ingreso_dep = sum(Ingreso_pond, na.rm = TRUE))
# Paso 4: calcular PBI provincial con Geary-Stark corregido
Ingresos_prov2 <- Ingresos_prov %>%
  select(-any_of(c("total_ingreso_dep", "PBI_dep", "PBI_provincial"))) %>%
  left_join(ingresos_total_dep, by = "iddep_2") %>%
  left_join(st_drop_geometry(PBI_dep[, c("iddpto", "PBI_dep")]),
            by = c("iddep_2" = "iddpto")) %>%
  mutate(PBI_prov = PBI_dep * (Ingreso_pond / total_ingreso_dep))

Revisión del resultado del PBI provincial
La revisión se realizó según los rangos poblacionales.

tabla_26 <- PBI_prov7 %>%
  st_drop_geometry() %>%
  mutate(rango_pob = case_when(
    POB_ttl >= 2860    & POB_ttl < 10000    ~ "[2,860 -- 10,000>",
    POB_ttl >= 10000   & POB_ttl < 50000    ~ "[10,000 -- 50,000>",
    POB_ttl >= 50000   & POB_ttl < 100000   ~ "[50,000 -- 100,000>",
    POB_ttl >= 100000  & POB_ttl < 1000000  ~ "[100,000 -- 1'000,000>",
    POB_ttl >= 1000000                      ~ "[1'000,000 -- 8'574,974>"
  )) %>%
  group_by(rango_pob) %>%
  summarise(
    N_prov      = n(),
    PBI_mediana = median(PBI_prov, na.rm = TRUE),
    PBI_media   = mean(PBI_prov, na.rm = TRUE),
    PBI_total   = sum(PBI_prov, na.rm = TRUE)
  ) %>%
  arrange(factor(rango_pob, levels = c(
    "[2,860 -- 10,000>",
    "[10,000 -- 50,000>",
    "[50,000 -- 100,000>",
    "[100,000 -- 1'000,000>",
    "[1'000,000 -- 8'574,974>"
  )))

print(tabla_26)
## # A tibble: 5 × 5
##   rango_pob                N_prov PBI_mediana  PBI_media  PBI_total
##   <chr>                     <int>       <dbl>      <dbl>      <dbl>
## 1 [2,860 -- 10,000>            12      82779.     79618.    955412.
## 2 [10,000 -- 50,000>           73     240445.    273422.  19959778.
## 3 [50,000 -- 100,000>          56     661205.    839357.  47004000.
## 4 [100,000 -- 1'000,000>       53    2530457.   3571441. 189286356.
## 5 [1'000,000 -- 8'574,974>      2  105276584. 105276584. 210553168.

2. Escalonamiento 2: Provincia a distrito


Coeficientes a nivel distrital (S&P, 2022)
Para el cálculo del índice de producción se utilizan los coeficientes estimados por Seminario y Palomino (2022), dado que su análisis multitemporal (1993-2018) proporciona mayor robustez y rigor estadístico que una estimación de corte transversal para un único año.

# Coeficientes de la función cúbica
a0 <- 0        # intercepto (no reportado)
a1 <- 0.2853   # ln(luz)
a2 <- 0.0449   # ln(luz)^2
a3 <- 0.0028   # ln(luz)^3
a4 <- 0.8475   # ln(densidad)

Cálculo del Índice de producción y el Factor de distribución

# Paso 1 — Calcular índice PBI distrital
Den_rad6 <- Den_rad6 %>%
  mutate(
    area_km2 = area / 1e6,
    RADmean  = ifelse(RADmean <= 0 | is.na(RADmean), NA_real_, RADmean),
    DENSIDAD = ifelse(DENSIDAD <= 0 | is.na(DENSIDAD), NA_real_, DENSIDAD),
    ln_luz   = log(RADmean),      #luminosidad promedio por píxel, según S&P
    ln_dens  = log(DENSIDAD),
    ln_luz2  = ln_luz^2,
    ln_luz3  = ln_luz^3,
    # estimación ln(PIB/km²)
    ln_pib_km2_hat = a1*ln_luz + a2*ln_luz2 + a3*ln_luz3 + a4*ln_dens,   #índice de producción
    # recuperar PIB absoluto
    PBI_dtbucion_km2 = exp(ln_pib_km2_hat),      
    PBI_dtbucion = PBI_dtbucion_km2 * area_km2   #factor de distribución
  )
# Paso 2 — extraer clave provincial desde iddist (primeros 4 dígitos de 6)
Den_rad6 <- Den_rad6 %>%
  mutate(idprov_4 = substr(str_pad(as.character(iddist), 6, pad = "0"), 1, 4))
# Paso 3 — extraer clave provincial desde idprov (primeros 4 dígitos de 6)
PBI_prov7 <- PBI_prov7 %>%
  mutate(idprov_4 = substr(str_pad(as.character(idprov), 6, pad = "0"), 1, 4))
# Verificar
Den_rad6 %>%
  st_drop_geometry() %>%
  filter(idprov_4 == "1605") %>%
  select(nombdist, RADmean, DENSIDAD, PBI_dtbucion) %>%
  head(15)
# Paso 4 — sumar PBI_dtbucion por provincia 
total_indice_prov <- Den_rad6 %>%
  st_drop_geometry() %>%
  group_by(idprov_4) %>%
  summarise(total_dist_abs = sum(PBI_dtbucion, na.rm = TRUE))
# Paso 5 — unir y calcular PBI distrital escalado a nivel provincial
Den_rad6 <- Den_rad6 %>%
  select(-any_of(c("total_dist_abs", "PBI_prov", "PBI_dist"))) %>%
  left_join(total_indice_prov, by = "idprov_4") %>%
  left_join(st_drop_geometry(PBI_prov7[, c("idprov_4", "PBI_prov")]), by = "idprov_4") %>%
  mutate(
    PBI_prov = as.numeric(as.character(PBI_prov)),
    PBI_dist = PBI_prov * (PBI_dtbucion / total_dist_abs)
  )
# Verificar con la provincia de Yungay
Den_rad6 %>%
  st_drop_geometry() %>%
  filter(idprov_4 == "0220") %>%
  arrange(desc(PBI_dist)) %>%
  select(nombdist, PBI_dtbucion, total_dist_abs, PBI_prov, PBI_dist)

3. Escalonamiento 3: Distrito a centro poblado

names(PBI_cp_mix)
# Paso 1 — extraer 6 primeros dígitos del idcp para identificar distrito
POB_cp3 <- POB_cp3 %>%
  mutate(iddist_6 = substr(as.character(idcp), 1, 6))
# Paso 2 — sumar población por distrito
total_pob_dist <- POB_cp3 %>%
  st_drop_geometry() %>%
  group_by(iddist_6) %>%
  summarise(total_pob = sum(POB, na.rm=TRUE))
# Paso 3 — unir PBI distrital mixto
Den_rad5_simple <- PBI_cp_mix %>%
  st_drop_geometry() %>%
  mutate(iddist_6 = as.character(iddist)) %>%
  select(iddist_6, PBI_dist) %>%
  mutate(PBI_dist = as.numeric(as.character(PBI_dist)))
# Paso 4 — calcular PBI por CCPP
POB_cp3 <- POB_cp3 %>%
  left_join(total_pob_dist, by="iddist_6") %>%
  left_join(Den_rad5_simple, by="iddist_6") %>%
  mutate(
    POB      = as.numeric(as.character(POB)),
    PBI_dist = as.numeric(as.character(PBI_dist)),
    PBI_cp   = PBI_dist * (POB / total_pob)
  )
# Verificar
POB_cp3 %>%
  st_drop_geometry() %>%
  filter(substr(as.character(iddist_6), 1, 4) == "0220") %>%
  arrange(desc(PBI_cp)) %>%
  dplyr::select(CEN_POB, POB, PBI_dist, PBI_cp) %>%
  head(15)

4. Revisión de las variables y sus correlaciones

4.1. Correlación entre población e ingresos por hogar

PBI_distrital %>%
  st_drop_geometry() %>%
  filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
  ggplot(aes(x = POB, y = Ingress)) +
  geom_point(alpha = 0.3, color = "#8B2323", size = 1.5) +
  geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  labs(
    title = "Correlación entre población e ingresos por hogar",
    subtitle = "Distritos del Perú, 2017",
    x = "Población (escala logarítmica)",
    y = "Ingresos medios por hogar (escala logarítmica)",
    caption = "Fuente: INEI (2017) y PNUD (2017)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Correlación de Pearson con datos transformados logarítmicamente
cor_result <- cor.test(
  log(PBI_distrital$POB), 
  log(PBI_distrital$Ingress), 
  method = "pearson",
  use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.2968
cat("p-value:", cor_result$p.value, "\n")
## p-value: 1.97786e-39
# Calcular correlación primero
cor_result <- cor.test(
  log(PBI_distrital$POB[!is.na(PBI_distrital$Ingress) & !is.na(PBI_distrital$POB) & PBI_distrital$POB > 0 & PBI_distrital$Ingress > 0]),
  log(PBI_distrital$Ingress[!is.na(PBI_distrital$Ingress) & !is.na(PBI_distrital$POB) & PBI_distrital$POB > 0 & PBI_distrital$Ingress > 0]),
  method = "spearman"
)
# Gráfico con correlación anotada
PBI_distrital %>%
  st_drop_geometry() %>%
  filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
  ggplot(aes(x = POB, y = Ingress)) +
  geom_point(alpha = 0.3, color = "#D94231", size = 1.5) + 
  geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  annotate("text",
           x = 100000, y = 100,
           label = paste0("Pearson rho = 0.2968",
                         "\np-value < 0.001"),
           hjust = 0, size = 3.5, color = "#1A3E5C") +
  labs(
    title = "Correlación entre población e ingresos por hogar",
    subtitle = "Distritos del Perú, 2017",
    x = "Población (habitantes)",
    y = "Ingresos medios por hogar (soles)"
  ) +
  theme_minimal()

4.2. Correlación entre población e ingresos por hogar

Correlación con el coeficiente de Pearson por región natural.

PBI_distrital %>%
  st_drop_geometry() %>%
  filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
  group_by(regn_nt) %>%
  summarise(
    r = cor.test(log(POB), log(Ingress), method = "pearson")$estimate,
    p_value = cor.test(log(POB), log(Ingress), method = "pearson")$p.value,
    n = n()
  )
PBI_distrital %>%
  st_drop_geometry() %>%
  filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
  mutate(regn_nt = factor(regn_nt, levels = c(
    "COSTA",
    "SIERRA", 
    "SELVA ALTA",
    "SELVA BAJA"
  ))) %>%
  ggplot(aes(x = POB, y = Ingress, color = regn_nt)) +
  geom_point(alpha = 0.4, size = 1.5) +
  geom_smooth(method = "lm", se = TRUE,
              color = "#1A3E5C",
              fill = "grey80") +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  scale_color_manual(values = c(
    "COSTA"      = "#C8862A",
    "SIERRA"     = "#E8C98A",
    "SELVA ALTA" = "#4BA89E",
    "SELVA BAJA" = "#7BBFBF"
  )) +
  facet_wrap(~regn_nt) +
  labs(
    title = "Dispersión de los datos de población e ingresos por región natural",
    x = "Población (habitantes)",
    y = "Ingresos medios por hogar (soles)",
    color = "Región natural"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

# Correlación de Pearson con datos transformados logarítmicamente
pob_rad_clean <- pob_rad %>% 
  filter(POB > 0, RD > 0)

cor_result2 <- cor.test(
  log(pob_rad_clean$POB), 
  log(pob_rad_clean$RD), 
  method = "pearson",
  use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result2$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.4587
cat("p-value:", formatC(cor_result2$p.value, format = "e", digits = 3), "\n")
## p-value: 0.000e+00
# Correlación de Pearson con datos transformados logarítmicamente
pob_rad_clean2 <- pob_rad_dist %>% 
  filter(POB > 0, RADmean > 0)

cor_result3 <- cor.test(
  log(pob_rad_clean2$POB), 
  log(pob_rad_clean2$RADmean), 
  method = "pearson",
  use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result3$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.6136
cat("p-value:", formatC(cor_result3$p.value, format = "e", digits = 3), "\n")
## p-value: 5.970e-196

4.4. Correlación entre población, ingresos y luminosidad (RAD)

# Correlación de Pearson con datos transformados logarítmicamente
pob_rad_clean2 <- pob_rad_dist %>% 
  filter(Ingresos > 0, RADmean > 0)

cor_result3 <- cor.test(
  log(pob_rad_clean2$Ingresos), 
  log(pob_rad_clean2$RADmean), 
  method = "pearson",
  use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result3$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.4981
cat("p-value:", formatC(cor_result3$p.value, format = "e", digits = 3), "\n")
## p-value: 4.268e-118
# Gráfico 1 — POB vs RADmean
p1 <- pob_rad_dist %>%
  filter(POB > 0, RADmean > 0) %>%
  ggplot(aes(x = POB, y = RADmean)) +
  geom_point(alpha = 0.3, color = "#D94231", size = 1.5) +
  geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  annotate("text",
           x = 100, y = 40,
           label = "Pearson r = 0.6136\np-value < 0.001",
           hjust = 0, size = 3.5, color = "#1A3E5C") +
  labs(
    title = "Población vs Luminosidad promedio",
    x = "Población (habitantes)",
    y = "RAD media (nW/cm²sr)"
  ) +
  theme_minimal()

# Gráfico 2 — Ingresos vs RADmean
p2 <- pob_rad_dist %>%
  filter(Ingresos > 0, RADmean > 0) %>%
  ggplot(aes(x = Ingresos, y = RADmean)) +
  geom_point(alpha = 0.3, color = "#D94231", size = 1.5) +
  geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  annotate("text",
           x = 50, y = 40,
           label = "Pearson r = 0.4981\np-value < 0.001",
           hjust = 0, size = 3.5, color = "#1A3E5C") +
  labs(
    title = "Ingresos vs Luminosidad promedio",
    x = "Ingresos medios por hogar (soles)",
    y = "RAD media (nW/cm²sr)"
  ) +
  theme_minimal()

# Mostrar juntos
p1 + p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

4.5. Correlación entre ingresos y luminosidad (RAD) a nivel distrital

# Correlación de Pearson por región natural — Ingresos vs RADmean
pob_rad_dist %>%
  filter(Ingresos > 0, RADmean > 0) %>%
  group_by(region_nat) %>%
  summarise(
    r       = cor.test(log(Ingresos), log(RADmean), method = "pearson")$estimate,
    p_value = cor.test(log(Ingresos), log(RADmean), method = "pearson")$p.value,
    n       = n()
  )
# Correlación de Pearson por región natural — POB vs RADmean

pob_rad_dist %>%
  filter(POB > 0, RADmean > 0) %>%
  group_by(region_nat) %>%
  summarise(
    r       = cor.test(log(POB), log(RADmean), method = "pearson")$estimate,
    p_value = cor.test(log(POB), log(RADmean), method = "pearson")$p.value,
    n       = n()
  )
p1 <- pob_rad_dist %>%
  filter(POB > 0, RADmean > 0) %>%
  ggplot(aes(x = POB, y = RADmean, color = region_nat)) +
  geom_point(alpha = 0.3, size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, 
              color = "#1A3E5C",   # línea siempre azul oscuro
              fill = "grey80") +   # banda gris
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  scale_color_manual(values = c(
    "COSTA"      = "#C8862A",
    "SIERRA"     = "#E8C98A",
    "SELVA ALTA" = "#4BA89E",
    "SELVA BAJA" = "#7BBFBF"
  )) +
  facet_wrap(~region_nat) +
  labs(
    title = "Población vs Luminosidad promedio por región natural",
    x = "Población (habitantes)",
    y = "RAD media (nW/cm²sr)",
    color = "Región natural"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Mostrar juntos
p1
## `geom_smooth()` using formula = 'y ~ x'

# Gráfico 2 — Ingresos vs RADmean por región natural
p2 <- pob_rad_dist %>%
  filter(Ingresos > 0, RADmean > 0) %>%
  ggplot(aes(x = Ingresos, y = RADmean, color = region_nat)) +
  geom_point(alpha = 0.3, size = 1.5) +
  geom_smooth(method = "lm", se = TRUE,
              color = "#1A3E5C",  # línea siempre azul oscuro
              fill = "grey80") +  # banda gris
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  scale_color_manual(values = c(
    "COSTA"      = "#C8862A",
    "SIERRA"     = "#E8C98A",
    "SELVA ALTA" = "#4BA89E",
    "SELVA BAJA" = "#7BBFBF"
  )) +
  facet_wrap(~region_nat) +
  labs(
    title = "Ingresos vs Luminosidad promedio por región natural",
    x = "Ingresos medios por hogar (soles)",
    y = "RADmean (nW/cm²sr)",
    color = "Región natural"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Mostrar juntos
p2
## `geom_smooth()` using formula = 'y ~ x'

4.6. Correlación entre población, ingresos y luminosidad (RAD)a nivel de huella urbana

# Correlación de Pearson con datos transformados logarítmicamente
nnpp_datos <- nnpp %>% 
  filter(Total_pob > 0, RADmean > 0)

cor_result4 <- cor.test(
  log(nnpp_datos$Total_pob), 
  log(nnpp_datos$RADmean), 
  method = "pearson",
  use = "complete.obs"
)

# Gráfico con correlación anotada
nnpp_datos %>%
  ggplot(aes(x = Total_pob, y = RADmean)) +
  geom_point(alpha = 0.3, color = "#EF3154", size = 1.5) + 
  geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  annotate("text",
           x = 100000, y = 0.1,
           label = paste0("Pearson r = ", round(cor_result4$estimate, 4),
                          "\np-value < 0.001"),
           hjust = 0, size = 3.5, color = "#1A3E5C") +
  labs(
    title = "Correlación entre población y luminosidad",
    subtitle = "Núcleos Poblados del Perú",
    x = "Población total (habitantes)",
    y = "RADmean (nW/cm²sr)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Correlación de Pearson con datos transformados logarítmicamente por región natural
nnpp_datos <- nnpp %>% 
  filter(Total_pob > 0, RADmean > 0, !is.na(region_nat))

resultados_cor <- nnpp_datos %>%
  group_by(region_nat) %>%
  summarise(
    n = n(),
    r = round(cor.test(log(Total_pob), log(RADmean), method = "pearson", use = "complete.obs")$estimate, 4),
    p_value = formatC(cor.test(log(Total_pob), log(RADmean), method = "pearson", use = "complete.obs")$p.value, format = "e", digits = 3)
  )

print(resultados_cor)
## Simple feature collection with 4 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -81.31321 ymin: -18.29288 xmax: -68.65762 ymax: -0.09284716
## Geodetic CRS:  WGS 84
## # A tibble: 4 × 5
##   region_nat     n     r p_value                                        geometry
##   <fct>      <int> <dbl> <chr>                                <MULTIPOLYGON [°]>
## 1 COSTA       2560 0.699 0.000e+00  (((-78.20279 -9.163739, -78.20107 -9.164183…
## 2 SELVA ALTA  2654 0.686 0.000e+00  (((-74.73878 -11.36925, -74.73886 -11.36927…
## 3 SELVA BAJA  2532 0.591 3.949e-238 (((-78.21484 -4.642538, -78.2147 -4.642529,…
## 4 SIERRA      8395 0.622 0.000e+00  (((-72.12328 -13.37384, -72.12364 -13.37384…
# Gráfico — Total_pob vs RADmean por región natural
p3 <- nnpp_datos %>%
  mutate(region_nat = factor(region_nat, levels = c("COSTA", "SIERRA", "SELVA ALTA", "SELVA BAJA"))) %>%
  ggplot(aes(x = Total_pob, y = RADmean, color = region_nat)) +
  geom_point(alpha = 0.3, size = 1.5) +
  geom_smooth(method = "lm", se = TRUE,
              color = "#1A3E5C",
              fill = "grey80") +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  scale_color_manual(values = c(
    "COSTA"      = "#C8862A",
    "SIERRA"     = "#E8C98A",
    "SELVA ALTA" = "#4BA89E",
    "SELVA BAJA" = "#7BBFBF"
  )) +
  facet_wrap(~region_nat) +
  labs(
    title = "Población vs Luminosidad promedio por región natural",
    x = "Población total (hab.)",
    y = "RADmean (nW/cm²sr)",
    color = "Región natural"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

p3
## `geom_smooth()` using formula = 'y ~ x'

4.7. Correlación entre PBI provincial, ingresos y luminosidad (RAD)

# Crear objeto verificacion uniendo PBI Geary-Stark provincial con Den_rad4
verificacion <- Den_rad5 %>%
  left_join(st_drop_geometry(PBI_prov7[, c("idprov", "PBI_prov")]), by="idprov")

# Filtrar valores válidos
verificacion_limpia <- verificacion %>%
  filter(PBI_prov > 0, RADmean > 0, DENSIDAD > 0) %>%
  mutate(
    ln_PBI  = log(PBI_prov),
    ln_luz  = log(RADmean),
    ln_dens = log(DENSIDAD)
  )
# Regresión 1 — luminosidad
reg_luz <- lm(ln_PBI ~ ln_luz, data=st_drop_geometry(verificacion_limpia))
summary(reg_luz)
## 
## Call:
## lm(formula = ln_PBI ~ ln_luz, data = st_drop_geometry(verificacion_limpia))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2198 -0.6328  0.0588  0.5358  2.8737 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.0548     0.1476  101.98   <2e-16 ***
## ln_luz        1.4213     0.1048   13.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9652 on 194 degrees of freedom
## Multiple R-squared:  0.4866, Adjusted R-squared:  0.484 
## F-statistic: 183.9 on 1 and 194 DF,  p-value: < 2.2e-16

5. Revisión de residuos

# Paso 1 - crear datos_reg
datos_reg <- st_drop_geometry(verificacion_limpia)[complete.cases(
  st_drop_geometry(verificacion_limpia)[, c("ln_PBI", "ln_luz")]), ]

residuos_df <- data.frame(
  fitted   = fitted(reg_luz),
  residuos = residuals(reg_luz),
  POB      = datos_reg$POB,
  PBI      = datos_reg$PBI_prov,        
  nombprov = datos_reg$nombprov
)

residuos_df$nombprov <- as.character(datos_reg$nombprov)
# Grupos a etiquetar
top5_resid    <- residuos_df |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df |> slice_min(residuos, n = 5)
top5_pbi      <- residuos_df |> slice_max(PBI,      n = 5)  # ← nuevo

etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |> 
  distinct(nombprov, .keep_all = TRUE)  # ← evita duplicados si una provincia aparece en 2 grupos

ggplot(residuos_df, aes(x = fitted, y = residuos, size = POB)) +
  geom_point(alpha = 0.4, color = "#df4180") +
  geom_hline(yintercept = 0, color = "#1A3E5C") +
  geom_text_repel(
    data         = etiquetas,
    aes(label    = nombprov),
    size         = 3,
    color        = "#1A3E5C",
    fontface     = "bold",
    box.padding  = 0.5,
    max.overlaps = 20
  ) +
  scale_size_continuous(
    name   = "Población",
    range  = c(1, 10),
    labels = scales::comma
  ) +
  labs(
    title    = "Residuos vs valores ajustados — regresión ln(PBI) ~ ln(luz)",
    subtitle = "Provincias del Perú, 2017",
    x        = "Valores ajustados ln(PBI)",
    y        = "Residuos"
  ) +
  theme_minimal()

Regresión sin la provincia de Lima

# Regresión 2 — luminosidad sin Lima
verificacion_sinLima <- verificacion_limpia %>%
  filter(nombprov != "LIMA")

reg_luz_sinLima <- lm(ln_PBI ~ ln_luz, data = st_drop_geometry(verificacion_sinLima))
summary(reg_luz_sinLima)
## 
## Call:
## lm(formula = ln_PBI ~ ln_luz, data = st_drop_geometry(verificacion_sinLima))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.10949 -0.64076  0.05583  0.53947  2.86550 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.0243     0.1612   93.22   <2e-16 ***
## ln_luz        1.3988     0.1151   12.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9671 on 193 degrees of freedom
## Multiple R-squared:  0.4336, Adjusted R-squared:  0.4307 
## F-statistic: 147.7 on 1 and 193 DF,  p-value: < 2.2e-16
# Regresión 2 — densidad
reg_dens <- lm(ln_PBI ~ ln_dens, data=st_drop_geometry(verificacion_limpia))
summary(reg_dens)
## 
## Call:
## lm(formula = ln_PBI ~ ln_dens, data = st_drop_geometry(verificacion_limpia))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.39158 -0.71955  0.00281  0.73953  2.79484 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.53911    0.16468   70.07   <2e-16 ***
## ln_dens      0.62141    0.05249   11.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.026 on 194 degrees of freedom
## Multiple R-squared:  0.4194, Adjusted R-squared:  0.4164 
## F-statistic: 140.1 on 1 and 194 DF,  p-value: < 2.2e-16
# Paso 1 - PRIMERO crear datos_reg
datos_reg <- st_drop_geometry(verificacion_limpia)[complete.cases(
  st_drop_geometry(verificacion_limpia)[, c("ln_PBI", "ln_dens")]), ]

residuos_df <- data.frame(
  fitted   = fitted(reg_dens),
  residuos = residuals(reg_dens),
  DENSIDAD = datos_reg$DENSIDAD,
  POB = datos_reg$POB,
  PBI      = datos_reg$PBI_prov,        
  nombprov = datos_reg$nombprov
)

residuos_df$nombprov <- as.character(datos_reg$nombprov)

residuos_df$nombprov <- ifelse(
  residuos_df$nombprov == "CA�ETE", "CAÑETE", residuos_df$nombprov
)

# Grupos a etiquetar
top5_resid    <- residuos_df |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df |> slice_min(residuos, n = 5)
top5_pbi      <- residuos_df |> slice_max(PBI, n = 5) 

etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |> 
  distinct(nombprov, .keep_all = TRUE)  # evitar duplicados

ggplot(residuos_df, aes(x = fitted, y = residuos, size = POB)) +
  geom_point(alpha = 0.4, color = "#df4180") +
  geom_hline(yintercept = 0, color = "#1A3E5C") +
  geom_text_repel(
    data         = etiquetas,
    aes(label    = nombprov),
    size         = 3,
    color        = "#1A3E5C",
    fontface     = "bold",
    box.padding  = 0.5,
    max.overlaps = 20
  ) +
  scale_size_continuous(
    name   = "Población",
    range  = c(1, 10),
    labels = scales::comma
  ) +
  labs(
    title    = "Residuos vs valores ajustados — regresión ln(PBI) ~ ln(densidad)",
    subtitle = "Provincias del Perú, 2017",
    x        = "Valores ajustados ln(PBI)",
    y        = "Residuos"
  ) +
  theme_minimal()

# Paso 1 - Asignar nombres de provincias a las filas
rownames(datos_reg) <- datos_reg$nombprov

# Paso 2 - Re-ajustar el modelo 
# Volver a correr el dataframe
resid_dens <- lm(ln_PBI ~ ln_dens, data=datos_reg)

# Paso 3 -  generar los gráficos
mi_estilo <- list(
  theme_minimal(),
  theme(
    plot.title = element_text(color = "#1A3E5C", face = "bold", size = 10),
    axis.title = element_text(size = 8),
    panel.grid.minor = element_blank()
  )
)

# En label.n poner cuántos nombres (5)
# Extraer datos de Lima del modelo
lima_data <- data.frame(
  fitted   = fitted(resid_dens )[which(rownames(datos_reg) == "LIMA")],
  residuos = residuals(resid_dens )[which(rownames(datos_reg) == "LIMA")],
  label    = "LIMA"
)

p1 <- autoplot(resid_dens , which = 1, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C", 
               label.n = 5, label.repel = TRUE) + mi_estilo

p2 <- autoplot(resid_dens , which = 2, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C", 
               label.n = 3) + mi_estilo

p3 <- autoplot(resid_dens, which = 3, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C", 
               label.n = 3) + mi_estilo

p4 <- autoplot(resid_dens, which = 4, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C", 
               label.n = 3) + mi_estilo

# Paso 4 - Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
  plot_annotation(title = "Diagnóstico de residuos de la regresión lineal")

4.8. Regresión cúbica a nivel provincial

# TRANSFORMACIÓN LOGARÍTMICA
verificacion_prov <- verificacion_prov %>%
  mutate(
    ln_PBI  = log(PBI_prov),
    ln_luz  = log(RADmean),
    ln_dens = log(DENSIDAD)
  )

# VERIFICAR QUE NO HAY NAs NI -Inf por valores cero
sum(is.infinite(verificacion_prov$ln_luz))
## [1] 0
sum(is.infinite(verificacion_prov$ln_dens))
## [1] 0
# REGRESIÓN CÚBICA PROVINCIAL
reg_cubica_prov <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
                      data = verificacion_prov)
summary(reg_cubica_prov)
## 
## Call:
## lm(formula = ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens, 
##     data = verificacion_prov)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96418 -0.65753  0.01957  0.57772  3.04467 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.60923    0.46179  31.636  < 2e-16 ***
## ln_luz       1.28594    0.27049   4.754 3.92e-06 ***
## I(ln_luz^2) -0.15327    0.08249  -1.858   0.0647 .  
## I(ln_luz^3) -0.03346    0.04701  -0.712   0.4775    
## ln_dens      0.17681    0.08423   2.099   0.0371 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9183 on 191 degrees of freedom
## Multiple R-squared:  0.5424, Adjusted R-squared:  0.5328 
## F-statistic: 56.61 on 4 and 191 DF,  p-value: < 2.2e-16
# Paso 1 -  Asignar nombres al dataframe
rownames(verificacion_prov) <- verificacion_prov$nombprv

# Paso 2 - Re-ajustar el modelo
reg_cubica_prov <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
                      data = verificacion_prov)

# Paso 3 - Extraer Lima
lima_data <- data.frame(
  fitted   = fitted(reg_cubica_prov)[which(rownames(verificacion_prov) == "LIMA")],
  residuos = residuals(reg_cubica_prov)[which(rownames(verificacion_prov) == "LIMA")],
  label    = "LIMA"
)

# Paso 4 - Gráficos
p1 <- autoplot(reg_cubica_prov, which = 1, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 5, label.repel = TRUE) + mi_estilo +
  geom_text_repel(data = lima_data, aes(x = fitted, y = residuos, label = label),
                  size = 3, color = "#1A3E5C", fontface = "bold",
                  box.padding = 0.5, arrow = arrow(length = unit(0.015, "npc")))

p2 <- autoplot(reg_cubica_prov, which = 2, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 5, label.repel = TRUE) + mi_estilo

p3 <- autoplot(reg_cubica_prov, which = 3, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 5, label.repel = TRUE) + mi_estilo

p4 <- autoplot(reg_cubica_prov, which = 4, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 3, label.repel = TRUE) + mi_estilo

# Paso 5 - Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
  plot_annotation(title = "Diagnóstico de residuos — regresión cúbica provincial")

# RESIDUOS REGRESIÓN CÚBICA PROVINCIAL
datos_reg <- verificacion_prov[complete.cases(
  verificacion_prov[, c("ln_PBI", "ln_luz", "ln_dens")]), ]

residuos_df <- data.frame(
  fitted   = fitted(reg_cubica_prov),
  residuos = residuals(reg_cubica_prov),
  DENSIDAD = datos_reg$DENSIDAD,
  POB      = datos_reg$POB,        # verifica cómo se llama población
  PBI      = datos_reg$PBI_prov,        
  nombprov = as.character(datos_reg$nombprv)  # verifica nombre del campo
)
residuos_df$nombprov <- as.character(datos_reg$nombprov)
residuos_df$nombprov <- ifelse(
  residuos_df$nombprov == "CA�ETE", "CAÑETE", residuos_df$nombprov
)
# Grupos a etiquetar
top5_resid    <- residuos_df |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df |> slice_min(residuos, n = 5)
top5_pbi      <- residuos_df |> slice_max(PBI, n = 5) 
etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |> 
  distinct(nombprov, .keep_all = TRUE)  # evitar duplicados
ggplot(residuos_df, aes(x = fitted, y = residuos, size = POB)) +
  geom_point(alpha = 0.4, color = "#df4180") +
  geom_hline(yintercept = 0, color = "#1A3E5C") +
  geom_text_repel(
    data         = etiquetas,
    aes(label    = nombprov),
    size         = 3,
    color        = "#1A3E5C",
    fontface     = "bold",
    box.padding  = 0.5,
    max.overlaps = 20
  ) +
  scale_size_continuous(
    name   = "Población",
    range  = c(1, 10),
    labels = scales::comma
  ) +
  labs(
    title    = "Residuos vs valores ajustados — regresión ln(PBI) ~ ln(densidad) y ln(luminosidad)",
    subtitle = "Provincias del Perú, 2017",
    x        = "Valores ajustados ln(PBI)",
    y        = "Residuos"
  ) +
  theme_minimal()

# Ver cuánto pesa Ucayali en el modelo
verificacion_prov %>%
  filter(grepl("Ucayali", nombdep.x, ignore.case = TRUE)) %>%
  select(nombdep.x, nombprv, PBI_prov, RADmean, DENSIDAD, ln_PBI, ln_luz, ln_dens)
# Identificar observaciones influyentes
plot(reg_cubica_prov, which = 4)  # Cook's distance

# Identificar observación 29
verificacion_prov[29, ] %>%
  select(nombdep.x, nombprv, PBI_prov, RADmean, DENSIDAD, ln_PBI, ln_luz, ln_dens)
# Y las 109 y 147 también
verificacion_prov[c(29, 109, 147), ] %>%
  select(nombdep.x, nombprv, PBI_prov, RADmean, DENSIDAD, ln_PBI, ln_luz, ln_dens)

4.9. Regresión cúbica a nivel distrital

DEN_RAD_DIST <- DEN_RAD_DIST %>%
  mutate(idprov = paste0(substr(as.character(iddist), 1, 4), "00"))
head(DEN_RAD_DIST$idprov, 5)
## [1] "030200" "030400" "030400" "030200" "030200"
# ESCALONAMIENTO PROVINCIAL A DISTRITAL CON GEARY-STARK (INGRESOS)
PBI_dist_geary <- DEN_RAD_DIST %>%
  st_drop_geometry() %>%
  left_join(st_drop_geometry(PBI_prov_geary) %>% 
              select(idprov, PBI_prov), 
            by = "idprov") %>%
  group_by(idprov) %>%
  mutate(
    PBI_dist = PBI_prov * (Ingresos / sum(Ingresos, na.rm = TRUE))
  ) %>%
  ungroup()
# TRANSFORMACIÓN LOGARÍTMICA
PBI_dist_geary <- PBI_dist_geary %>%
  mutate(
    ln_PBI  = log(PBI_dist),
    ln_luz  = log(RADmean),
    ln_dens = log(DENSIDAD)
  )

# VERIFICAR
sum(is.infinite(PBI_dist_geary$ln_PBI))
## [1] 0
sum(is.infinite(PBI_dist_geary$ln_luz))
## [1] 0
sum(is.infinite(PBI_dist_geary$ln_dens))
## [1] 0
# REGRESIÓN CÚBICA DISTRITAL
reg_cubica_dist <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
                      data = PBI_dist_geary)
summary(reg_cubica_dist)
## 
## Call:
## lm(formula = ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens, 
##     data = PBI_dist_geary)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.81346 -0.74192 -0.05575  0.68740  2.97214 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.12124    0.14898  88.076  < 2e-16 ***
## ln_luz       0.92785    0.06050  15.335  < 2e-16 ***
## I(ln_luz^2) -0.22255    0.02458  -9.054  < 2e-16 ***
## I(ln_luz^3)  0.05243    0.00926   5.661 1.73e-08 ***
## ln_dens     -0.13720    0.02672  -5.134 3.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.048 on 1869 degrees of freedom
##   (17 observations deleted due to missingness)
## Multiple R-squared:  0.476,  Adjusted R-squared:  0.4748 
## F-statistic: 424.4 on 4 and 1869 DF,  p-value: < 2.2e-16
# Combinar iddist con nombre para rownames únicos y legibles
PBI_dist_geary <- PBI_dist_geary %>%
  mutate(label_dist = paste0(nombdist, " (", iddist, ")"))

rownames(PBI_dist_geary) <- PBI_dist_geary$label_dist
## Warning: Setting row names on a tibble is deprecated.
# Re-ajustar el modelo
reg_cubica_dist <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
                      data = PBI_dist_geary)

# 3. Gráficos (a nivel distrital no hay Lima provincia, hay Lima distrito)
p1 <- autoplot(reg_cubica_dist, which = 1, colour = "#df4180", alpha = 0.3, size = 1.5,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 5, label.repel = TRUE) + mi_estilo

p2 <- autoplot(reg_cubica_dist, which = 2, colour = "#df4180", alpha = 0.3, size = 1.5,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 5, label.repel = TRUE) + mi_estilo

p3 <- autoplot(reg_cubica_dist, which = 3, colour = "#df4180", alpha = 0.3, size = 1.5,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 5, label.repel = TRUE) + mi_estilo

p4 <- autoplot(reg_cubica_dist, which = 4, colour = "#df4180", alpha = 0.3, size = 2,
               smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
               label.n = 5, label.repel = TRUE) + mi_estilo

# 4. Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
  plot_annotation(title = "Diagnóstico de residuos — regresión cúbica distrital")

# Extraer datos del modelo distrital
datos_reg_dist <- PBI_dist_geary[complete.cases(
  PBI_dist_geary[, c("ln_PBI", "ln_luz", "ln_dens")]), ]

residuos_df_dist <- data.frame(
  fitted   = fitted(reg_cubica_dist),
  residuos = residuals(reg_cubica_dist),
  POB      = datos_reg_dist$POB,
  PBI      = datos_reg_dist$PBI_dist,  # ajusta el nombre del campo PBI distrital
  nombdist = as.character(datos_reg_dist$nombdist)
)

# Grupos a etiquetar
top5_resid    <- residuos_df_dist |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df_dist |> slice_min(residuos, n = 5)
top5_pbi      <- residuos_df_dist |> slice_max(PBI, n = 5)

etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |>
  distinct(nombdist, .keep_all = TRUE)

# Gráfico
ggplot(residuos_df_dist, aes(x = fitted, y = residuos, size = POB)) +
  geom_point(alpha = 0.4, color = "#df4180") +
  geom_hline(yintercept = 0, color = "#1A3E5C") +
  geom_text_repel(
    data         = etiquetas,
    aes(label    = nombdist),
    size         = 3,
    color        = "#1A3E5C",
    fontface     = "bold",
    box.padding  = 0.5,
    max.overlaps = 20
  ) +
  scale_size_continuous(
    name   = "Población",
    range  = c(1, 10),
    labels = scales::comma
  ) +
  labs(
    title    = "Diagnóstico de residuos — regresión cúbica distrital",
    subtitle = "Distritos del Perú, 2017",
    x        = "Valores ajustados ln(PBI)",
    y        = "Residuos"
  ) +
  theme_minimal()

##CÁLCULO POR RANGOS POBLACIONALES

tabla_dist <- PBI_dist_geary %>%
  st_drop_geometry() %>%
  mutate(rango_pob = case_when(
    POB >= 498.4  & POB <= 1000   ~ "498 - 1,000",
    POB >= 1001   & POB <= 5000   ~ "1,001 - 5,000",
    POB >= 5001   & POB <= 10000  ~ "5,001 - 10,000",
    POB >= 10001  & POB <= 25000  ~ "10,001 - 25,000",
    POB >= 25001  & POB <= 50000  ~ "25,001 - 50,000",
    POB >= 50001  & POB <= 100000 ~ "50,001 - 100,000",
    POB >= 100001 & POB <= 500000 ~ "100,001 - 500,000",
    POB >  500000                 ~ "Más de 500,000"
  )) %>%
  group_by(rango_pob) %>%
  summarise(
    N_distritos = n(),
    PBI_media   = mean(PBI_dist, na.rm = TRUE),
    PBI_total   = sum(PBI_dist, na.rm = TRUE)
  ) %>%
  arrange(factor(rango_pob, levels = c(
    "498 - 1,000",
    "1,001 - 5,000",
    "5,001 - 10,000",
    "10,001 - 25,000",
    "25,001 - 50,000",
    "50,001 - 100,000",
    "100,001 - 500,000",
    "Más de 500,000"
  )))

print(tabla_dist)
## # A tibble: 9 × 4
##   rango_pob         N_distritos PBI_media  PBI_total
##   <chr>                   <int>     <dbl>      <dbl>
## 1 498 - 1,000               215    74512.  16020072.
## 2 1,001 - 5,000             799    92552.  72838629.
## 3 5,001 - 10,000            325   154256.  49824765.
## 4 10,001 - 25,000           243   219005.  53218287.
## 5 25,001 - 50,000            98   560325.  54351572.
## 6 50,001 - 100,000           68  1351814.  91923336.
## 7 100,001 - 500,000          49  2241537. 109835295.
## 8 Más de 500,000              4  4027381.  16109524.
## 9 <NA>                       90    41332.   3637234.

4.9. Regresión cúbica a nivel distrital

HUELLA_dist <- st_join(
  HUELLA %>% st_transform(crs = st_crs(DEN_RAD_DIST)),
  DEN_RAD_DIST %>% select(iddist),
  join = st_intersects,
  largest = TRUE
)
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
sum(is.na(HUELLA_dist$iddist))
## [1] 2
# ELIMINAR LOS 2 NAs
HUELLA_dist <- HUELLA_dist %>%
  filter(!is.na(iddist))
# PASO 2: TRAER PBI DISTRITAL
HUELLA_dist <- HUELLA_dist %>%
  st_drop_geometry() %>%
  left_join(
    PBI_dist_geary %>% select(iddist, PBI_dist),
    by = "iddist"
  )

# PASO 3: ESCALONAR PBI A HUELLA POR POBLACIÓN
HUELLA_dist <- HUELLA_dist %>%
  group_by(iddist) %>%
  mutate(
    PBI_huella = PBI_dist * (Total_pob / sum(Total_pob, na.rm = TRUE))
  ) %>%
  ungroup()

# VERIFICAR
sum(is.na(HUELLA_dist$PBI_huella))
## [1] 84
# ELIMINAR HUELLAS SIN PBI
HUELLA_dist <- HUELLA_dist %>%
  filter(!is.na(PBI_huella))
# TRANSFORMACIÓN LOGARÍTMICA
HUELLA_dist <- HUELLA_dist %>%
  mutate(
    ln_PBI  = log(PBI_huella),
    ln_luz  = log(RADmean),
    ln_dens = log(DENpob)
  )

# VERIFICAR
sum(is.infinite(HUELLA_dist$ln_PBI))
## [1] 0
sum(is.infinite(HUELLA_dist$ln_luz))
## [1] 0
sum(is.infinite(HUELLA_dist$ln_dens))
## [1] 0
# REGRESIÓN CÚBICA HUELLA
reg_cubica_huella <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
                        data = HUELLA_dist)
summary(reg_cubica_huella)
## 
## Call:
## lm(formula = ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens, 
##     data = HUELLA_dist)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7608 -0.8529 -0.0672  0.7691  5.9780 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.938062   0.098081  80.934   <2e-16 ***
## ln_luz       1.282798   0.022985  55.810   <2e-16 ***
## I(ln_luz^2)  0.082954   0.009432   8.795   <2e-16 ***
## I(ln_luz^3) -0.063630   0.006371  -9.988   <2e-16 ***
## ln_dens      0.131743   0.014767   8.921   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.262 on 16052 degrees of freedom
## Multiple R-squared:  0.3449, Adjusted R-squared:  0.3447 
## F-statistic:  2112 on 4 and 16052 DF,  p-value: < 2.2e-16
# 1. Asignar rownames únicos combinando COD con ID
HUELLA_dist <- HUELLA_dist %>%
  mutate(label_huella = paste0(COD, " (", ID, ")"))

rownames(HUELLA_dist) <- HUELLA_dist$label_huella
## Warning: Setting row names on a tibble is deprecated.
# 2. Re-ajustar el modelo
reg_cubica_huella <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
                        data = HUELLA_dist)

# 3. Gráficos
p1 <- autoplot(reg_cubica_huella, which = 1, colour = "#df4180", alpha = 0.1, size = 0.5,
               smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo

p2 <- autoplot(reg_cubica_huella, which = 2, colour = "#df4180", alpha = 0.1, size = 0.5,
               smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo

p3 <- autoplot(reg_cubica_huella, which = 3, colour = "#df4180", alpha = 0.1, size = 0.5,
               smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo

p4 <- autoplot(reg_cubica_huella, which = 4, colour = "#df4180", alpha = 0.1, size = 0.5,
               smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo

# 4. Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
  plot_annotation(title = "Diagnóstico de residuos — regresión cúbica huella urbano-rural")

6. Revisión de colinealidad

# REGRESIÓN CÚBICA DISTRITAL
reg_lineal_dist <- lm(ln_PBI ~ ln_luz + ln_dens,
                      data = PBI_dist_geary)
summary(reg_lineal_dist)
## 
## Call:
## lm(formula = ln_PBI ~ ln_luz + ln_dens, data = PBI_dist_geary)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.13026 -0.74362 -0.08045  0.73140  3.02995 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.25690    0.12128 101.059   <2e-16 ***
## ln_luz       0.94883    0.04423  21.452   <2e-16 ***
## ln_dens     -0.03989    0.02531  -1.576    0.115    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.074 on 1871 degrees of freedom
##   (17 observations deleted due to missingness)
## Multiple R-squared:  0.4499, Adjusted R-squared:  0.4493 
## F-statistic: 765.2 on 2 and 1871 DF,  p-value: < 2.2e-16
reg_lineal_dist <- lm(ln_PBI ~ ln_luz + ln_dens, data = PBI_dist_geary)
vif(reg_lineal_dist)
##   ln_luz  ln_dens 
## 3.781857 3.781857