Cargar librerías
library(tidyr)
library(tidyverse)
library(sf)
library(sp)
library(dplyr)
library(exactextractr)
library(units)
library(spatstat)
library(writexl)
library(dplyr)
library(readxl)
library(car)
library(ggplot2)
library(ggfortify)
library(patchwork)
library(ggrepel)
library(RColorBrewer)
library(ggrepel)
Cálculo del IE provincial
Escalonamiento mediante el método de Geary y Stark (2002).
# Paso 0 — calcular POB total por provincia desde CCPP
pob_provincial2 <- POB_cp %>%
st_drop_geometry() %>%
mutate(idprov_6 = paste0(
substr(str_pad(as.character(idcp), 8, pad = "0"), 1, 4), # primeros 4 = dpto+prov
"00" # completar a 6 dígitos como en idprov
)) %>%
group_by(idprov_6) %>%
summarise(POB_total = sum(POB, na.rm = TRUE))
# Paso 1: POB total por departamento (suma de provincias)
pob_departamental <- pob_provincial2 %>%
mutate(iddep_2 = substr(idprov_6, 1, 2)) %>%
group_by(iddep_2) %>%
summarise(POB_dep = sum(POB_total, na.rm = TRUE))
# Paso 2: unir todo a Ingresos_prov y calcular ingreso ponderado
Ingresos_prov <- Ingresos_prov %>%
mutate(
idprov_6 = substr(str_pad(as.character(idprov), 6, pad = "0"), 1, 6),
iddep_2 = substr(idprov_6, 1, 2)
) %>%
left_join(pob_provincial2, by = "idprov_6") %>%
left_join(pob_departamental, by = "iddep_2") %>%
mutate(
# Ingreso ponderado = Ingresos * (POB provincia / POB departamento)
Ingreso_pond = Ingresos * (POB_total / POB_dep)
)
# Paso 3: calcular total de Ingreso_pond por departamento
ingresos_total_dep <- Ingresos_prov %>%
st_drop_geometry() %>%
group_by(iddep_2) %>%
summarise(total_ingreso_dep = sum(Ingreso_pond, na.rm = TRUE))
# Paso 4: calcular PBI provincial con Geary-Stark corregido
Ingresos_prov2 <- Ingresos_prov %>%
select(-any_of(c("total_ingreso_dep", "PBI_dep", "PBI_provincial"))) %>%
left_join(ingresos_total_dep, by = "iddep_2") %>%
left_join(st_drop_geometry(PBI_dep[, c("iddpto", "PBI_dep")]),
by = c("iddep_2" = "iddpto")) %>%
mutate(PBI_prov = PBI_dep * (Ingreso_pond / total_ingreso_dep))
Revisión del resultado del PBI provincial
La revisión se realizó según los rangos poblacionales.
tabla_26 <- PBI_prov7 %>%
st_drop_geometry() %>%
mutate(rango_pob = case_when(
POB_ttl >= 2860 & POB_ttl < 10000 ~ "[2,860 -- 10,000>",
POB_ttl >= 10000 & POB_ttl < 50000 ~ "[10,000 -- 50,000>",
POB_ttl >= 50000 & POB_ttl < 100000 ~ "[50,000 -- 100,000>",
POB_ttl >= 100000 & POB_ttl < 1000000 ~ "[100,000 -- 1'000,000>",
POB_ttl >= 1000000 ~ "[1'000,000 -- 8'574,974>"
)) %>%
group_by(rango_pob) %>%
summarise(
N_prov = n(),
PBI_mediana = median(PBI_prov, na.rm = TRUE),
PBI_media = mean(PBI_prov, na.rm = TRUE),
PBI_total = sum(PBI_prov, na.rm = TRUE)
) %>%
arrange(factor(rango_pob, levels = c(
"[2,860 -- 10,000>",
"[10,000 -- 50,000>",
"[50,000 -- 100,000>",
"[100,000 -- 1'000,000>",
"[1'000,000 -- 8'574,974>"
)))
print(tabla_26)
## # A tibble: 5 × 5
## rango_pob N_prov PBI_mediana PBI_media PBI_total
## <chr> <int> <dbl> <dbl> <dbl>
## 1 [2,860 -- 10,000> 12 82779. 79618. 955412.
## 2 [10,000 -- 50,000> 73 240445. 273422. 19959778.
## 3 [50,000 -- 100,000> 56 661205. 839357. 47004000.
## 4 [100,000 -- 1'000,000> 53 2530457. 3571441. 189286356.
## 5 [1'000,000 -- 8'574,974> 2 105276584. 105276584. 210553168.
Coeficientes a nivel distrital (S&P,
2022)
Para el cálculo del índice de producción se utilizan los coeficientes
estimados por Seminario y Palomino (2022), dado que su análisis
multitemporal (1993-2018) proporciona mayor robustez y rigor estadístico
que una estimación de corte transversal para un único año.
# Coeficientes de la función cúbica
a0 <- 0 # intercepto (no reportado)
a1 <- 0.2853 # ln(luz)
a2 <- 0.0449 # ln(luz)^2
a3 <- 0.0028 # ln(luz)^3
a4 <- 0.8475 # ln(densidad)
Cálculo del Índice de producción y el Factor de distribución
# Paso 1 — Calcular índice PBI distrital
Den_rad6 <- Den_rad6 %>%
mutate(
area_km2 = area / 1e6,
RADmean = ifelse(RADmean <= 0 | is.na(RADmean), NA_real_, RADmean),
DENSIDAD = ifelse(DENSIDAD <= 0 | is.na(DENSIDAD), NA_real_, DENSIDAD),
ln_luz = log(RADmean), #luminosidad promedio por píxel, según S&P
ln_dens = log(DENSIDAD),
ln_luz2 = ln_luz^2,
ln_luz3 = ln_luz^3,
# estimación ln(PIB/km²)
ln_pib_km2_hat = a1*ln_luz + a2*ln_luz2 + a3*ln_luz3 + a4*ln_dens, #índice de producción
# recuperar PIB absoluto
PBI_dtbucion_km2 = exp(ln_pib_km2_hat),
PBI_dtbucion = PBI_dtbucion_km2 * area_km2 #factor de distribución
)
# Paso 2 — extraer clave provincial desde iddist (primeros 4 dígitos de 6)
Den_rad6 <- Den_rad6 %>%
mutate(idprov_4 = substr(str_pad(as.character(iddist), 6, pad = "0"), 1, 4))
# Paso 3 — extraer clave provincial desde idprov (primeros 4 dígitos de 6)
PBI_prov7 <- PBI_prov7 %>%
mutate(idprov_4 = substr(str_pad(as.character(idprov), 6, pad = "0"), 1, 4))
# Verificar
Den_rad6 %>%
st_drop_geometry() %>%
filter(idprov_4 == "1605") %>%
select(nombdist, RADmean, DENSIDAD, PBI_dtbucion) %>%
head(15)
# Paso 4 — sumar PBI_dtbucion por provincia
total_indice_prov <- Den_rad6 %>%
st_drop_geometry() %>%
group_by(idprov_4) %>%
summarise(total_dist_abs = sum(PBI_dtbucion, na.rm = TRUE))
# Paso 5 — unir y calcular PBI distrital escalado a nivel provincial
Den_rad6 <- Den_rad6 %>%
select(-any_of(c("total_dist_abs", "PBI_prov", "PBI_dist"))) %>%
left_join(total_indice_prov, by = "idprov_4") %>%
left_join(st_drop_geometry(PBI_prov7[, c("idprov_4", "PBI_prov")]), by = "idprov_4") %>%
mutate(
PBI_prov = as.numeric(as.character(PBI_prov)),
PBI_dist = PBI_prov * (PBI_dtbucion / total_dist_abs)
)
# Verificar con la provincia de Yungay
Den_rad6 %>%
st_drop_geometry() %>%
filter(idprov_4 == "0220") %>%
arrange(desc(PBI_dist)) %>%
select(nombdist, PBI_dtbucion, total_dist_abs, PBI_prov, PBI_dist)
names(PBI_cp_mix)
# Paso 1 — extraer 6 primeros dígitos del idcp para identificar distrito
POB_cp3 <- POB_cp3 %>%
mutate(iddist_6 = substr(as.character(idcp), 1, 6))
# Paso 2 — sumar población por distrito
total_pob_dist <- POB_cp3 %>%
st_drop_geometry() %>%
group_by(iddist_6) %>%
summarise(total_pob = sum(POB, na.rm=TRUE))
# Paso 3 — unir PBI distrital mixto
Den_rad5_simple <- PBI_cp_mix %>%
st_drop_geometry() %>%
mutate(iddist_6 = as.character(iddist)) %>%
select(iddist_6, PBI_dist) %>%
mutate(PBI_dist = as.numeric(as.character(PBI_dist)))
# Paso 4 — calcular PBI por CCPP
POB_cp3 <- POB_cp3 %>%
left_join(total_pob_dist, by="iddist_6") %>%
left_join(Den_rad5_simple, by="iddist_6") %>%
mutate(
POB = as.numeric(as.character(POB)),
PBI_dist = as.numeric(as.character(PBI_dist)),
PBI_cp = PBI_dist * (POB / total_pob)
)
# Verificar
POB_cp3 %>%
st_drop_geometry() %>%
filter(substr(as.character(iddist_6), 1, 4) == "0220") %>%
arrange(desc(PBI_cp)) %>%
dplyr::select(CEN_POB, POB, PBI_dist, PBI_cp) %>%
head(15)
PBI_distrital %>%
st_drop_geometry() %>%
filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
ggplot(aes(x = POB, y = Ingress)) +
geom_point(alpha = 0.3, color = "#8B2323", size = 1.5) +
geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
labs(
title = "Correlación entre población e ingresos por hogar",
subtitle = "Distritos del Perú, 2017",
x = "Población (escala logarítmica)",
y = "Ingresos medios por hogar (escala logarítmica)",
caption = "Fuente: INEI (2017) y PNUD (2017)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Correlación de Pearson con datos transformados logarítmicamente
cor_result <- cor.test(
log(PBI_distrital$POB),
log(PBI_distrital$Ingress),
method = "pearson",
use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.2968
cat("p-value:", cor_result$p.value, "\n")
## p-value: 1.97786e-39
# Calcular correlación primero
cor_result <- cor.test(
log(PBI_distrital$POB[!is.na(PBI_distrital$Ingress) & !is.na(PBI_distrital$POB) & PBI_distrital$POB > 0 & PBI_distrital$Ingress > 0]),
log(PBI_distrital$Ingress[!is.na(PBI_distrital$Ingress) & !is.na(PBI_distrital$POB) & PBI_distrital$POB > 0 & PBI_distrital$Ingress > 0]),
method = "spearman"
)
# Gráfico con correlación anotada
PBI_distrital %>%
st_drop_geometry() %>%
filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
ggplot(aes(x = POB, y = Ingress)) +
geom_point(alpha = 0.3, color = "#D94231", size = 1.5) +
geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
annotate("text",
x = 100000, y = 100,
label = paste0("Pearson rho = 0.2968",
"\np-value < 0.001"),
hjust = 0, size = 3.5, color = "#1A3E5C") +
labs(
title = "Correlación entre población e ingresos por hogar",
subtitle = "Distritos del Perú, 2017",
x = "Población (habitantes)",
y = "Ingresos medios por hogar (soles)"
) +
theme_minimal()
Correlación con el coeficiente de Pearson por región natural.
PBI_distrital %>%
st_drop_geometry() %>%
filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
group_by(regn_nt) %>%
summarise(
r = cor.test(log(POB), log(Ingress), method = "pearson")$estimate,
p_value = cor.test(log(POB), log(Ingress), method = "pearson")$p.value,
n = n()
)
PBI_distrital %>%
st_drop_geometry() %>%
filter(!is.na(Ingress) & !is.na(POB) & POB > 0 & Ingress > 0) %>%
mutate(regn_nt = factor(regn_nt, levels = c(
"COSTA",
"SIERRA",
"SELVA ALTA",
"SELVA BAJA"
))) %>%
ggplot(aes(x = POB, y = Ingress, color = regn_nt)) +
geom_point(alpha = 0.4, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#1A3E5C",
fill = "grey80") +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
scale_color_manual(values = c(
"COSTA" = "#C8862A",
"SIERRA" = "#E8C98A",
"SELVA ALTA" = "#4BA89E",
"SELVA BAJA" = "#7BBFBF"
)) +
facet_wrap(~regn_nt) +
labs(
title = "Dispersión de los datos de población e ingresos por región natural",
x = "Población (habitantes)",
y = "Ingresos medios por hogar (soles)",
color = "Región natural"
) +
theme_minimal() +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
# Correlación de Pearson con datos transformados logarítmicamente
pob_rad_clean <- pob_rad %>%
filter(POB > 0, RD > 0)
cor_result2 <- cor.test(
log(pob_rad_clean$POB),
log(pob_rad_clean$RD),
method = "pearson",
use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result2$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.4587
cat("p-value:", formatC(cor_result2$p.value, format = "e", digits = 3), "\n")
## p-value: 0.000e+00
# Correlación de Pearson con datos transformados logarítmicamente
pob_rad_clean2 <- pob_rad_dist %>%
filter(POB > 0, RADmean > 0)
cor_result3 <- cor.test(
log(pob_rad_clean2$POB),
log(pob_rad_clean2$RADmean),
method = "pearson",
use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result3$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.6136
cat("p-value:", formatC(cor_result3$p.value, format = "e", digits = 3), "\n")
## p-value: 5.970e-196
# Correlación de Pearson con datos transformados logarítmicamente
pob_rad_clean2 <- pob_rad_dist %>%
filter(Ingresos > 0, RADmean > 0)
cor_result3 <- cor.test(
log(pob_rad_clean2$Ingresos),
log(pob_rad_clean2$RADmean),
method = "pearson",
use = "complete.obs"
)
cat("Coeficiente de correlación Pearson (r):", round(cor_result3$estimate, 4), "\n")
## Coeficiente de correlación Pearson (r): 0.4981
cat("p-value:", formatC(cor_result3$p.value, format = "e", digits = 3), "\n")
## p-value: 4.268e-118
# Gráfico 1 — POB vs RADmean
p1 <- pob_rad_dist %>%
filter(POB > 0, RADmean > 0) %>%
ggplot(aes(x = POB, y = RADmean)) +
geom_point(alpha = 0.3, color = "#D94231", size = 1.5) +
geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
annotate("text",
x = 100, y = 40,
label = "Pearson r = 0.6136\np-value < 0.001",
hjust = 0, size = 3.5, color = "#1A3E5C") +
labs(
title = "Población vs Luminosidad promedio",
x = "Población (habitantes)",
y = "RAD media (nW/cm²sr)"
) +
theme_minimal()
# Gráfico 2 — Ingresos vs RADmean
p2 <- pob_rad_dist %>%
filter(Ingresos > 0, RADmean > 0) %>%
ggplot(aes(x = Ingresos, y = RADmean)) +
geom_point(alpha = 0.3, color = "#D94231", size = 1.5) +
geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
annotate("text",
x = 50, y = 40,
label = "Pearson r = 0.4981\np-value < 0.001",
hjust = 0, size = 3.5, color = "#1A3E5C") +
labs(
title = "Ingresos vs Luminosidad promedio",
x = "Ingresos medios por hogar (soles)",
y = "RAD media (nW/cm²sr)"
) +
theme_minimal()
# Mostrar juntos
p1 + p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Correlación de Pearson por región natural — Ingresos vs RADmean
pob_rad_dist %>%
filter(Ingresos > 0, RADmean > 0) %>%
group_by(region_nat) %>%
summarise(
r = cor.test(log(Ingresos), log(RADmean), method = "pearson")$estimate,
p_value = cor.test(log(Ingresos), log(RADmean), method = "pearson")$p.value,
n = n()
)
# Correlación de Pearson por región natural — POB vs RADmean
pob_rad_dist %>%
filter(POB > 0, RADmean > 0) %>%
group_by(region_nat) %>%
summarise(
r = cor.test(log(POB), log(RADmean), method = "pearson")$estimate,
p_value = cor.test(log(POB), log(RADmean), method = "pearson")$p.value,
n = n()
)
p1 <- pob_rad_dist %>%
filter(POB > 0, RADmean > 0) %>%
ggplot(aes(x = POB, y = RADmean, color = region_nat)) +
geom_point(alpha = 0.3, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#1A3E5C", # línea siempre azul oscuro
fill = "grey80") + # banda gris
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
scale_color_manual(values = c(
"COSTA" = "#C8862A",
"SIERRA" = "#E8C98A",
"SELVA ALTA" = "#4BA89E",
"SELVA BAJA" = "#7BBFBF"
)) +
facet_wrap(~region_nat) +
labs(
title = "Población vs Luminosidad promedio por región natural",
x = "Población (habitantes)",
y = "RAD media (nW/cm²sr)",
color = "Región natural"
) +
theme_minimal() +
theme(legend.position = "none")
# Mostrar juntos
p1
## `geom_smooth()` using formula = 'y ~ x'
# Gráfico 2 — Ingresos vs RADmean por región natural
p2 <- pob_rad_dist %>%
filter(Ingresos > 0, RADmean > 0) %>%
ggplot(aes(x = Ingresos, y = RADmean, color = region_nat)) +
geom_point(alpha = 0.3, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#1A3E5C", # línea siempre azul oscuro
fill = "grey80") + # banda gris
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
scale_color_manual(values = c(
"COSTA" = "#C8862A",
"SIERRA" = "#E8C98A",
"SELVA ALTA" = "#4BA89E",
"SELVA BAJA" = "#7BBFBF"
)) +
facet_wrap(~region_nat) +
labs(
title = "Ingresos vs Luminosidad promedio por región natural",
x = "Ingresos medios por hogar (soles)",
y = "RADmean (nW/cm²sr)",
color = "Región natural"
) +
theme_minimal() +
theme(legend.position = "none")
# Mostrar juntos
p2
## `geom_smooth()` using formula = 'y ~ x'
# Correlación de Pearson con datos transformados logarítmicamente
nnpp_datos <- nnpp %>%
filter(Total_pob > 0, RADmean > 0)
cor_result4 <- cor.test(
log(nnpp_datos$Total_pob),
log(nnpp_datos$RADmean),
method = "pearson",
use = "complete.obs"
)
# Gráfico con correlación anotada
nnpp_datos %>%
ggplot(aes(x = Total_pob, y = RADmean)) +
geom_point(alpha = 0.3, color = "#EF3154", size = 1.5) +
geom_smooth(method = "lm", color = "#1A3E5C", se = TRUE) +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
annotate("text",
x = 100000, y = 0.1,
label = paste0("Pearson r = ", round(cor_result4$estimate, 4),
"\np-value < 0.001"),
hjust = 0, size = 3.5, color = "#1A3E5C") +
labs(
title = "Correlación entre población y luminosidad",
subtitle = "Núcleos Poblados del Perú",
x = "Población total (habitantes)",
y = "RADmean (nW/cm²sr)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Correlación de Pearson con datos transformados logarítmicamente por región natural
nnpp_datos <- nnpp %>%
filter(Total_pob > 0, RADmean > 0, !is.na(region_nat))
resultados_cor <- nnpp_datos %>%
group_by(region_nat) %>%
summarise(
n = n(),
r = round(cor.test(log(Total_pob), log(RADmean), method = "pearson", use = "complete.obs")$estimate, 4),
p_value = formatC(cor.test(log(Total_pob), log(RADmean), method = "pearson", use = "complete.obs")$p.value, format = "e", digits = 3)
)
print(resultados_cor)
## Simple feature collection with 4 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -81.31321 ymin: -18.29288 xmax: -68.65762 ymax: -0.09284716
## Geodetic CRS: WGS 84
## # A tibble: 4 × 5
## region_nat n r p_value geometry
## <fct> <int> <dbl> <chr> <MULTIPOLYGON [°]>
## 1 COSTA 2560 0.699 0.000e+00 (((-78.20279 -9.163739, -78.20107 -9.164183…
## 2 SELVA ALTA 2654 0.686 0.000e+00 (((-74.73878 -11.36925, -74.73886 -11.36927…
## 3 SELVA BAJA 2532 0.591 3.949e-238 (((-78.21484 -4.642538, -78.2147 -4.642529,…
## 4 SIERRA 8395 0.622 0.000e+00 (((-72.12328 -13.37384, -72.12364 -13.37384…
# Gráfico — Total_pob vs RADmean por región natural
p3 <- nnpp_datos %>%
mutate(region_nat = factor(region_nat, levels = c("COSTA", "SIERRA", "SELVA ALTA", "SELVA BAJA"))) %>%
ggplot(aes(x = Total_pob, y = RADmean, color = region_nat)) +
geom_point(alpha = 0.3, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#1A3E5C",
fill = "grey80") +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
scale_color_manual(values = c(
"COSTA" = "#C8862A",
"SIERRA" = "#E8C98A",
"SELVA ALTA" = "#4BA89E",
"SELVA BAJA" = "#7BBFBF"
)) +
facet_wrap(~region_nat) +
labs(
title = "Población vs Luminosidad promedio por región natural",
x = "Población total (hab.)",
y = "RADmean (nW/cm²sr)",
color = "Región natural"
) +
theme_minimal() +
theme(legend.position = "none")
p3
## `geom_smooth()` using formula = 'y ~ x'
# Crear objeto verificacion uniendo PBI Geary-Stark provincial con Den_rad4
verificacion <- Den_rad5 %>%
left_join(st_drop_geometry(PBI_prov7[, c("idprov", "PBI_prov")]), by="idprov")
# Filtrar valores válidos
verificacion_limpia <- verificacion %>%
filter(PBI_prov > 0, RADmean > 0, DENSIDAD > 0) %>%
mutate(
ln_PBI = log(PBI_prov),
ln_luz = log(RADmean),
ln_dens = log(DENSIDAD)
)
# Regresión 1 — luminosidad
reg_luz <- lm(ln_PBI ~ ln_luz, data=st_drop_geometry(verificacion_limpia))
summary(reg_luz)
##
## Call:
## lm(formula = ln_PBI ~ ln_luz, data = st_drop_geometry(verificacion_limpia))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2198 -0.6328 0.0588 0.5358 2.8737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.0548 0.1476 101.98 <2e-16 ***
## ln_luz 1.4213 0.1048 13.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9652 on 194 degrees of freedom
## Multiple R-squared: 0.4866, Adjusted R-squared: 0.484
## F-statistic: 183.9 on 1 and 194 DF, p-value: < 2.2e-16
# Paso 1 - crear datos_reg
datos_reg <- st_drop_geometry(verificacion_limpia)[complete.cases(
st_drop_geometry(verificacion_limpia)[, c("ln_PBI", "ln_luz")]), ]
residuos_df <- data.frame(
fitted = fitted(reg_luz),
residuos = residuals(reg_luz),
POB = datos_reg$POB,
PBI = datos_reg$PBI_prov,
nombprov = datos_reg$nombprov
)
residuos_df$nombprov <- as.character(datos_reg$nombprov)
# Grupos a etiquetar
top5_resid <- residuos_df |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df |> slice_min(residuos, n = 5)
top5_pbi <- residuos_df |> slice_max(PBI, n = 5) # ← nuevo
etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |>
distinct(nombprov, .keep_all = TRUE) # ← evita duplicados si una provincia aparece en 2 grupos
ggplot(residuos_df, aes(x = fitted, y = residuos, size = POB)) +
geom_point(alpha = 0.4, color = "#df4180") +
geom_hline(yintercept = 0, color = "#1A3E5C") +
geom_text_repel(
data = etiquetas,
aes(label = nombprov),
size = 3,
color = "#1A3E5C",
fontface = "bold",
box.padding = 0.5,
max.overlaps = 20
) +
scale_size_continuous(
name = "Población",
range = c(1, 10),
labels = scales::comma
) +
labs(
title = "Residuos vs valores ajustados — regresión ln(PBI) ~ ln(luz)",
subtitle = "Provincias del Perú, 2017",
x = "Valores ajustados ln(PBI)",
y = "Residuos"
) +
theme_minimal()
Regresión sin la provincia de Lima
# Regresión 2 — luminosidad sin Lima
verificacion_sinLima <- verificacion_limpia %>%
filter(nombprov != "LIMA")
reg_luz_sinLima <- lm(ln_PBI ~ ln_luz, data = st_drop_geometry(verificacion_sinLima))
summary(reg_luz_sinLima)
##
## Call:
## lm(formula = ln_PBI ~ ln_luz, data = st_drop_geometry(verificacion_sinLima))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.10949 -0.64076 0.05583 0.53947 2.86550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.0243 0.1612 93.22 <2e-16 ***
## ln_luz 1.3988 0.1151 12.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9671 on 193 degrees of freedom
## Multiple R-squared: 0.4336, Adjusted R-squared: 0.4307
## F-statistic: 147.7 on 1 and 193 DF, p-value: < 2.2e-16
# Regresión 2 — densidad
reg_dens <- lm(ln_PBI ~ ln_dens, data=st_drop_geometry(verificacion_limpia))
summary(reg_dens)
##
## Call:
## lm(formula = ln_PBI ~ ln_dens, data = st_drop_geometry(verificacion_limpia))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.39158 -0.71955 0.00281 0.73953 2.79484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.53911 0.16468 70.07 <2e-16 ***
## ln_dens 0.62141 0.05249 11.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.026 on 194 degrees of freedom
## Multiple R-squared: 0.4194, Adjusted R-squared: 0.4164
## F-statistic: 140.1 on 1 and 194 DF, p-value: < 2.2e-16
# Paso 1 - PRIMERO crear datos_reg
datos_reg <- st_drop_geometry(verificacion_limpia)[complete.cases(
st_drop_geometry(verificacion_limpia)[, c("ln_PBI", "ln_dens")]), ]
residuos_df <- data.frame(
fitted = fitted(reg_dens),
residuos = residuals(reg_dens),
DENSIDAD = datos_reg$DENSIDAD,
POB = datos_reg$POB,
PBI = datos_reg$PBI_prov,
nombprov = datos_reg$nombprov
)
residuos_df$nombprov <- as.character(datos_reg$nombprov)
residuos_df$nombprov <- ifelse(
residuos_df$nombprov == "CA�ETE", "CAÑETE", residuos_df$nombprov
)
# Grupos a etiquetar
top5_resid <- residuos_df |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df |> slice_min(residuos, n = 5)
top5_pbi <- residuos_df |> slice_max(PBI, n = 5)
etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |>
distinct(nombprov, .keep_all = TRUE) # evitar duplicados
ggplot(residuos_df, aes(x = fitted, y = residuos, size = POB)) +
geom_point(alpha = 0.4, color = "#df4180") +
geom_hline(yintercept = 0, color = "#1A3E5C") +
geom_text_repel(
data = etiquetas,
aes(label = nombprov),
size = 3,
color = "#1A3E5C",
fontface = "bold",
box.padding = 0.5,
max.overlaps = 20
) +
scale_size_continuous(
name = "Población",
range = c(1, 10),
labels = scales::comma
) +
labs(
title = "Residuos vs valores ajustados — regresión ln(PBI) ~ ln(densidad)",
subtitle = "Provincias del Perú, 2017",
x = "Valores ajustados ln(PBI)",
y = "Residuos"
) +
theme_minimal()
# Paso 1 - Asignar nombres de provincias a las filas
rownames(datos_reg) <- datos_reg$nombprov
# Paso 2 - Re-ajustar el modelo
# Volver a correr el dataframe
resid_dens <- lm(ln_PBI ~ ln_dens, data=datos_reg)
# Paso 3 - generar los gráficos
mi_estilo <- list(
theme_minimal(),
theme(
plot.title = element_text(color = "#1A3E5C", face = "bold", size = 10),
axis.title = element_text(size = 8),
panel.grid.minor = element_blank()
)
)
# En label.n poner cuántos nombres (5)
# Extraer datos de Lima del modelo
lima_data <- data.frame(
fitted = fitted(resid_dens )[which(rownames(datos_reg) == "LIMA")],
residuos = residuals(resid_dens )[which(rownames(datos_reg) == "LIMA")],
label = "LIMA"
)
p1 <- autoplot(resid_dens , which = 1, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo
p2 <- autoplot(resid_dens , which = 2, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 3) + mi_estilo
p3 <- autoplot(resid_dens, which = 3, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 3) + mi_estilo
p4 <- autoplot(resid_dens, which = 4, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 3) + mi_estilo
# Paso 4 - Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
plot_annotation(title = "Diagnóstico de residuos de la regresión lineal")
# TRANSFORMACIÓN LOGARÍTMICA
verificacion_prov <- verificacion_prov %>%
mutate(
ln_PBI = log(PBI_prov),
ln_luz = log(RADmean),
ln_dens = log(DENSIDAD)
)
# VERIFICAR QUE NO HAY NAs NI -Inf por valores cero
sum(is.infinite(verificacion_prov$ln_luz))
## [1] 0
sum(is.infinite(verificacion_prov$ln_dens))
## [1] 0
# REGRESIÓN CÚBICA PROVINCIAL
reg_cubica_prov <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
data = verificacion_prov)
summary(reg_cubica_prov)
##
## Call:
## lm(formula = ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
## data = verificacion_prov)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96418 -0.65753 0.01957 0.57772 3.04467
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.60923 0.46179 31.636 < 2e-16 ***
## ln_luz 1.28594 0.27049 4.754 3.92e-06 ***
## I(ln_luz^2) -0.15327 0.08249 -1.858 0.0647 .
## I(ln_luz^3) -0.03346 0.04701 -0.712 0.4775
## ln_dens 0.17681 0.08423 2.099 0.0371 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9183 on 191 degrees of freedom
## Multiple R-squared: 0.5424, Adjusted R-squared: 0.5328
## F-statistic: 56.61 on 4 and 191 DF, p-value: < 2.2e-16
# Paso 1 - Asignar nombres al dataframe
rownames(verificacion_prov) <- verificacion_prov$nombprv
# Paso 2 - Re-ajustar el modelo
reg_cubica_prov <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
data = verificacion_prov)
# Paso 3 - Extraer Lima
lima_data <- data.frame(
fitted = fitted(reg_cubica_prov)[which(rownames(verificacion_prov) == "LIMA")],
residuos = residuals(reg_cubica_prov)[which(rownames(verificacion_prov) == "LIMA")],
label = "LIMA"
)
# Paso 4 - Gráficos
p1 <- autoplot(reg_cubica_prov, which = 1, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo +
geom_text_repel(data = lima_data, aes(x = fitted, y = residuos, label = label),
size = 3, color = "#1A3E5C", fontface = "bold",
box.padding = 0.5, arrow = arrow(length = unit(0.015, "npc")))
p2 <- autoplot(reg_cubica_prov, which = 2, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo
p3 <- autoplot(reg_cubica_prov, which = 3, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo
p4 <- autoplot(reg_cubica_prov, which = 4, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 3, label.repel = TRUE) + mi_estilo
# Paso 5 - Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
plot_annotation(title = "Diagnóstico de residuos — regresión cúbica provincial")
# RESIDUOS REGRESIÓN CÚBICA PROVINCIAL
datos_reg <- verificacion_prov[complete.cases(
verificacion_prov[, c("ln_PBI", "ln_luz", "ln_dens")]), ]
residuos_df <- data.frame(
fitted = fitted(reg_cubica_prov),
residuos = residuals(reg_cubica_prov),
DENSIDAD = datos_reg$DENSIDAD,
POB = datos_reg$POB, # verifica cómo se llama población
PBI = datos_reg$PBI_prov,
nombprov = as.character(datos_reg$nombprv) # verifica nombre del campo
)
residuos_df$nombprov <- as.character(datos_reg$nombprov)
residuos_df$nombprov <- ifelse(
residuos_df$nombprov == "CA�ETE", "CAÑETE", residuos_df$nombprov
)
# Grupos a etiquetar
top5_resid <- residuos_df |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df |> slice_min(residuos, n = 5)
top5_pbi <- residuos_df |> slice_max(PBI, n = 5)
etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |>
distinct(nombprov, .keep_all = TRUE) # evitar duplicados
ggplot(residuos_df, aes(x = fitted, y = residuos, size = POB)) +
geom_point(alpha = 0.4, color = "#df4180") +
geom_hline(yintercept = 0, color = "#1A3E5C") +
geom_text_repel(
data = etiquetas,
aes(label = nombprov),
size = 3,
color = "#1A3E5C",
fontface = "bold",
box.padding = 0.5,
max.overlaps = 20
) +
scale_size_continuous(
name = "Población",
range = c(1, 10),
labels = scales::comma
) +
labs(
title = "Residuos vs valores ajustados — regresión ln(PBI) ~ ln(densidad) y ln(luminosidad)",
subtitle = "Provincias del Perú, 2017",
x = "Valores ajustados ln(PBI)",
y = "Residuos"
) +
theme_minimal()
# Ver cuánto pesa Ucayali en el modelo
verificacion_prov %>%
filter(grepl("Ucayali", nombdep.x, ignore.case = TRUE)) %>%
select(nombdep.x, nombprv, PBI_prov, RADmean, DENSIDAD, ln_PBI, ln_luz, ln_dens)
# Identificar observaciones influyentes
plot(reg_cubica_prov, which = 4) # Cook's distance
# Identificar observación 29
verificacion_prov[29, ] %>%
select(nombdep.x, nombprv, PBI_prov, RADmean, DENSIDAD, ln_PBI, ln_luz, ln_dens)
# Y las 109 y 147 también
verificacion_prov[c(29, 109, 147), ] %>%
select(nombdep.x, nombprv, PBI_prov, RADmean, DENSIDAD, ln_PBI, ln_luz, ln_dens)
DEN_RAD_DIST <- DEN_RAD_DIST %>%
mutate(idprov = paste0(substr(as.character(iddist), 1, 4), "00"))
head(DEN_RAD_DIST$idprov, 5)
## [1] "030200" "030400" "030400" "030200" "030200"
# ESCALONAMIENTO PROVINCIAL A DISTRITAL CON GEARY-STARK (INGRESOS)
PBI_dist_geary <- DEN_RAD_DIST %>%
st_drop_geometry() %>%
left_join(st_drop_geometry(PBI_prov_geary) %>%
select(idprov, PBI_prov),
by = "idprov") %>%
group_by(idprov) %>%
mutate(
PBI_dist = PBI_prov * (Ingresos / sum(Ingresos, na.rm = TRUE))
) %>%
ungroup()
# TRANSFORMACIÓN LOGARÍTMICA
PBI_dist_geary <- PBI_dist_geary %>%
mutate(
ln_PBI = log(PBI_dist),
ln_luz = log(RADmean),
ln_dens = log(DENSIDAD)
)
# VERIFICAR
sum(is.infinite(PBI_dist_geary$ln_PBI))
## [1] 0
sum(is.infinite(PBI_dist_geary$ln_luz))
## [1] 0
sum(is.infinite(PBI_dist_geary$ln_dens))
## [1] 0
# REGRESIÓN CÚBICA DISTRITAL
reg_cubica_dist <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
data = PBI_dist_geary)
summary(reg_cubica_dist)
##
## Call:
## lm(formula = ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
## data = PBI_dist_geary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.81346 -0.74192 -0.05575 0.68740 2.97214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.12124 0.14898 88.076 < 2e-16 ***
## ln_luz 0.92785 0.06050 15.335 < 2e-16 ***
## I(ln_luz^2) -0.22255 0.02458 -9.054 < 2e-16 ***
## I(ln_luz^3) 0.05243 0.00926 5.661 1.73e-08 ***
## ln_dens -0.13720 0.02672 -5.134 3.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.048 on 1869 degrees of freedom
## (17 observations deleted due to missingness)
## Multiple R-squared: 0.476, Adjusted R-squared: 0.4748
## F-statistic: 424.4 on 4 and 1869 DF, p-value: < 2.2e-16
# Combinar iddist con nombre para rownames únicos y legibles
PBI_dist_geary <- PBI_dist_geary %>%
mutate(label_dist = paste0(nombdist, " (", iddist, ")"))
rownames(PBI_dist_geary) <- PBI_dist_geary$label_dist
## Warning: Setting row names on a tibble is deprecated.
# Re-ajustar el modelo
reg_cubica_dist <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
data = PBI_dist_geary)
# 3. Gráficos (a nivel distrital no hay Lima provincia, hay Lima distrito)
p1 <- autoplot(reg_cubica_dist, which = 1, colour = "#df4180", alpha = 0.3, size = 1.5,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo
p2 <- autoplot(reg_cubica_dist, which = 2, colour = "#df4180", alpha = 0.3, size = 1.5,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo
p3 <- autoplot(reg_cubica_dist, which = 3, colour = "#df4180", alpha = 0.3, size = 1.5,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo
p4 <- autoplot(reg_cubica_dist, which = 4, colour = "#df4180", alpha = 0.3, size = 2,
smooth.colour = "#1A3E5C", label.size = 3, label.colour = "#1A3E5C",
label.n = 5, label.repel = TRUE) + mi_estilo
# 4. Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
plot_annotation(title = "Diagnóstico de residuos — regresión cúbica distrital")
# Extraer datos del modelo distrital
datos_reg_dist <- PBI_dist_geary[complete.cases(
PBI_dist_geary[, c("ln_PBI", "ln_luz", "ln_dens")]), ]
residuos_df_dist <- data.frame(
fitted = fitted(reg_cubica_dist),
residuos = residuals(reg_cubica_dist),
POB = datos_reg_dist$POB,
PBI = datos_reg_dist$PBI_dist, # ajusta el nombre del campo PBI distrital
nombdist = as.character(datos_reg_dist$nombdist)
)
# Grupos a etiquetar
top5_resid <- residuos_df_dist |> slice_max(residuos, n = 5)
bottom5_resid <- residuos_df_dist |> slice_min(residuos, n = 5)
top5_pbi <- residuos_df_dist |> slice_max(PBI, n = 5)
etiquetas <- bind_rows(top5_resid, bottom5_resid, top5_pbi) |>
distinct(nombdist, .keep_all = TRUE)
# Gráfico
ggplot(residuos_df_dist, aes(x = fitted, y = residuos, size = POB)) +
geom_point(alpha = 0.4, color = "#df4180") +
geom_hline(yintercept = 0, color = "#1A3E5C") +
geom_text_repel(
data = etiquetas,
aes(label = nombdist),
size = 3,
color = "#1A3E5C",
fontface = "bold",
box.padding = 0.5,
max.overlaps = 20
) +
scale_size_continuous(
name = "Población",
range = c(1, 10),
labels = scales::comma
) +
labs(
title = "Diagnóstico de residuos — regresión cúbica distrital",
subtitle = "Distritos del Perú, 2017",
x = "Valores ajustados ln(PBI)",
y = "Residuos"
) +
theme_minimal()
##CÁLCULO POR RANGOS POBLACIONALES
tabla_dist <- PBI_dist_geary %>%
st_drop_geometry() %>%
mutate(rango_pob = case_when(
POB >= 498.4 & POB <= 1000 ~ "498 - 1,000",
POB >= 1001 & POB <= 5000 ~ "1,001 - 5,000",
POB >= 5001 & POB <= 10000 ~ "5,001 - 10,000",
POB >= 10001 & POB <= 25000 ~ "10,001 - 25,000",
POB >= 25001 & POB <= 50000 ~ "25,001 - 50,000",
POB >= 50001 & POB <= 100000 ~ "50,001 - 100,000",
POB >= 100001 & POB <= 500000 ~ "100,001 - 500,000",
POB > 500000 ~ "Más de 500,000"
)) %>%
group_by(rango_pob) %>%
summarise(
N_distritos = n(),
PBI_media = mean(PBI_dist, na.rm = TRUE),
PBI_total = sum(PBI_dist, na.rm = TRUE)
) %>%
arrange(factor(rango_pob, levels = c(
"498 - 1,000",
"1,001 - 5,000",
"5,001 - 10,000",
"10,001 - 25,000",
"25,001 - 50,000",
"50,001 - 100,000",
"100,001 - 500,000",
"Más de 500,000"
)))
print(tabla_dist)
## # A tibble: 9 × 4
## rango_pob N_distritos PBI_media PBI_total
## <chr> <int> <dbl> <dbl>
## 1 498 - 1,000 215 74512. 16020072.
## 2 1,001 - 5,000 799 92552. 72838629.
## 3 5,001 - 10,000 325 154256. 49824765.
## 4 10,001 - 25,000 243 219005. 53218287.
## 5 25,001 - 50,000 98 560325. 54351572.
## 6 50,001 - 100,000 68 1351814. 91923336.
## 7 100,001 - 500,000 49 2241537. 109835295.
## 8 Más de 500,000 4 4027381. 16109524.
## 9 <NA> 90 41332. 3637234.
HUELLA_dist <- st_join(
HUELLA %>% st_transform(crs = st_crs(DEN_RAD_DIST)),
DEN_RAD_DIST %>% select(iddist),
join = st_intersects,
largest = TRUE
)
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
sum(is.na(HUELLA_dist$iddist))
## [1] 2
# ELIMINAR LOS 2 NAs
HUELLA_dist <- HUELLA_dist %>%
filter(!is.na(iddist))
# PASO 2: TRAER PBI DISTRITAL
HUELLA_dist <- HUELLA_dist %>%
st_drop_geometry() %>%
left_join(
PBI_dist_geary %>% select(iddist, PBI_dist),
by = "iddist"
)
# PASO 3: ESCALONAR PBI A HUELLA POR POBLACIÓN
HUELLA_dist <- HUELLA_dist %>%
group_by(iddist) %>%
mutate(
PBI_huella = PBI_dist * (Total_pob / sum(Total_pob, na.rm = TRUE))
) %>%
ungroup()
# VERIFICAR
sum(is.na(HUELLA_dist$PBI_huella))
## [1] 84
# ELIMINAR HUELLAS SIN PBI
HUELLA_dist <- HUELLA_dist %>%
filter(!is.na(PBI_huella))
# TRANSFORMACIÓN LOGARÍTMICA
HUELLA_dist <- HUELLA_dist %>%
mutate(
ln_PBI = log(PBI_huella),
ln_luz = log(RADmean),
ln_dens = log(DENpob)
)
# VERIFICAR
sum(is.infinite(HUELLA_dist$ln_PBI))
## [1] 0
sum(is.infinite(HUELLA_dist$ln_luz))
## [1] 0
sum(is.infinite(HUELLA_dist$ln_dens))
## [1] 0
# REGRESIÓN CÚBICA HUELLA
reg_cubica_huella <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
data = HUELLA_dist)
summary(reg_cubica_huella)
##
## Call:
## lm(formula = ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
## data = HUELLA_dist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7608 -0.8529 -0.0672 0.7691 5.9780
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.938062 0.098081 80.934 <2e-16 ***
## ln_luz 1.282798 0.022985 55.810 <2e-16 ***
## I(ln_luz^2) 0.082954 0.009432 8.795 <2e-16 ***
## I(ln_luz^3) -0.063630 0.006371 -9.988 <2e-16 ***
## ln_dens 0.131743 0.014767 8.921 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.262 on 16052 degrees of freedom
## Multiple R-squared: 0.3449, Adjusted R-squared: 0.3447
## F-statistic: 2112 on 4 and 16052 DF, p-value: < 2.2e-16
# 1. Asignar rownames únicos combinando COD con ID
HUELLA_dist <- HUELLA_dist %>%
mutate(label_huella = paste0(COD, " (", ID, ")"))
rownames(HUELLA_dist) <- HUELLA_dist$label_huella
## Warning: Setting row names on a tibble is deprecated.
# 2. Re-ajustar el modelo
reg_cubica_huella <- lm(ln_PBI ~ ln_luz + I(ln_luz^2) + I(ln_luz^3) + ln_dens,
data = HUELLA_dist)
# 3. Gráficos
p1 <- autoplot(reg_cubica_huella, which = 1, colour = "#df4180", alpha = 0.1, size = 0.5,
smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo
p2 <- autoplot(reg_cubica_huella, which = 2, colour = "#df4180", alpha = 0.1, size = 0.5,
smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo
p3 <- autoplot(reg_cubica_huella, which = 3, colour = "#df4180", alpha = 0.1, size = 0.5,
smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo
p4 <- autoplot(reg_cubica_huella, which = 4, colour = "#df4180", alpha = 0.1, size = 0.5,
smooth.colour = "#1A3E5C", label.size = 0, label.n = 0) + mi_estilo
# 4. Ensamblar
(p1[[1]] + p2[[1]]) / (p3[[1]] + p4[[1]]) +
plot_annotation(title = "Diagnóstico de residuos — regresión cúbica huella urbano-rural")
# REGRESIÓN CÚBICA DISTRITAL
reg_lineal_dist <- lm(ln_PBI ~ ln_luz + ln_dens,
data = PBI_dist_geary)
summary(reg_lineal_dist)
##
## Call:
## lm(formula = ln_PBI ~ ln_luz + ln_dens, data = PBI_dist_geary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.13026 -0.74362 -0.08045 0.73140 3.02995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.25690 0.12128 101.059 <2e-16 ***
## ln_luz 0.94883 0.04423 21.452 <2e-16 ***
## ln_dens -0.03989 0.02531 -1.576 0.115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.074 on 1871 degrees of freedom
## (17 observations deleted due to missingness)
## Multiple R-squared: 0.4499, Adjusted R-squared: 0.4493
## F-statistic: 765.2 on 2 and 1871 DF, p-value: < 2.2e-16
reg_lineal_dist <- lm(ln_PBI ~ ln_luz + ln_dens, data = PBI_dist_geary)
vif(reg_lineal_dist)
## ln_luz ln_dens
## 3.781857 3.781857