#Cargar datos
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
# Hoja 1: Residenciales
datos_res <- read_excel("Electrohuila2025.xlsx", sheet = 1) #Residenciales
# Hoja 2: Comerciales
datos_com <- read_excel("Electrohuila2025.xlsx", sheet = 2) #Comerciales
## New names:
## • `` -> `...49`
## • `` -> `...50`
datos_res[datos_res == 9 | datos_res == 99] <- NA
datos_com[datos_com == 9 | datos_com == 99] <- NA
datos_res
## # A tibble: 326 × 48
## No `1.Municipio` `2.TipoCliente` `3.Estrato` `4.NomCliente` `5. Dirección`
## <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 Neiva 1 2 Jorge Eliecer… CLL 13 # 1G -…
## 2 2 Neiva 1 2 Luzmila Urdan… K 1F # 12 - 49
## 3 3 Neiva 1 2 Rosaura Call… K 1G # 13 - 67
## 4 4 Neiva 1 2 Yenny chirinos K 1G # 13 - 6…
## 5 5 Neiva 1 2 Stiven Gonzal… CLL 14 # 1E -…
## 6 6 Neiva 1 2 Jairo Ruben O… K 1H # 14 - 17
## 7 7 Neiva 1 2 Orlando CRA 1G # 9 - …
## 8 8 Neiva 1 3 Tatyana Monrr… C 12 # 2 - 64
## 9 NA Neiva 1 3 Lina Quintero CLL 12 # 2 - …
## 10 10 Neiva 1 3 Martha Dussan K 3 # 11 - 93
## # ℹ 316 more rows
## # ℹ 42 more variables: `6.Genero` <dbl>, `7.Edad` <dbl>, `8.Barrio` <chr>,
## # `9.Celular` <dbl>, `10.Correo` <chr>, `11.GradoEstud` <chr>,
## # `12.Notrabajan` <dbl>, `13.Tingresos` <dbl>, `14.Titfactura` <chr>,
## # `15.valorfact` <dbl>, `16.ConsumoE` <dbl>, `17. CodigoNIU` <dbl>,
## # `18.Mediofac` <dbl>, `19.B` <dbl>, `20B` <dbl>, `21B` <dbl>, `22B` <dbl>,
## # `23B` <dbl>, `24C` <dbl>, `25C` <dbl>, `26C` <dbl>, `27C` <dbl>, …
datos_com
## # A tibble: 91 × 50
## No `1.Municipio` `2.TipoCliente` `3.Estrato` `4.NomCliente` `5. Dirección`
## <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 Neiva 2 NA Adenis Cruz A… CRA 3 # 9 -17
## 2 2 Neiva 2 NA Gonzalo Penag… CARRERA 6 # 5…
## 3 3 Neiva 2 NA Rocio Nieto CRA 5 # 9 - 24
## 4 4 Neiva 2 NA Luz Mery Leal CRA 6 # 9 - 65
## 5 5 Neiva 2 NA Edwin Solano K 6 # 5 - 27
## 6 6 Neiva 2 NA Angela Villar… CLL 31 # 11W …
## 7 7 Neiva 2 NA Alba Almonacid CRA 1 CW # 29…
## 8 8 Neiva 2 NA Miguel Angel … C 33 # 3BW - …
## 9 NA Neiva 2 NA Gladys CRA 5W # 38 -…
## 10 10 Neiva 2 NA Roberth Valen… K 4W # 37 - 1…
## # ℹ 81 more rows
## # ℹ 44 more variables: `6.Genero` <dbl>, `7.Edad` <dbl>, `8.Barrio` <chr>,
## # `9.Celular` <dbl>, `10.Correo` <chr>, `11.GradoEstud` <chr>,
## # `12.Notrabajan` <dbl>, `13.Tingresos` <dbl>, `14.Titfactura` <chr>,
## # `15.valorfact` <dbl>, `16.ConsumoE` <dbl>, `17. CodigoNIU` <dbl>,
## # `18.Mediofac` <dbl>, `19.B` <dbl>, `20B` <dbl>, `21B` <dbl>, `22B` <dbl>,
## # `23B` <dbl>, `24C` <dbl>, `25C` <dbl>, `26C` <dbl>, `27C` <dbl>, …
Usaremos las variables cuantitativas :
Residenciales
#9 y 99 no aplica
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
## Warning: package 'purrr' was built under R version 4.5.2
cols_obj <- c("7.Edad", "13.Tingresos", "15.valorfact", "16.ConsumoE",
"19.B", "20B", "21B", "22B", "23B", "24C", "25C", "26C",
"27C", "28D", "29D", "30D", "31D", "32D", "33E", "34E",
"35E", "36E", "37F", "38F", "39F", "40F", "41G", "42G",
"43G", "44H", "45Nivelsatisf")
# Función simple para correr KS y devolver fila con resultados
ks_row <- function(df, colname) {
if (!colname %in% names(df)) return(data.frame(variable=colname, n_validos=NA, D=NA, p_valor=NA, note="no existe", stringsAsFactors=FALSE))
x <- df[[colname]]
xnum <- suppressWarnings(as.numeric(x))
xnum <- xnum[!is.na(xnum)]
n <- length(xnum)
if (n < 3 || sd(xnum)==0 || is.na(sd(xnum))) {
return(data.frame(variable=colname, n_validos=n, D=NA, p_valor=NA, note=ifelse(n<3,"n<3","sd=0/NA"), stringsAsFactors=FALSE))
}
tst <- tryCatch(ks.test(xnum, "pnorm", mean(xnum), sd(xnum)), error=function(e) NULL)
if (is.null(tst)) {
data.frame(variable=colname, n_validos=n, D=NA, p_valor=NA, note="error", stringsAsFactors=FALSE)
} else {
data.frame(variable=colname, n_validos=n, D=as.numeric(tst$statistic), p_valor=as.numeric(tst$p.value), note=NA, stringsAsFactors=FALSE)
}
}
# --- Ejecutar para la hoja Residenciales (datos_res) ---
ks_list_res <- lapply(cols_obj, function(col) ks_row(datos_res, col))
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
ks_tabla_res <- do.call(rbind, ks_list_res)
# Mostrar tabla ordenada por nombre de variable
ks_tabla_res <- ks_tabla_res[order(ks_tabla_res$variable), ]
print(ks_tabla_res, row.names = FALSE)
## variable n_validos D p_valor note
## 13.Tingresos 326 0.24441204 2.431264e-17 NA
## 15.valorfact 326 0.16331660 5.603998e-08 NA
## 16.ConsumoE 323 0.06967991 8.686047e-02 NA
## 19.B 326 0.27443797 9.429645e-22 NA
## 20B 326 0.26274979 5.654387e-20 NA
## 21B 326 0.20031385 8.690680e-12 NA
## 22B 326 0.21637367 1.107054e-13 NA
## 23B 326 0.22268166 1.819451e-14 NA
## 24C 326 0.29216523 1.350035e-24 NA
## 25C 326 0.31154197 6.576168e-28 NA
## 26C 326 0.31684216 7.497112e-29 NA
## 27C 326 0.33956792 4.475874e-33 NA
## 28D 326 0.52066565 3.454736e-77 NA
## 29D 48 0.28820564 6.885987e-04 NA
## 30D 48 0.25839335 3.291221e-03 NA
## 31D 48 0.24881458 5.247059e-03 NA
## 32D 48 0.18208574 8.292447e-02 NA
## 33E 326 0.15341137 4.333566e-07 NA
## 34E 326 0.31779012 5.064574e-29 NA
## 35E 326 0.26130440 9.265476e-20 NA
## 36E 326 0.22546122 8.076542e-15 NA
## 37F 326 0.30185170 3.170021e-26 NA
## 38F 326 0.28262244 4.825029e-23 NA
## 39F 326 0.25378861 1.156310e-18 NA
## 40F 326 0.25829263 2.570319e-19 NA
## 41G 326 0.31725123 6.330596e-29 NA
## 42G 326 0.19393254 4.481567e-11 NA
## 43G 326 0.15737308 1.941815e-07 NA
## 44H 326 0.22077373 3.158765e-14 NA
## 45Nivelsatisf 326 0.29626057 2.805332e-25 NA
## 7.Edad 326 0.08847721 1.214484e-02 NA
Comerciales
cols_obj <- c("7.Edad", "13.Tingresos", "15.valorfact", "16.ConsumoE",
"19.B", "20B", "21B", "22B", "23B", "24C", "25C", "26C",
"27C", "28D", "29D", "30D", "31D", "32D", "33E", "34E",
"35E", "36E", "37F", "38F", "39F", "40F", "41G", "42G",
"43G", "44H", "45Nivelsatisf")
# Función que ejecuta KS y devuelve una fila con resultados
ks_row <- function(df, colname) {
if (!colname %in% names(df)) return(data.frame(variable=colname, n_validos=NA, D=NA, p_valor=NA, note="no existe", stringsAsFactors=FALSE))
x <- df[[colname]]
xnum <- suppressWarnings(as.numeric(x))
xnum <- xnum[!is.na(xnum)]
n <- length(xnum)
if (n < 3 || sd(xnum)==0 || is.na(sd(xnum))) {
return(data.frame(variable=colname, n_validos=n, D=NA, p_valor=NA, note=ifelse(n<3,"n<3","sd=0/NA"), stringsAsFactors=FALSE))
}
tst <- tryCatch(ks.test(xnum, "pnorm", mean(xnum), sd(xnum)), error=function(e) NULL)
if (is.null(tst)) {
data.frame(variable=colname, n_validos=n, D=NA, p_valor=NA, note="error", stringsAsFactors=FALSE)
} else {
data.frame(variable=colname, n_validos=n, D=as.numeric(tst$statistic), p_valor=as.numeric(tst$p.value), note=NA, stringsAsFactors=FALSE)
}
}
# --- Ejecutar para la hoja Comerciales (datos_com) ---
ks_list_com <- lapply(cols_obj, function(col) ks_row(datos_com, col))
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
ks_tabla_com <- do.call(rbind, ks_list_com)
# Ordenar por nombre de variable y redondear D/p_valor para mejor lectura
ks_tabla_com <- ks_tabla_com[order(ks_tabla_com$variable), ]
ks_tabla_com$D <- ifelse(is.na(ks_tabla_com$D), NA, round(as.numeric(ks_tabla_com$D), 4))
ks_tabla_com$p_valor <- ifelse(is.na(ks_tabla_com$p_valor), NA, round(as.numeric(ks_tabla_com$p_valor), 4))
# Mostrar tabla
print(ks_tabla_com, row.names = FALSE)
## variable n_validos D p_valor note
## 13.Tingresos 91 0.2828 0.0000 NA
## 15.valorfact 91 0.3598 0.0000 NA
## 16.ConsumoE 90 0.1895 0.0031 NA
## 19.B 91 0.2455 0.0000 NA
## 20B 91 0.1899 0.0028 NA
## 21B 91 0.1998 0.0014 NA
## 22B 91 0.2118 0.0006 NA
## 23B 91 0.2134 0.0005 NA
## 24C 91 0.2948 0.0000 NA
## 25C 91 0.2698 0.0000 NA
## 26C 91 0.2733 0.0000 NA
## 27C 91 0.3375 0.0000 NA
## 28D 91 0.4263 0.0000 NA
## 29D 24 0.3757 0.0023 NA
## 30D 24 0.2870 0.0384 NA
## 31D 24 0.3052 0.0229 NA
## 32D 24 0.2333 0.1467 NA
## 33E 91 0.1605 0.0184 NA
## 34E 91 0.2851 0.0000 NA
## 35E 91 0.1898 0.0028 NA
## 36E 91 0.2773 0.0000 NA
## 37F 91 0.2848 0.0000 NA
## 38F 91 0.2234 0.0002 NA
## 39F 91 0.2122 0.0006 NA
## 40F 91 0.2677 0.0000 NA
## 41G 91 0.3523 0.0000 NA
## 42G 91 0.2478 0.0000 NA
## 43G 91 0.1847 0.0040 NA
## 44H 91 0.3061 0.0000 NA
## 45Nivelsatisf 91 0.2431 0.0000 NA
## 7.Edad 91 0.0946 0.3897 NA
Descriptivos Cuantitativos (Residenciales)
library(psych)
## Warning: package 'psych' was built under R version 4.5.2
datos_res %>%
select(`7.Edad`, `12.Notrabajan`, `13.Tingresos`, `15.valorfact`, `16.ConsumoE`) %>%
describe()
## vars n mean sd median trimmed mad
## 7.Edad 1 326 45.33 16.78 47 45.05 20.76
## 12.Notrabajan 2 326 1.52 0.84 1 1.37 0.00
## 13.Tingresos 3 326 2251525.77 1435093.66 1611500 1992371.76 575990.10
## 15.valorfact 4 326 118542.37 100474.53 96095 103046.68 60905.21
## 16.ConsumoE 5 323 137.88 80.44 128 131.00 74.13
## min max range skew kurtosis se
## 7.Edad 18 84 66 0.04 -1.11 0.93
## 12.Notrabajan 0 7 7 2.55 10.35 0.05
## 13.Tingresos 216000 12000000 11784000 2.51 9.28 79482.49
## 15.valorfact 13030 1112630 1099600 3.98 30.04 5564.77
## 16.ConsumoE 0 508 508 1.00 1.64 4.48
Frecuencias Sociodemográficas (Residenciales)
# Frecuencia para Estrato
datos_res %>%
count(`3.Estrato`) %>%
mutate(Proporcion = n / sum(n))
## # A tibble: 5 × 3
## `3.Estrato` n Proporcion
## <dbl> <int> <dbl>
## 1 1 96 0.294
## 2 2 184 0.564
## 3 3 34 0.104
## 4 4 10 0.0307
## 5 5 2 0.00613
# Frecuencia para Género
datos_res %>%
count(`6.Genero`) %>%
mutate(Proporcion = n / sum(n))
## # A tibble: 3 × 3
## `6.Genero` n Proporcion
## <dbl> <int> <dbl>
## 1 1 126 0.387
## 2 2 199 0.610
## 3 3 1 0.00307
# Frecuencia para Medio de Factura
datos_res %>%
count(`18.Mediofac`) %>%
mutate(Proporcion = n / sum(n))
## # A tibble: 1 × 3
## `18.Mediofac` n Proporcion
## <dbl> <int> <dbl>
## 1 1 326 1
Graficos Descriptivos (Residenciales)
library(ggplot2)
##
## Adjuntando el paquete: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Histograma del Consumo de EnergÃa
datos_res %>%
ggplot(aes(x = `16.ConsumoE`)) +
geom_histogram(binwidth = 50, fill = "darkgreen", color = "white") +
labs(title = "Distribución del Consumo (Residencial)", x = "Consumo de EnergÃa") +
theme_minimal()
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Boxplot de Ingresos por Estrato (EstadÃstica bivariada preliminar)
datos_res %>%
mutate(Estrato_Factor = factor(`3.Estrato`)) %>%
# Generamos el Boxplot.
ggplot(aes(x = Estrato_Factor, y = `13.Tingresos`, fill = Estrato_Factor)) +
geom_boxplot() +
labs(title = "Ingresos (`13.Tingresos`) por Estrato (`3.Estrato`)",
y = "Ingresos",
x = "Estrato") +
scale_fill_discrete(name = "Estratos") +
theme_minimal()
Tablas de contigencia (Residenciales)
#Tablas de Contingencia: Estrato vs. Género (Residenciales)
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.5.2
datos_res %>%
filter(!is.na(`3.Estrato`), !is.na(`6.Genero`)) %>%
with(
CrossTable(`3.Estrato`, `6.Genero`,
prop.c = TRUE, prop.t = FALSE, prop.r = FALSE)
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 326
##
##
## | 6.Genero
## 3.Estrato | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 28 | 68 | 0 | 96 |
## | 2.234 | 1.507 | 0.294 | |
## | 0.222 | 0.342 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|
## 2 | 76 | 108 | 0 | 184 |
## | 0.335 | 0.166 | 0.564 | |
## | 0.603 | 0.543 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|
## 3 | 13 | 20 | 1 | 34 |
## | 0.002 | 0.027 | 7.693 | |
## | 0.103 | 0.101 | 1.000 | |
## -------------|-----------|-----------|-----------|-----------|
## 4 | 8 | 2 | 0 | 10 |
## | 4.424 | 2.760 | 0.031 | |
## | 0.063 | 0.010 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|
## 5 | 1 | 1 | 0 | 2 |
## | 0.067 | 0.040 | 0.006 | |
## | 0.008 | 0.005 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 126 | 199 | 1 | 326 |
## | 0.387 | 0.610 | 0.003 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
**Analisis Bivariado Cuantitativo (Correlacion y regresion simple)
# Correlación: Ingresos vs. Consumo (Residenciales)
coef_correlacion <- cor(datos_res$`13.Tingresos`, datos_res$`16.ConsumoE`,
use = "pairwise.complete.obs")
# Imprimir el coeficiente de correlación
print(coef_correlacion)
## [1] 0.1747066
datos_com %>%
select(`7.Edad`, `12.Notrabajan`, `13.Tingresos`, `15.valorfact`, `16.ConsumoE`) %>%
describe()
## vars n mean sd median trimmed mad
## 7.Edad 1 91 44.25 13.40 45 44.15 14.83
## 12.Notrabajan 2 91 2.09 1.31 2 1.85 1.48
## 13.Tingresos 3 91 3461798.79 4632142.03 2000000 2626621.78 1186080.00
## 15.valorfact 4 91 590989.96 1616029.40 261330 363343.29 256697.36
## 16.ConsumoE 5 90 324.36 312.53 221 272.19 202.37
## min max range skew kurtosis se
## 7.Edad 19 73 54 0.03 -0.93 1.40
## 12.Notrabajan 1 7 6 1.59 2.36 0.14
## 13.Tingresos 800000 40000000 39200000 5.69 40.27 485580.43
## 15.valorfact 10850 15292220 15281370 8.28 72.28 169405.91
## 16.ConsumoE 0 1765 1765 1.77 3.87 32.94
Distribucion de frecuencia (Comercial)
datos_com %>%
count(`6.Genero`) %>%
mutate(Proporcion = n / sum(n))
## # A tibble: 2 × 3
## `6.Genero` n Proporcion
## <dbl> <int> <dbl>
## 1 1 41 0.451
## 2 2 50 0.549
datos_com %>%
count(`12.Notrabajan`) %>%
mutate(Proporcion = n / sum(n))
## # A tibble: 7 × 3
## `12.Notrabajan` n Proporcion
## <dbl> <int> <dbl>
## 1 1 35 0.385
## 2 2 36 0.396
## 3 3 7 0.0769
## 4 4 7 0.0769
## 5 5 3 0.0330
## 6 6 2 0.0220
## 7 7 1 0.0110
Histograma de consumo de energia (comercial)
datos_com %>%
ggplot(aes(x = `16.ConsumoE`)) +
geom_histogram(binwidth = 500, fill = "darkblue", color = "white") +
# TÃtulo simplificado sin caracteres especiales (por si acaso)
labs(title = "Distribucion del Consumo (Comercial)",
x = "Consumo de Energia (16.ConsumoE)") + # <-- Sustituye 'n' por 'n', 'é' por 'e'
theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
Analisis bivariado: Correlacion y regresion (Comercial)
Se evaluó la relación entre Ingresos y Consumo de EnergÃa, y se ajustó un modelo de regresión lineal simple para explorar cómo varÃa el Consumo según la Edad:
#Correlación entre Ingresos y Consumo
coef_correlacion_com <- cor(datos_com$`13.Tingresos`, datos_com$`16.ConsumoE`,
use = "pairwise.complete.obs")
print(coef_correlacion_com)
## [1] 0.1291277
#Modelo de regresión: ConsumoE en función de Edad
modelo_com <- lm(`16.ConsumoE` ~ `7.Edad`, data=datos_com)
summary(modelo_com)
##
## Call:
## lm(formula = `16.ConsumoE` ~ `7.Edad`, data = datos_com)
##
## Residuals:
## Min 1Q Median 3Q Max
## -328.98 -205.42 -112.12 92.51 1427.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 253.653 115.967 2.187 0.0314 *
## `7.Edad` 1.590 2.499 0.636 0.5264
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 313.6 on 88 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.004576, Adjusted R-squared: -0.006735
## F-statistic: 0.4046 on 1 and 88 DF, p-value: 0.5264
#Visualización de la regresión
ggplot(datos_com, aes(x = `7.Edad`, y = `16.ConsumoE`)) +
geom_point() +
geom_smooth(method="lm", se=FALSE, color="red") +
theme_minimal() +
labs(title="Relación entre Edad y Consumo de EnergÃa (Comercial)",
x="Edad", y="Consumo de EnergÃa")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Modelo de regresión multivariada: 16.ConsumoE ~ variables 19.B - 45Niversatisf (Residencial)
modelo_res <- lm(`16.ConsumoE` ~ `19.B` + `20B` + `21B` + `22B` + `23B` +
`24C` + `25C` + `26C` + `27C` + `28D` + `29D` + `30D` +
`31D` + `32D` + `33E` + `34E` + `35E` + `36E` + `37F` +
`38F` + `39F` + `40F` + `41G` + `42G` + `43G` + `44H` +
`45Nivelsatisf`,
data = datos_res)
# Resumen del modelo
summary(modelo_res)
##
## Call:
## lm(formula = `16.ConsumoE` ~ `19.B` + `20B` + `21B` + `22B` +
## `23B` + `24C` + `25C` + `26C` + `27C` + `28D` + `29D` + `30D` +
## `31D` + `32D` + `33E` + `34E` + `35E` + `36E` + `37F` + `38F` +
## `39F` + `40F` + `41G` + `42G` + `43G` + `44H` + `45Nivelsatisf`,
## data = datos_res)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.966 -39.904 -9.013 35.836 119.787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 226.6102 190.5855 1.189 0.2484
## `19.B` 29.3247 23.9160 1.226 0.2344
## `20B` -52.4925 19.6398 -2.673 0.0146 *
## `21B` 11.8622 19.5804 0.606 0.5514
## `22B` -2.6374 20.0636 -0.131 0.8967
## `23B` -22.8097 17.6082 -1.295 0.2099
## `24C` 25.6504 22.0588 1.163 0.2586
## `25C` -8.8826 9.9365 -0.894 0.3820
## `26C` -11.4850 17.0489 -0.674 0.5082
## `27C` 4.1335 29.3757 0.141 0.8895
## `28D` -57.9422 90.0702 -0.643 0.5273
## `29D` -12.3968 21.5653 -0.575 0.5718
## `30D` 39.1304 28.6903 1.364 0.1878
## `31D` 13.2739 19.5384 0.679 0.5047
## `32D` 2.2678 16.3772 0.138 0.8913
## `33E` -3.5287 16.3088 -0.216 0.8309
## `34E` 10.0350 21.8684 0.459 0.6513
## `35E` 0.7101 16.6153 0.043 0.9663
## `36E` -15.1755 12.1806 -1.246 0.2272
## `37F` -16.2614 19.4814 -0.835 0.4137
## `38F` -4.6502 31.0871 -0.150 0.8826
## `39F` -17.1451 16.0631 -1.067 0.2985
## `40F` 22.9441 25.3607 0.905 0.3764
## `41G` 16.7367 13.8109 1.212 0.2397
## `42G` -25.5050 10.9630 -2.326 0.0306 *
## `43G` 12.4242 16.6489 0.746 0.4642
## `44H` -25.7842 15.3669 -1.678 0.1089
## `45Nivelsatisf` -1.3680 20.1422 -0.068 0.9465
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.72 on 20 degrees of freedom
## (278 observations deleted due to missingness)
## Multiple R-squared: 0.6731, Adjusted R-squared: 0.2317
## F-statistic: 1.525 on 27 and 20 DF, p-value: 0.167
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.5.2
##
## Adjuntando el paquete: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# Lista original de variables independientes
vars_independientes <- c("19.B", "20B", "21B", "22B", "23B",
"24C", "25C", "26C", "27C", "28D",
"29D", "30D", "31D", "32D", "33E",
"34E", "35E", "36E", "37F", "38F",
"39F", "40F", "41G", "42G", "43G",
"44H", "45Nivelsatisf")
# Paso 1: Filtrar variables con demasiados NA (por ejemplo, más del 20%)
na_pct <- colSums(is.na(datos_com[, vars_independientes])) / nrow(datos_com)
vars_viables <- names(na_pct[na_pct <= 0.2])
# Paso 2: Crear dataset con filas completas
datos_completo <- datos_com %>%
select(`16.ConsumoE`, all_of(vars_viables)) %>%
na.omit()
# Paso 3: Ajustar modelo multivariado
modelo_com <- lm(`16.ConsumoE` ~ ., data = datos_completo)
# Paso 4: Revisar colinealidad
vif_vals <- vif(modelo_com)
vif_vals # eliminar variables con VIF > 10 si es necesario
## `19.B` `20B` `21B` `22B` `23B`
## 6.405919 5.995460 3.816654 4.240240 1.694928
## `24C` `25C` `26C` `27C` `28D`
## 1.343931 1.763678 2.166284 3.128025 1.592057
## `33E` `34E` `35E` `36E` `37F`
## 1.797122 4.343088 1.954833 1.667214 1.716159
## `38F` `39F` `40F` `41G` `42G`
## 2.333979 1.775202 1.558453 1.318453 1.586325
## `43G` `44H` `45Nivelsatisf`
## 1.719814 1.537094 1.962456
# Paso 5: Resumen del modelo
summary(modelo_com)
##
## Call:
## lm(formula = `16.ConsumoE` ~ ., data = datos_completo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -404.34 -179.99 -57.96 140.66 1173.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 419.6885 413.6063 1.015 0.3140
## `19.B` -96.6265 76.6518 -1.261 0.2119
## `20B` 42.5905 68.8242 0.619 0.5382
## `21B` 58.2970 53.3407 1.093 0.2784
## `22B` -73.5261 57.7921 -1.272 0.2077
## `23B` 13.8064 34.8293 0.396 0.6931
## `24C` -43.4368 41.6386 -1.043 0.3007
## `25C` 2.3831 23.1454 0.103 0.9183
## `26C` 22.9697 42.8230 0.536 0.5935
## `27C` 15.5500 83.8297 0.185 0.8534
## `28D` 0.7271 86.7091 0.008 0.9933
## `33E` 28.6388 31.3815 0.913 0.3648
## `34E` -4.8553 77.4319 -0.063 0.9502
## `35E` 3.7775 35.0049 0.108 0.9144
## `36E` -25.5276 25.1061 -1.017 0.3130
## `37F` -0.9995 41.4796 -0.024 0.9808
## `38F` 84.3886 49.2020 1.715 0.0910 .
## `39F` 17.1250 27.6129 0.620 0.5373
## `40F` 17.6740 31.5095 0.561 0.5768
## `41G` 18.8281 27.7475 0.679 0.4998
## `42G` 2.6417 25.6114 0.103 0.9182
## `43G` -15.5246 28.1120 -0.552 0.5826
## `44H` -81.0116 41.8058 -1.938 0.0569 .
## `45Nivelsatisf` -40.8014 57.9926 -0.704 0.4842
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 323.1 on 66 degrees of freedom
## Multiple R-squared: 0.2076, Adjusted R-squared: -0.06849
## F-statistic: 0.752 on 23 and 66 DF, p-value: 0.7741
library(FactoMineR) # Para análisis factorial
## Warning: package 'FactoMineR' was built under R version 4.5.2
library(factoextra) # Para gráficos de posicionamiento
## Warning: package 'factoextra' was built under R version 4.5.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Selección de variables
vars_factoriales <- c("19.B", "20B", "21B", "22B", "23B",
"24C", "25C", "26C", "27C", "28D",
"29D", "30D", "31D", "32D", "33E",
"34E", "35E", "36E", "37F", "38F",
"39F", "40F", "41G", "42G", "43G",
"44H", "45Nivelsatisf")
# Filtrar datos residenciales
datos_res_factor <- datos_res %>%
select(all_of(vars_factoriales)) %>%
na.omit()
Análisis Factorial de Componentes Principales (PCA)
# PCA para variables cuantitativas
pca_res <- PCA(datos_res_factor, graph = FALSE)
# Resumen de eigenvalues y varianza explicada
eig_res <- get_eigenvalue(pca_res)
eig_res
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 6.26918011 23.2191856 23.21919
## Dim.2 3.39533221 12.5753045 35.79449
## Dim.3 2.78671685 10.3211735 46.11566
## Dim.4 1.96466190 7.2765255 53.39219
## Dim.5 1.77781488 6.5844995 59.97669
## Dim.6 1.61763382 5.9912364 65.96793
## Dim.7 1.23448946 4.5721832 70.54011
## Dim.8 1.12284484 4.1586846 74.69879
## Dim.9 0.99435891 3.6828108 78.38160
## Dim.10 0.96271464 3.5656098 81.94721
## Dim.11 0.74317558 2.7525021 84.69972
## Dim.12 0.65714333 2.4338642 87.13358
## Dim.13 0.54829475 2.0307213 89.16430
## Dim.14 0.47260468 1.7503877 90.91469
## Dim.15 0.43493424 1.6108676 92.52556
## Dim.16 0.39145691 1.4498404 93.97540
## Dim.17 0.30119248 1.1155277 95.09092
## Dim.18 0.25254677 0.9353584 96.02628
## Dim.19 0.24453559 0.9056874 96.93197
## Dim.20 0.17651571 0.6537619 97.58573
## Dim.21 0.16162993 0.5986294 98.18436
## Dim.22 0.12294465 0.4553506 98.63971
## Dim.23 0.09491442 0.3515349 98.99125
## Dim.24 0.08754056 0.3242243 99.31547
## Dim.25 0.08382281 0.3104549 99.62593
## Dim.26 0.05676010 0.2102226 99.83615
## Dim.27 0.04423986 0.1638513 100.00000
Mapas de posicionamiento
# Mapa de variables
fviz_pca_var(pca_res,
col.var = "contrib", # color por contribución
gradient.cols = c("blue", "yellow", "red"),
repel = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Mapa de individuos/clientes
fviz_pca_ind(pca_res,
geom.ind = "point",
col.ind = "cos2", # color por calidad de representación
gradient.cols = c("blue", "yellow", "red"),
repel = TRUE)
# Datos comerciales
datos_com_factor <- datos_com %>%
select(all_of(vars_factoriales)) %>%
na.omit()
# PCA
pca_com <- PCA(datos_com_factor, graph = FALSE)
# Mapas de variables e individuos
fviz_pca_var(pca_com, col.var = "contrib", gradient.cols = c("blue","yellow","red"), repel=TRUE)
fviz_pca_ind(pca_com, geom.ind="point", col.ind="cos2", gradient.cols=c("blue","yellow","red"), repel=TRUE)