Proyecto Electrohuila

#Cargar datos
library(readxl)

## Warning: package 'readxl' was built under R version 4.5.2

# Sheet 1 of the workbook: residential customers.
datos_res <- read_excel(
  path = "Electrohuila2025.xlsx",
  sheet = 1
)

# Sheet 2 of the workbook: commercial customers.
datos_com <- read_excel(
  path = "Electrohuila2025.xlsx",
  sheet = 2
)

## New names:
## • `` -> `...49`
## • `` -> `...50`

# Recode the "not applicable" codes (9 and 99) to NA.
# A whole-table recode also blanks genuine values that happen to equal 9 or
# 99: the printed tibbles show record number 9 losing its `No` value, and a
# consumption or age of 99 would be dropped the same way. The recode is
# therefore skipped for identifier and measured-quantity columns.
# NOTE(review): the exclusion list is inferred from the column names --
# confirm against the questionnaire codebook (e.g. whether `12.Notrabajan`
# uses 9 as a real count).
cols_sin_codigo <- c("No", "7.Edad", "9.Celular", "13.Tingresos",
                     "15.valorfact", "16.ConsumoE", "17. CodigoNIU")

# Returns `df` with 9/99 replaced by NA in every column not listed in
# `excluir`; columns in `excluir` that are absent from `df` are ignored.
recodificar_no_aplica <- function(df, excluir = cols_sin_codigo) {
  objetivo <- setdiff(names(df), excluir)
  df[objetivo] <- lapply(df[objetivo], function(x) {
    # %in% never yields NA, so existing NA cells are left untouched.
    x[x %in% c(9, 99)] <- NA
    x
  })
  df
}

datos_res <- recodificar_no_aplica(datos_res)
datos_com <- recodificar_no_aplica(datos_com)

datos_res

## # A tibble: 326 × 48
##       No `1.Municipio` `2.TipoCliente` `3.Estrato` `4.NomCliente` `5. Dirección`
##    <dbl> <chr>                   <dbl>       <dbl> <chr>          <chr>         
##  1     1 Neiva                       1           2 Jorge Eliecer… CLL 13 # 1G -…
##  2     2 Neiva                       1           2 Luzmila Urdan… K 1F # 12 - 49
##  3     3 Neiva                       1           2 Rosaura  Call… K 1G # 13 - 67
##  4     4 Neiva                       1           2 Yenny chirinos K 1G # 13 - 6…
##  5     5 Neiva                       1           2 Stiven Gonzal… CLL 14 # 1E -…
##  6     6 Neiva                       1           2 Jairo Ruben O… K 1H # 14 - 17
##  7     7 Neiva                       1           2 Orlando        CRA 1G # 9 - …
##  8     8 Neiva                       1           3 Tatyana Monrr… C 12 # 2 - 64 
##  9    NA Neiva                       1           3 Lina Quintero  CLL 12 # 2 - …
## 10    10 Neiva                       1           3 Martha Dussan  K 3 # 11 - 93 
## # ℹ 316 more rows
## # ℹ 42 more variables: `6.Genero` <dbl>, `7.Edad` <dbl>, `8.Barrio` <chr>,
## #   `9.Celular` <dbl>, `10.Correo` <chr>, `11.GradoEstud` <chr>,
## #   `12.Notrabajan` <dbl>, `13.Tingresos` <dbl>, `14.Titfactura` <chr>,
## #   `15.valorfact` <dbl>, `16.ConsumoE` <dbl>, `17. CodigoNIU` <dbl>,
## #   `18.Mediofac` <dbl>, `19.B` <dbl>, `20B` <dbl>, `21B` <dbl>, `22B` <dbl>,
## #   `23B` <dbl>, `24C` <dbl>, `25C` <dbl>, `26C` <dbl>, `27C` <dbl>, …

datos_com

## # A tibble: 91 × 50
##       No `1.Municipio` `2.TipoCliente` `3.Estrato` `4.NomCliente` `5. Dirección`
##    <dbl> <chr>                   <dbl>       <dbl> <chr>          <chr>         
##  1     1 Neiva                       2          NA Adenis Cruz A… CRA 3 # 9 -17 
##  2     2 Neiva                       2          NA Gonzalo Penag… CARRERA 6 # 5…
##  3     3 Neiva                       2          NA Rocio Nieto    CRA 5 # 9 - 24
##  4     4 Neiva                       2          NA Luz Mery Leal  CRA 6 # 9 - 65
##  5     5 Neiva                       2          NA Edwin Solano   K 6 # 5 - 27  
##  6     6 Neiva                       2          NA Angela Villar… CLL 31 # 11W …
##  7     7 Neiva                       2          NA Alba Almonacid CRA 1 CW # 29…
##  8     8 Neiva                       2          NA Miguel Angel … C 33 # 3BW - …
##  9    NA Neiva                       2          NA Gladys         CRA 5W # 38 -…
## 10    10 Neiva                       2          NA Roberth Valen… K 4W # 37 - 1…
## # ℹ 81 more rows
## # ℹ 44 more variables: `6.Genero` <dbl>, `7.Edad` <dbl>, `8.Barrio` <chr>,
## #   `9.Celular` <dbl>, `10.Correo` <chr>, `11.GradoEstud` <chr>,
## #   `12.Notrabajan` <dbl>, `13.Tingresos` <dbl>, `14.Titfactura` <chr>,
## #   `15.valorfact` <dbl>, `16.ConsumoE` <dbl>, `17. CodigoNIU` <dbl>,
## #   `18.Mediofac` <dbl>, `19.B` <dbl>, `20B` <dbl>, `21B` <dbl>, `22B` <dbl>,
## #   `23B` <dbl>, `24C` <dbl>, `25C` <dbl>, `26C` <dbl>, `27C` <dbl>, …

1. Prueba de normalidad Kolmogorov-Smirnov

Usaremos las variables cuantitativas:

Residenciales

#9 y 99 no aplica
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(purrr)

## Warning: package 'purrr' was built under R version 4.5.2

# Columns submitted to the normality test, grouped by the letter suffix of
# their names.
cols_obj <- c(
  "7.Edad", "13.Tingresos", "15.valorfact", "16.ConsumoE",
  "19.B", "20B", "21B", "22B", "23B",
  "24C", "25C", "26C", "27C",
  "28D", "29D", "30D", "31D", "32D",
  "33E", "34E", "35E", "36E",
  "37F", "38F", "39F", "40F",
  "41G", "42G", "43G",
  "44H", "45Nivelsatisf"
)

# Run a one-sample Kolmogorov-Smirnov normality test on one column.
#
# Args:
#   df:      data frame (or tibble) holding the column.
#   colname: name of the column to test, as a single string.
#
# Returns a one-row data.frame with columns `variable`, `n_validos` (number
# of values left after numeric coercion and NA removal), `D` (KS statistic),
# `p_valor` and `note`. `note` is NA on success, otherwise "no existe"
# (column missing), "n<3", "sd=0/NA" or "error" (ks.test failed; the cause is
# reported as a warning).
#
# Caveat: the reference normal uses the sample's own mean and sd. The ks.test
# documentation asks for pre-specified parameters, so the p-values are only
# approximate (the Lilliefors correction addresses this situation).
ks_row <- function(df, colname) {
  # Single constructor so every outcome has the same column layout.
  fila <- function(n, D = NA, p_valor = NA, note = NA) {
    data.frame(variable = colname, n_validos = n, D = D, p_valor = p_valor,
               note = note, stringsAsFactors = FALSE)
  }

  if (!(colname %in% names(df))) {
    return(fila(NA, note = "no existe"))
  }

  # Entries that cannot be read as numbers become NA and are dropped.
  xnum <- suppressWarnings(as.numeric(df[[colname]]))
  xnum <- xnum[!is.na(xnum)]
  n <- length(xnum)

  if (n < 3) {
    return(fila(n, note = "n<3"))
  }
  desv <- sd(xnum)  # computed once; reused by the guard and by the test
  if (is.na(desv) || desv == 0) {
    return(fila(n, note = "sd=0/NA"))
  }

  tst <- tryCatch(
    ks.test(xnum, "pnorm", mean(xnum), desv),
    error = function(e) {
      # Report the cause instead of discarding it; the row still says "error".
      warning("Error en ks.test para '", colname, "': ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (is.null(tst)) {
    return(fila(n, note = "error"))
  }
  fila(n, D = as.numeric(tst$statistic), p_valor = as.numeric(tst$p.value))
}

# --- Residential sheet (datos_res): one ks_row() result per listed column ---
ks_list_res <- lapply(cols_obj, ks_row, df = datos_res)

## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test

## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test

# Stack the per-variable rows into a single table.
ks_tabla_res <- do.call("rbind", ks_list_res)

# Sort by variable name. The names are strings, so the order is alphabetical
# ("13.Tingresos" comes before "7.Edad"), not numeric.
ks_tabla_res <- ks_tabla_res[with(ks_tabla_res, order(variable)), ]

print(ks_tabla_res, row.names = FALSE)

##       variable n_validos          D      p_valor note
##   13.Tingresos       326 0.24441204 2.431264e-17   NA
##   15.valorfact       326 0.16331660 5.603998e-08   NA
##    16.ConsumoE       323 0.06967991 8.686047e-02   NA
##           19.B       326 0.27443797 9.429645e-22   NA
##            20B       326 0.26274979 5.654387e-20   NA
##            21B       326 0.20031385 8.690680e-12   NA
##            22B       326 0.21637367 1.107054e-13   NA
##            23B       326 0.22268166 1.819451e-14   NA
##            24C       326 0.29216523 1.350035e-24   NA
##            25C       326 0.31154197 6.576168e-28   NA
##            26C       326 0.31684216 7.497112e-29   NA
##            27C       326 0.33956792 4.475874e-33   NA
##            28D       326 0.52066565 3.454736e-77   NA
##            29D        48 0.28820564 6.885987e-04   NA
##            30D        48 0.25839335 3.291221e-03   NA
##            31D        48 0.24881458 5.247059e-03   NA
##            32D        48 0.18208574 8.292447e-02   NA
##            33E       326 0.15341137 4.333566e-07   NA
##            34E       326 0.31779012 5.064574e-29   NA
##            35E       326 0.26130440 9.265476e-20   NA
##            36E       326 0.22546122 8.076542e-15   NA
##            37F       326 0.30185170 3.170021e-26   NA
##            38F       326 0.28262244 4.825029e-23   NA
##            39F       326 0.25378861 1.156310e-18   NA
##            40F       326 0.25829263 2.570319e-19   NA
##            41G       326 0.31725123 6.330596e-29   NA
##            42G       326 0.19393254 4.481567e-11   NA
##            43G       326 0.15737308 1.941815e-07   NA
##            44H       326 0.22077373 3.158765e-14   NA
##  45Nivelsatisf       326 0.29626057 2.805332e-25   NA
##         7.Edad       326 0.08847721 1.214484e-02   NA

Comerciales

# Columns submitted to the normality test for the commercial sheet, grouped
# by the letter suffix of their names.
cols_obj <- c(
  "7.Edad", "13.Tingresos", "15.valorfact", "16.ConsumoE",
  "19.B", "20B", "21B", "22B", "23B",
  "24C", "25C", "26C", "27C",
  "28D", "29D", "30D", "31D", "32D",
  "33E", "34E", "35E", "36E",
  "37F", "38F", "39F", "40F",
  "41G", "42G", "43G",
  "44H", "45Nivelsatisf"
)

# Run a one-sample Kolmogorov-Smirnov normality test on one column.
#
# Args:
#   df:      data frame (or tibble) holding the column.
#   colname: name of the column to test, as a single string.
#
# Returns a one-row data.frame with columns `variable`, `n_validos` (number
# of values left after numeric coercion and NA removal), `D` (KS statistic),
# `p_valor` and `note`. `note` is NA on success, otherwise "no existe"
# (column missing), "n<3", "sd=0/NA" or "error" (ks.test failed; the cause is
# reported as a warning).
#
# Caveat: the reference normal uses the sample's own mean and sd. The ks.test
# documentation asks for pre-specified parameters, so the p-values are only
# approximate (the Lilliefors correction addresses this situation).
ks_row <- function(df, colname) {
  # Single constructor so every outcome has the same column layout.
  fila <- function(n, D = NA, p_valor = NA, note = NA) {
    data.frame(variable = colname, n_validos = n, D = D, p_valor = p_valor,
               note = note, stringsAsFactors = FALSE)
  }

  if (!(colname %in% names(df))) {
    return(fila(NA, note = "no existe"))
  }

  # Entries that cannot be read as numbers become NA and are dropped.
  xnum <- suppressWarnings(as.numeric(df[[colname]]))
  xnum <- xnum[!is.na(xnum)]
  n <- length(xnum)

  if (n < 3) {
    return(fila(n, note = "n<3"))
  }
  desv <- sd(xnum)  # computed once; reused by the guard and by the test
  if (is.na(desv) || desv == 0) {
    return(fila(n, note = "sd=0/NA"))
  }

  tst <- tryCatch(
    ks.test(xnum, "pnorm", mean(xnum), desv),
    error = function(e) {
      # Report the cause instead of discarding it; the row still says "error".
      warning("Error en ks.test para '", colname, "': ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (is.null(tst)) {
    return(fila(n, note = "error"))
  }
  fila(n, D = as.numeric(tst$statistic), p_valor = as.numeric(tst$p.value))
}

# --- Commercial sheet (datos_com): one ks_row() result per listed column ---
ks_list_com <- lapply(cols_obj, ks_row, df = datos_com)

## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## Warning in ks.test.default(xnum, "pnorm", mean(xnum), sd(xnum)): ties should
## not be present for the one-sample Kolmogorov-Smirnov test

# Stack the per-variable rows into a single table.
ks_tabla_com <- do.call(rbind, ks_list_com)

# Sort by variable name (alphabetical, because the names are strings).
ks_tabla_com <- ks_tabla_com[order(ks_tabla_com$variable), ]

# Round D and the p-value to 4 decimals for readability. round() already
# passes NA through, so no ifelse() guard is needed. Very small p-values
# therefore print as 0.0000.
ks_tabla_com$D <- round(as.numeric(ks_tabla_com$D), 4)
ks_tabla_com$p_valor <- round(as.numeric(ks_tabla_com$p_valor), 4)

# Show the table
print(ks_tabla_com, row.names = FALSE)

##       variable n_validos      D p_valor note
##   13.Tingresos        91 0.2828  0.0000   NA
##   15.valorfact        91 0.3598  0.0000   NA
##    16.ConsumoE        90 0.1895  0.0031   NA
##           19.B        91 0.2455  0.0000   NA
##            20B        91 0.1899  0.0028   NA
##            21B        91 0.1998  0.0014   NA
##            22B        91 0.2118  0.0006   NA
##            23B        91 0.2134  0.0005   NA
##            24C        91 0.2948  0.0000   NA
##            25C        91 0.2698  0.0000   NA
##            26C        91 0.2733  0.0000   NA
##            27C        91 0.3375  0.0000   NA
##            28D        91 0.4263  0.0000   NA
##            29D        24 0.3757  0.0023   NA
##            30D        24 0.2870  0.0384   NA
##            31D        24 0.3052  0.0229   NA
##            32D        24 0.2333  0.1467   NA
##            33E        91 0.1605  0.0184   NA
##            34E        91 0.2851  0.0000   NA
##            35E        91 0.1898  0.0028   NA
##            36E        91 0.2773  0.0000   NA
##            37F        91 0.2848  0.0000   NA
##            38F        91 0.2234  0.0002   NA
##            39F        91 0.2122  0.0006   NA
##            40F        91 0.2677  0.0000   NA
##            41G        91 0.3523  0.0000   NA
##            42G        91 0.2478  0.0000   NA
##            43G        91 0.1847  0.0040   NA
##            44H        91 0.3061  0.0000   NA
##  45Nivelsatisf        91 0.2431  0.0000   NA
##         7.Edad        91 0.0946  0.3897   NA

2. Análisis Descriptivo variables A

2.1 Análisis Descriptivo para Clientes Residenciales

Descriptivos Cuantitativos (Residenciales)

library(psych)

## Warning: package 'psych' was built under R version 4.5.2

# Descriptive statistics for the five quantitative columns (residential).
describe(
  select(
    datos_res,
    `7.Edad`, `12.Notrabajan`, `13.Tingresos`, `15.valorfact`, `16.ConsumoE`
  )
)

##               vars   n       mean         sd  median    trimmed       mad
## 7.Edad           1 326      45.33      16.78      47      45.05     20.76
## 12.Notrabajan    2 326       1.52       0.84       1       1.37      0.00
## 13.Tingresos     3 326 2251525.77 1435093.66 1611500 1992371.76 575990.10
## 15.valorfact     4 326  118542.37  100474.53   96095  103046.68  60905.21
## 16.ConsumoE      5 323     137.88      80.44     128     131.00     74.13
##                  min      max    range skew kurtosis       se
## 7.Edad            18       84       66 0.04    -1.11     0.93
## 12.Notrabajan      0        7        7 2.55    10.35     0.05
## 13.Tingresos  216000 12000000 11784000 2.51     9.28 79482.49
## 15.valorfact   13030  1112630  1099600 3.98    30.04  5564.77
## 16.ConsumoE        0      508      508 1.00     1.64     4.48

Frecuencias Sociodemográficas (Residenciales)

# Count and share of records for each value of `3.Estrato`.
count(datos_res, `3.Estrato`) %>%
  mutate(Proporcion = n / sum(n))

## # A tibble: 5 × 3
##   `3.Estrato`     n Proporcion
##         <dbl> <int>      <dbl>
## 1           1    96    0.294  
## 2           2   184    0.564  
## 3           3    34    0.104  
## 4           4    10    0.0307 
## 5           5     2    0.00613

# Count and share of records for each value of `6.Genero`.
count(datos_res, `6.Genero`) %>%
  mutate(Proporcion = n / sum(n))

## # A tibble: 3 × 3
##   `6.Genero`     n Proporcion
##        <dbl> <int>      <dbl>
## 1          1   126    0.387  
## 2          2   199    0.610  
## 3          3     1    0.00307

# Count and share of records for each value of `18.Mediofac`.
count(datos_res, `18.Mediofac`) %>%
  mutate(Proporcion = n / sum(n))

## # A tibble: 1 × 3
##   `18.Mediofac`     n Proporcion
##           <dbl> <int>      <dbl>
## 1             1   326          1

Gráficos descriptivos (Residenciales)

library(ggplot2)

## 
## Adjuntando el paquete: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

# Histogram of energy consumption, residential sheet (bins 50 units wide).
ggplot(data = datos_res, mapping = aes(x = `16.ConsumoE`)) +
  geom_histogram(binwidth = 50, fill = "darkgreen", color = "white") +
  labs(
    title = "Distribución del Consumo (Residencial)",
    x = "Consumo de Energía"
  ) +
  theme_minimal()

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Box plot of income by stratum (preliminary bivariate view).
# The stratum is converted to a factor so that each value gets its own box
# and fill colour.
ggplot(
  data = mutate(datos_res, Estrato_Factor = factor(`3.Estrato`)),
  mapping = aes(x = Estrato_Factor, y = `13.Tingresos`, fill = Estrato_Factor)
) +
  geom_boxplot() +
  labs(
    title = "Ingresos (`13.Tingresos`) por Estrato (`3.Estrato`)",
    x = "Estrato",
    y = "Ingresos"
  ) +
  scale_fill_discrete(name = "Estratos") +
  theme_minimal()

Tablas de contingencia (Residenciales)

#Tablas de Contingencia: Estrato vs. Género (Residenciales)

library(gmodels)

## Warning: package 'gmodels' was built under R version 4.5.2

# Contingency table of stratum by gender, restricted to rows where both
# values are present. Only column proportions are requested; row and table
# proportions are switched off.
with(
  filter(datos_res, !is.na(`3.Estrato`), !is.na(`6.Genero`)),
  CrossTable(
    `3.Estrato`, `6.Genero`,
    prop.r = FALSE, prop.c = TRUE, prop.t = FALSE
  )
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  326 
## 
##  
##              | 6.Genero 
##    3.Estrato |         1 |         2 |         3 | Row Total | 
## -------------|-----------|-----------|-----------|-----------|
##            1 |        28 |        68 |         0 |        96 | 
##              |     2.234 |     1.507 |     0.294 |           | 
##              |     0.222 |     0.342 |     0.000 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            2 |        76 |       108 |         0 |       184 | 
##              |     0.335 |     0.166 |     0.564 |           | 
##              |     0.603 |     0.543 |     0.000 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            3 |        13 |        20 |         1 |        34 | 
##              |     0.002 |     0.027 |     7.693 |           | 
##              |     0.103 |     0.101 |     1.000 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            4 |         8 |         2 |         0 |        10 | 
##              |     4.424 |     2.760 |     0.031 |           | 
##              |     0.063 |     0.010 |     0.000 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            5 |         1 |         1 |         0 |         2 | 
##              |     0.067 |     0.040 |     0.006 |           | 
##              |     0.008 |     0.005 |     0.000 |           | 
## -------------|-----------|-----------|-----------|-----------|
## Column Total |       126 |       199 |         1 |       326 | 
##              |     0.387 |     0.610 |     0.003 |           | 
## -------------|-----------|-----------|-----------|-----------|
## 
##

**Análisis bivariado cuantitativo (correlación y regresión simple)**

# Correlation between income and consumption (residential), computed with
# cor()'s default method on the rows where both values are present.
coef_correlacion <- with(
  datos_res,
  cor(`13.Tingresos`, `16.ConsumoE`, use = "pairwise.complete.obs")
)

# Show the coefficient
print(coef_correlacion)

## [1] 0.1747066

2.2 Análisis Descriptivo para Clientes comerciales

# Descriptive statistics for the five quantitative columns (commercial).
describe(
  select(
    datos_com,
    `7.Edad`, `12.Notrabajan`, `13.Tingresos`, `15.valorfact`, `16.ConsumoE`
  )
)

##               vars  n       mean         sd  median    trimmed        mad
## 7.Edad           1 91      44.25      13.40      45      44.15      14.83
## 12.Notrabajan    2 91       2.09       1.31       2       1.85       1.48
## 13.Tingresos     3 91 3461798.79 4632142.03 2000000 2626621.78 1186080.00
## 15.valorfact     4 91  590989.96 1616029.40  261330  363343.29  256697.36
## 16.ConsumoE      5 90     324.36     312.53     221     272.19     202.37
##                  min      max    range skew kurtosis        se
## 7.Edad            19       73       54 0.03    -0.93      1.40
## 12.Notrabajan      1        7        6 1.59     2.36      0.14
## 13.Tingresos  800000 40000000 39200000 5.69    40.27 485580.43
## 15.valorfact   10850 15292220 15281370 8.28    72.28 169405.91
## 16.ConsumoE        0     1765     1765 1.77     3.87     32.94

Distribución de frecuencias (Comercial)

# Count and share of records for each value of `6.Genero` (commercial).
count(datos_com, `6.Genero`) %>%
  mutate(Proporcion = n / sum(n))

## # A tibble: 2 × 3
##   `6.Genero`     n Proporcion
##        <dbl> <int>      <dbl>
## 1          1    41      0.451
## 2          2    50      0.549

# Count and share of records for each value of `12.Notrabajan` (commercial).
count(datos_com, `12.Notrabajan`) %>%
  mutate(Proporcion = n / sum(n))

## # A tibble: 7 × 3
##   `12.Notrabajan`     n Proporcion
##             <dbl> <int>      <dbl>
## 1               1    35     0.385 
## 2               2    36     0.396 
## 3               3     7     0.0769
## 4               4     7     0.0769
## 5               5     3     0.0330
## 6               6     2     0.0220
## 7               7     1     0.0110

Histograma de consumo de energía (Comercial)

# Histogram of energy consumption, commercial sheet (bins 500 units wide).
# The title and axis label are written without accented characters on
# purpose, as a precaution.
ggplot(data = datos_com, mapping = aes(x = `16.ConsumoE`)) +
  geom_histogram(binwidth = 500, fill = "darkblue", color = "white") +
  labs(
    title = "Distribucion del Consumo (Comercial)",
    x = "Consumo de Energia (16.ConsumoE)"
  ) +
  theme_minimal()

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Análisis bivariado: correlación y regresión (Comercial)

Se evaluó la relación entre Ingresos y Consumo de Energía, y se ajustó un modelo de regresión lineal simple para explorar cómo varía el Consumo según la Edad:

# Correlation between income and consumption (commercial), computed with
# cor()'s default method on the rows where both values are present.
coef_correlacion_com <- with(
  datos_com,
  cor(`13.Tingresos`, `16.ConsumoE`, use = "pairwise.complete.obs")
)
print(coef_correlacion_com)

## [1] 0.1291277

# Simple linear regression (commercial): energy consumption explained by age.
modelo_com <- lm(
  formula = `16.ConsumoE` ~ `7.Edad`,
  data = datos_com
)
summary(modelo_com)

## 
## Call:
## lm(formula = `16.ConsumoE` ~ `7.Edad`, data = datos_com)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -328.98 -205.42 -112.12   92.51 1427.10 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  253.653    115.967   2.187   0.0314 *
## `7.Edad`       1.590      2.499   0.636   0.5264  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 313.6 on 88 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.004576,   Adjusted R-squared:  -0.006735 
## F-statistic: 0.4046 on 1 and 88 DF,  p-value: 0.5264

# Scatter plot of age against energy consumption (commercial clients) with the
# fitted least-squares line drawn in red and no confidence band.
ggplot(data = datos_com, mapping = aes(x = `7.Edad`, y = `16.ConsumoE`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  theme_minimal() +
  labs(
    title = "Relación entre Edad y Consumo de Energía (Comercial)",
    x = "Edad",
    y = "Consumo de Energía"
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

3. Modelos de regresión multivariada

3.1 Regresión Multivariada para Clientes Residenciales

# Multivariate linear regression for residential clients: `16.ConsumoE`
# regressed on the 27 survey items from `19.B` through `45Nivelsatisf`.
# Rows with a missing value in any of these columns are dropped by lm().
modelo_res <- lm(
  formula = `16.ConsumoE` ~
    `19.B` + `20B` + `21B` + `22B` + `23B` +
    `24C` + `25C` + `26C` + `27C` +
    `28D` + `29D` + `30D` + `31D` + `32D` +
    `33E` + `34E` + `35E` + `36E` +
    `37F` + `38F` + `39F` + `40F` +
    `41G` + `42G` + `43G` +
    `44H` + `45Nivelsatisf`,
  data = datos_res
)

# Coefficient table and fit statistics.
summary(modelo_res)

## 
## Call:
## lm(formula = `16.ConsumoE` ~ `19.B` + `20B` + `21B` + `22B` + 
##     `23B` + `24C` + `25C` + `26C` + `27C` + `28D` + `29D` + `30D` + 
##     `31D` + `32D` + `33E` + `34E` + `35E` + `36E` + `37F` + `38F` + 
##     `39F` + `40F` + `41G` + `42G` + `43G` + `44H` + `45Nivelsatisf`, 
##     data = datos_res)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.966 -39.904  -9.013  35.836 119.787 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     226.6102   190.5855   1.189   0.2484  
## `19.B`           29.3247    23.9160   1.226   0.2344  
## `20B`           -52.4925    19.6398  -2.673   0.0146 *
## `21B`            11.8622    19.5804   0.606   0.5514  
## `22B`            -2.6374    20.0636  -0.131   0.8967  
## `23B`           -22.8097    17.6082  -1.295   0.2099  
## `24C`            25.6504    22.0588   1.163   0.2586  
## `25C`            -8.8826     9.9365  -0.894   0.3820  
## `26C`           -11.4850    17.0489  -0.674   0.5082  
## `27C`             4.1335    29.3757   0.141   0.8895  
## `28D`           -57.9422    90.0702  -0.643   0.5273  
## `29D`           -12.3968    21.5653  -0.575   0.5718  
## `30D`            39.1304    28.6903   1.364   0.1878  
## `31D`            13.2739    19.5384   0.679   0.5047  
## `32D`             2.2678    16.3772   0.138   0.8913  
## `33E`            -3.5287    16.3088  -0.216   0.8309  
## `34E`            10.0350    21.8684   0.459   0.6513  
## `35E`             0.7101    16.6153   0.043   0.9663  
## `36E`           -15.1755    12.1806  -1.246   0.2272  
## `37F`           -16.2614    19.4814  -0.835   0.4137  
## `38F`            -4.6502    31.0871  -0.150   0.8826  
## `39F`           -17.1451    16.0631  -1.067   0.2985  
## `40F`            22.9441    25.3607   0.905   0.3764  
## `41G`            16.7367    13.8109   1.212   0.2397  
## `42G`           -25.5050    10.9630  -2.326   0.0306 *
## `43G`            12.4242    16.6489   0.746   0.4642  
## `44H`           -25.7842    15.3669  -1.678   0.1089  
## `45Nivelsatisf`  -1.3680    20.1422  -0.068   0.9465  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.72 on 20 degrees of freedom
##   (278 observations deleted due to missingness)
## Multiple R-squared:  0.6731, Adjusted R-squared:  0.2317 
## F-statistic: 1.525 on 27 and 20 DF,  p-value: 0.167

3.2 Regresión Multivariada para Clientes Comerciales

library(car)

## Warning: package 'car' was built under R version 4.5.2

## Cargando paquete requerido: carData

## Warning: package 'carData' was built under R version 4.5.2

## 
## Adjuntando el paquete: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:dplyr':
## 
##     recode

# Candidate predictors: survey items `19.B` through `45Nivelsatisf`.
vars_independientes <- c(
  "19.B", "20B", "21B", "22B", "23B",
  "24C", "25C", "26C", "27C",
  "28D", "29D", "30D", "31D", "32D",
  "33E", "34E", "35E", "36E",
  "37F", "38F", "39F", "40F",
  "41G", "42G", "43G",
  "44H", "45Nivelsatisf"
)

# Step 1: keep only the predictors whose share of missing values in datos_com
# is at most 20%.
na_pct <- colSums(is.na(datos_com[, vars_independientes])) / nrow(datos_com)
vars_viables <- names(na_pct)[na_pct <= 0.2]

# Step 2: response plus retained predictors, restricted to complete rows.
datos_completo <- na.omit(
  select(datos_com, `16.ConsumoE`, all_of(vars_viables))
)

# Step 3: fit the multivariate model on every remaining column.
# NOTE: this reuses the name modelo_com, replacing the age-only model fitted
# earlier in the document.
modelo_com <- lm(formula = `16.ConsumoE` ~ ., data = datos_completo)

# Step 4: check collinearity through variance inflation factors.
vif_vals <- vif(modelo_com)
vif_vals  # drop predictors with VIF > 10 if necessary

##          `19.B`           `20B`           `21B`           `22B`           `23B` 
##        6.405919        5.995460        3.816654        4.240240        1.694928 
##           `24C`           `25C`           `26C`           `27C`           `28D` 
##        1.343931        1.763678        2.166284        3.128025        1.592057 
##           `33E`           `34E`           `35E`           `36E`           `37F` 
##        1.797122        4.343088        1.954833        1.667214        1.716159 
##           `38F`           `39F`           `40F`           `41G`           `42G` 
##        2.333979        1.775202        1.558453        1.318453        1.586325 
##           `43G`           `44H` `45Nivelsatisf` 
##        1.719814        1.537094        1.962456

# Step 5: coefficient table and fit statistics for the commercial model
summary(modelo_com)

## 
## Call:
## lm(formula = `16.ConsumoE` ~ ., data = datos_completo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -404.34 -179.99  -57.96  140.66 1173.18 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     419.6885   413.6063   1.015   0.3140  
## `19.B`          -96.6265    76.6518  -1.261   0.2119  
## `20B`            42.5905    68.8242   0.619   0.5382  
## `21B`            58.2970    53.3407   1.093   0.2784  
## `22B`           -73.5261    57.7921  -1.272   0.2077  
## `23B`            13.8064    34.8293   0.396   0.6931  
## `24C`           -43.4368    41.6386  -1.043   0.3007  
## `25C`             2.3831    23.1454   0.103   0.9183  
## `26C`            22.9697    42.8230   0.536   0.5935  
## `27C`            15.5500    83.8297   0.185   0.8534  
## `28D`             0.7271    86.7091   0.008   0.9933  
## `33E`            28.6388    31.3815   0.913   0.3648  
## `34E`            -4.8553    77.4319  -0.063   0.9502  
## `35E`             3.7775    35.0049   0.108   0.9144  
## `36E`           -25.5276    25.1061  -1.017   0.3130  
## `37F`            -0.9995    41.4796  -0.024   0.9808  
## `38F`            84.3886    49.2020   1.715   0.0910 .
## `39F`            17.1250    27.6129   0.620   0.5373  
## `40F`            17.6740    31.5095   0.561   0.5768  
## `41G`            18.8281    27.7475   0.679   0.4998  
## `42G`             2.6417    25.6114   0.103   0.9182  
## `43G`           -15.5246    28.1120  -0.552   0.5826  
## `44H`           -81.0116    41.8058  -1.938   0.0569 .
## `45Nivelsatisf` -40.8014    57.9926  -0.704   0.4842  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 323.1 on 66 degrees of freedom
## Multiple R-squared:  0.2076, Adjusted R-squared:  -0.06849 
## F-statistic: 0.752 on 23 and 66 DF,  p-value: 0.7741

4. Análisis factorial - Mapas de posicionamiento

4.1 Análisis factorial - Mapas de posicionamiento clientes residenciales

library(FactoMineR)  # Para análisis factorial

## Warning: package 'FactoMineR' was built under R version 4.5.2

library(factoextra)  # Para gráficos de posicionamiento

## Warning: package 'factoextra' was built under R version 4.5.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Survey items used in the factorial analysis (`19.B` to `45Nivelsatisf`).
vars_factoriales <- c(
  "19.B", "20B", "21B", "22B", "23B",
  "24C", "25C", "26C", "27C",
  "28D", "29D", "30D", "31D", "32D",
  "33E", "34E", "35E", "36E",
  "37F", "38F", "39F", "40F",
  "41G", "42G", "43G",
  "44H", "45Nivelsatisf"
)

# Residential data restricted to those items, keeping only complete rows.
datos_res_factor <- na.omit(select(datos_res, all_of(vars_factoriales)))

Análisis Factorial de Componentes Principales (PCA)

# Principal component analysis of the residential items; the built-in
# FactoMineR plots are suppressed.
pca_res <- PCA(X = datos_res_factor, graph = FALSE)

# Eigenvalue, percentage of variance and cumulative percentage per dimension.
eig_res <- get_eigenvalue(pca_res)
eig_res

##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1  6.26918011       23.2191856                    23.21919
## Dim.2  3.39533221       12.5753045                    35.79449
## Dim.3  2.78671685       10.3211735                    46.11566
## Dim.4  1.96466190        7.2765255                    53.39219
## Dim.5  1.77781488        6.5844995                    59.97669
## Dim.6  1.61763382        5.9912364                    65.96793
## Dim.7  1.23448946        4.5721832                    70.54011
## Dim.8  1.12284484        4.1586846                    74.69879
## Dim.9  0.99435891        3.6828108                    78.38160
## Dim.10 0.96271464        3.5656098                    81.94721
## Dim.11 0.74317558        2.7525021                    84.69972
## Dim.12 0.65714333        2.4338642                    87.13358
## Dim.13 0.54829475        2.0307213                    89.16430
## Dim.14 0.47260468        1.7503877                    90.91469
## Dim.15 0.43493424        1.6108676                    92.52556
## Dim.16 0.39145691        1.4498404                    93.97540
## Dim.17 0.30119248        1.1155277                    95.09092
## Dim.18 0.25254677        0.9353584                    96.02628
## Dim.19 0.24453559        0.9056874                    96.93197
## Dim.20 0.17651571        0.6537619                    97.58573
## Dim.21 0.16162993        0.5986294                    98.18436
## Dim.22 0.12294465        0.4553506                    98.63971
## Dim.23 0.09491442        0.3515349                    98.99125
## Dim.24 0.08754056        0.3242243                    99.31547
## Dim.25 0.08382281        0.3104549                    99.62593
## Dim.26 0.05676010        0.2102226                    99.83615
## Dim.27 0.04423986        0.1638513                   100.00000

Mapas de posicionamiento

# Variable map for the residential PCA: each item is coloured by its
# contribution on a blue-yellow-red gradient, with label repelling enabled.
fviz_pca_var(
  pca_res,
  col.var = "contrib",
  gradient.cols = c("blue", "yellow", "red"),
  repel = TRUE
)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Individual (client) map for the residential PCA: clients are drawn as points
# coloured by cos2 (quality of representation) on a blue-yellow-red gradient.
fviz_pca_ind(
  pca_res,
  geom.ind = "point",
  col.ind = "cos2",
  gradient.cols = c("blue", "yellow", "red"),
  repel = TRUE
)

4.2 Análisis factorial - Mapas de posicionamiento clientes comerciales

# Commercial data restricted to the factorial items, keeping only complete rows.
# NOTE(review): the commercial regression in section 3.2 excluded 29D-32D for
# exceeding 20% missing values in datos_com; na.omit() here requires all 27
# items to be present, so the usable sample may be small - confirm.
datos_com_factor <- na.omit(select(datos_com, all_of(vars_factoriales)))

# Principal component analysis of the commercial items, without the built-in
# FactoMineR plots.
pca_com <- PCA(X = datos_com_factor, graph = FALSE)

# Variable map: items coloured by contribution.
fviz_pca_var(
  pca_com,
  col.var = "contrib",
  gradient.cols = c("blue", "yellow", "red"),
  repel = TRUE
)

# Individual (client) map: points coloured by cos2.
fviz_pca_ind(
  pca_com,
  geom.ind = "point",
  col.ind = "cos2",
  gradient.cols = c("blue", "yellow", "red"),
  repel = TRUE
)