1 Configuración y Carga de Datos

##### UNIVERSIDAD CENTRAL DEL ECUADOR #####
#### AUTOR: MARTIN SARMIENTO ####
### CARRERA: INGENIERÍA EN PETRÓLEOS #####


#### VARIABLE DISTANCIA A LA VÍA (dist_to_road) ####
## DATASET ##
# Load the dataset: a semicolon-delimited CSV with decimal commas, decoded as
# latin1. read.csv2() already uses sep = ";" and dec = "," as its defaults.
# NOTE(review): str() below reports the first column as "ï..OBJECTID", which
# looks like a byte-order mark read as text -- confirm the file's real encoding.
Datos <- read.csv2("Dataset_Mundial_Final.csv", fileEncoding = "latin1")
# Inspect the structure of the loaded data frame
str(Datos)
## 'data.frame':    58978 obs. of  29 variables:
##  $ ï..OBJECTID           : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ code                  : chr  "00001-AFG-P" "00002-AFG-P" "00003-AFG-P" "00004-AFG-P" ...
##  $ plant_name            : chr  "Badghis Solar Power Plant" "Balkh solar farm" "Behsood solar farm" "Dab Pal 4 solar farm" ...
##  $ country               : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ operational_status    : chr  "cancelled - inferred 4 y" "cancelled - inferred 4 y" "cancelled - inferred 4 y" "shelved - inferred 2 y" ...
##  $ longitude             : num  62.9 67.1 70.4 66.2 65.7 ...
##  $ latitude              : num  35.1 36.7 34.4 33.8 31.7 ...
##  $ elevation             : int  918 359 629 2288 1060 1060 1392 398 410 1012 ...
##  $ area                  : num  6.74 10.72 487.73 111.8 1929.96 ...
##  $ size                  : chr  "Small" "Small" "Small" "Small" ...
##  $ slope                 : num  7.38 0.49 1.1 6.16 1.23 ...
##  $ slope_type            : chr  "Moderado" "Plano o casi plano" "Plano o casi plano" "Moderado" ...
##  $ curvature             : num  -0.024 0 0 0.045 -0.005 -0.005 -0.015 0 0 -0.009 ...
##  $ curvature_type        : chr  "Superficies cóncavas / Valles" "Superficies planas o intermedias" "Superficies planas o intermedias" "Superficies convexas / Crestas" ...
##  $ aspect                : num  96.8 358.5 36.2 305.8 248.4 ...
##  $ aspect_type           : chr  "East" "North" "Northeast" "Northwest" ...
##  $ dist_to_road          : num  7037.1 92.7 112.1 1705.3 115.8 ...
##  $ ambient_temperature   : num  14.4 17.88 21.32 8.86 19.64 ...
##  $ ghi                   : num  5.82 5.58 5.8 6.75 6.62 ...
##  $ humidity              : num  47.7 42.3 36.4 37.3 24.2 ...
##  $ wind_speed            : num  0.039 0.954 0.234 0.943 0.37 ...
##  $ wind_direction        : num  187.5 207.4 255.6 160.3 97.7 ...
##  $ dt_wind               : chr  "South" "Southwest" "West" "South" ...
##  $ solar_aptitude        : num  0.72 0.635 0.685 0.659 0.819 0.819 0.818 0.642 0.63 0.374 ...
##  $ solar_aptitude_rounded: int  7 6 7 7 8 8 8 6 6 4 ...
##  $ solar_aptittude_class : chr  "Alta" "Alta" "Alta" "Alta" ...
##  $ capacity              : num  32 40 60 3000 100 100 36 50 25 100 ...
##  $ optimal_tilt          : num  30 31 31.1 33 31 ...
##  $ pv_potential          : num  4.61 4.41 4.57 5.42 5.17 ...
# Cargamos las librerias
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gt)
library(e1071)

2 Cálculo de Intervalos y Frecuencias

# Variable under study: dist_to_road with missing values removed
Variable <- na.omit(Datos[["dist_to_road"]])
N <- length(Variable)

# DECIMAL CLASS LIMITS
# Classes of equal width spanning exactly [min, max]; the number of classes
# comes from Sturges' rule, k = floor(1 + 3.322 * log10(N)).
extremos_dec <- range(Variable)
min_dec <- extremos_dec[1]
max_dec <- extremos_dec[2]
k_dec <- floor(1 + 3.322 * log10(N))
rango_dec <- diff(extremos_dec)
amplitud_dec <- rango_dec / k_dec

# k_dec + 1 equally spaced breaks; the last one is nudged slightly above the
# maximum so the largest observation cannot fall outside the final class
cortes_dec <- seq(from = min_dec, to = max_dec, length.out = k_dec + 1)
cortes_dec[k_dec + 1] <- max_dec + 0.0001

# Absolute frequencies per left-closed class [Li, Ls)
inter_dec <- cut(Variable, breaks = cortes_dec, right = FALSE, include.lowest = TRUE)
ni_dec <- tabulate(inter_dec, nbins = nlevels(inter_dec))

# Relative (%) and cumulative frequencies, ascending and descending
hi_dec <- (ni_dec / N) * 100
Ni_asc_dec <- cumsum(ni_dec)
Ni_desc_dec <- rev(cumsum(rev(ni_dec)))
Hi_asc_dec <- cumsum(hi_dec)
Hi_desc_dec <- rev(cumsum(rev(hi_dec)))

# Frequency table with decimal limits: lower limit, upper limit, class mark
# and the frequency columns computed above
lim_inf_dec <- cortes_dec[-length(cortes_dec)]
lim_sup_dec <- cortes_dec[-1]
TDF_Decimal <- data.frame(
  Li = lim_inf_dec,
  Ls = lim_sup_dec,
  MC = (lim_inf_dec + lim_sup_dec) / 2,
  ni = ni_dec,
  hi = hi_dec,
  Ni_asc = Ni_asc_dec,
  Ni_desc = Ni_desc_dec,
  Hi_asc = Hi_asc_dec,
  Hi_desc = Hi_desc_dec
)


# INTEGER CLASS LIMITS
# Second frequency table whose class limits are "round" numbers: the class
# width is rounded up to a multiple of 10 and the classes start at 0.
stopifnot(length(Variable) > 0)

# Suggested width: integer range of the data divided by Sturges' class count
min_int <- floor(min(Variable))
max_int <- ceiling(max(Variable))
k_int_sug <- floor(1 + 3.322 * log10(N))
Rango_int <- max_int - min_int
Amplitud_raw <- Rango_int / k_int_sug

# Round the width up to the next multiple of 10; a zero range falls back to 10
Amplitud_int <- ceiling(Amplitud_raw / 10) * 10
if (Amplitud_int == 0) Amplitud_int <- 10

# Origin of the classes. Non-negative data keep the origin at 0, as before.
# If the data contain negative values, the origin moves down to the multiple
# of the width just below the minimum, so no observation is left unclassified.
min_int <- min(0, floor(min(Variable) / Amplitud_int) * Amplitud_int)

# Exact number of classes needed for the last break to lie strictly above the
# maximum. This replaces generating a fixed surplus of breaks and trimming
# them, which could stop short of the data when the values lie far from 0.
K_real <- floor((max(Variable) - min_int) / Amplitud_int) + 1
cortes_int <- seq(from = min_int, by = Amplitud_int, length.out = K_real + 1)

lim_inf_int <- cortes_int[seq_len(K_real)]
lim_sup_int <- cortes_int[-1]

# Absolute frequencies per left-closed class [Li, Ls)
inter_int <- cut(Variable, breaks = cortes_int, include.lowest = TRUE, right = FALSE)
ni_int <- as.vector(table(inter_int))

# Every observation must land in a class, otherwise the percentages below
# (which divide by N) would not add up to 100
stopifnot(sum(ni_int) == N)

# Relative (%) and cumulative frequencies, ascending and descending
hi_int <- (ni_int / N) * 100
Ni_asc_int <- cumsum(ni_int)
Hi_asc_int <- cumsum(hi_int)
Ni_desc_int <- rev(cumsum(rev(ni_int)))
Hi_desc_int <- rev(cumsum(rev(hi_int)))

# Frequency table with integer limits
TDF_Enteros <- data.frame(
  Li = lim_inf_int,
  Ls = lim_sup_int,
  MC = (lim_inf_int + lim_sup_int) / 2,
  ni = ni_int,
  hi = hi_int,
  Ni_asc = Ni_asc_int,
  Ni_desc = Ni_desc_int,
  Hi_asc = Hi_asc_int,
  Hi_desc = Hi_desc_int)

3 Tabla de Distribución de Frecuencias

3.1 Tabla con Límites Decimales

# Display version of the decimal-limit table: limits, class marks and
# percentages are rounded to 2 decimals, then every column is turned into text
# so that a "TOTAL" row can be appended underneath.
cols_redondeo_dec <- c("Li", "Ls", "MC", "hi", "Hi_asc", "Hi_desc")
TDF_Dec_Final <- TDF_Decimal
TDF_Dec_Final[cols_redondeo_dec] <- lapply(TDF_Dec_Final[cols_redondeo_dec], round, digits = 2)
TDF_Dec_Final[] <- lapply(TDF_Dec_Final, as.character)

# Totals row: only the absolute and relative frequencies are summed
totales_dec <- c(
  "TOTAL", "-", "-",
  sum(TDF_Decimal$ni),
  round(sum(TDF_Decimal$hi), 2),
  rep("-", 4)
)
TDF_Dec_Final <- rbind(TDF_Dec_Final, totales_dec)

# Render with gt, one formatting step at a time
tabla_dec_gt <- gt(TDF_Dec_Final)
tabla_dec_gt <- tab_header(
  tabla_dec_gt,
  title = md("**Tabla N°1 de Distribución de Frecuencias de Distancia de la via (m)**")
)
tabla_dec_gt <- tab_source_note(tabla_dec_gt, source_note = "Autor: Martin Sarmiento")
tabla_dec_gt <- cols_label(
  tabla_dec_gt,
  Li = "Lim. Inf",
  Ls = "Lim. Sup",
  MC = "Marca Clase",
  ni = "Frec. Abs (ni)",
  hi = "Frec. Rel (%)",
  Ni_asc = "Ni (Asc)",
  Ni_desc = "Ni (Desc)",
  Hi_asc = "Hi Asc (%)",
  Hi_desc = "Hi Desc (%)"
)
tabla_dec_gt <- cols_align(tabla_dec_gt, align = "center", columns = everything())
tabla_dec_gt <- tab_options(
  tabla_dec_gt,
  heading.title.font.size = px(14),
  column_labels.background.color = "#F0F0F0"
)
tabla_dec_gt
Tabla N°1 de Distribución de Frecuencias de Distancia de la via (m)
Lim. Inf Lim. Sup Marca Clase Frec. Abs (ni) Frec. Rel (%) Ni (Asc) Ni (Desc) Hi Asc (%) Hi Desc (%)
0 143478.45 71739.23 58944 99.94 58944 58978 99.94 100
143478.45 286956.91 215217.68 7 0.01 58951 34 99.95 0.06
286956.91 430435.36 358696.13 1 0 58952 27 99.96 0.05
430435.36 573913.81 502174.59 0 0 58952 26 99.96 0.04
573913.81 717392.27 645653.04 2 0 58954 26 99.96 0.04
717392.27 860870.72 789131.49 1 0 58955 24 99.96 0.04
860870.72 1004349.17 932609.95 0 0 58955 23 99.96 0.04
1004349.17 1147827.63 1076088.4 3 0.01 58958 23 99.97 0.04
1147827.63 1291306.08 1219566.85 2 0 58960 20 99.97 0.03
1291306.08 1434784.53 1363045.31 0 0 58960 18 99.97 0.03
1434784.53 1578262.99 1506523.76 1 0 58961 18 99.97 0.03
1578262.99 1721741.44 1650002.21 11 0.02 58972 17 99.99 0.03
1721741.44 1865219.89 1793480.67 1 0 58973 6 99.99 0.01
1865219.89 2008698.35 1936959.12 1 0 58974 5 99.99 0.01
2008698.35 2152176.8 2080437.57 1 0 58975 4 99.99 0.01
2152176.8 2295655.26 2223916.03 3 0.01 58978 3 100 0.01
TOTAL - - 58978 100 - - - -
Autor: Martin Sarmiento

3.2 Tabla con Límites Enteros

# Display version of the integer-limit table: only the percentage columns are
# rounded (to 2 decimals); every column is then turned into text so that a
# "TOTAL" row can be appended underneath.
cols_redondeo_int <- c("hi", "Hi_asc", "Hi_desc")
TDF_Int_Final <- TDF_Enteros
TDF_Int_Final[cols_redondeo_int] <- lapply(TDF_Int_Final[cols_redondeo_int], round, digits = 2)
TDF_Int_Final[] <- lapply(TDF_Int_Final, as.character)

# Totals row: only the absolute and relative frequencies are summed
totales_int <- c(
  "TOTAL", "-", "-",
  sum(TDF_Enteros$ni),
  round(sum(TDF_Enteros$hi), 2),
  rep("-", 4)
)
TDF_Int_Final <- rbind(TDF_Int_Final, totales_int)

# Render with gt, one formatting step at a time
tabla_int_gt <- gt(TDF_Int_Final)
tabla_int_gt <- tab_header(
  tabla_int_gt,
  title = md("**Tabla N°2 de Distribución de Frecuencias de Distancia de la via (m)**")
)
tabla_int_gt <- tab_source_note(tabla_int_gt, source_note = "Autor: Martin Sarmiento")
tabla_int_gt <- cols_label(
  tabla_int_gt,
  Li = "Lim. Inf",
  Ls = "Lim. Sup",
  MC = "Marca Clase",
  ni = "Frec. Abs (ni)",
  hi = "Frec. Rel (%)",
  Ni_asc = "Ni (Asc)",
  Ni_desc = "Ni (Desc)",
  Hi_asc = "Hi Asc (%)",
  Hi_desc = "Hi Desc (%)"
)
tabla_int_gt <- cols_align(tabla_int_gt, align = "center", columns = everything())
tabla_int_gt <- tab_options(
  tabla_int_gt,
  heading.title.font.size = px(14),
  column_labels.background.color = "#F0F0F0"
)
tabla_int_gt
Tabla N°2 de Distribución de Frecuencias de Distancia de la via (m)
Lim. Inf Lim. Sup Marca Clase Frec. Abs (ni) Frec. Rel (%) Ni (Asc) Ni (Desc) Hi Asc (%) Hi Desc (%)
0 143480 71740 58944 99.94 58944 58978 99.94 100
143480 286960 215220 7 0.01 58951 34 99.95 0.06
286960 430440 358700 1 0 58952 27 99.96 0.05
430440 573920 502180 0 0 58952 26 99.96 0.04
573920 717400 645660 2 0 58954 26 99.96 0.04
717400 860880 789140 1 0 58955 24 99.96 0.04
860880 1004360 932620 0 0 58955 23 99.96 0.04
1004360 1147840 1076100 3 0.01 58958 23 99.97 0.04
1147840 1291320 1219580 2 0 58960 20 99.97 0.03
1291320 1434800 1363060 0 0 58960 18 99.97 0.03
1434800 1578280 1506540 1 0 58961 18 99.97 0.03
1578280 1721760 1650020 11 0.02 58972 17 99.99 0.03
1721760 1865240 1793500 1 0 58973 6 99.99 0.01
1865240 2008720 1936980 1 0 58974 5 99.99 0.01
2008720 2152200 2080460 1 0 58975 4 99.99 0.01
2152200 2295680 2223940 3 0.01 58978 3 100 0.01
TOTAL - - 58978 100 - - - -
Autor: Martin Sarmiento

4 Análisis Gráfico

4.1 Histogramas de Cantidad

# Absolute-frequency histograms built from the integer-limit table.
# Bars are adjacent (space = 0) and labelled with the class marks, drawn
# perpendicular to the axis (las = 2). Axis titles and the figure caption are
# added with mtext() so they can be pushed clear of the rotated labels.

# Graph 1: y axis scaled to the tallest bar plus 20 % headroom
par(mar = c(8, 7, 5, 2)) 
barplot(TDF_Enteros$ni, 
        names.arg = TDF_Enteros$MC,
        main = "",
        xlab = "", 
        ylab = "",
        col = "#EED5D2",
        ylim = c(0, max(TDF_Enteros$ni) * 1.2),
        space = 0, 
        las = 2, 
        cex.names = 0.7)
mtext("Cantidad", side = 2, line = 4.5, cex = 1, font = 1)
mtext("Distancia de la via (m)", side = 1, line = 4)

mtext("Gráfica N°1: Distribución de Cantidad de Plantas Solares por Distancia de la via", 
      side = 3, 
      line = 2, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

# Graph 2: same bars, y axis running up to the total number of observations.
# The upper limit is taken from the table instead of a hard-coded sample size,
# so the plot stays correct if the dataset changes.
par(mar = c(8, 7, 5, 2))
barplot(TDF_Enteros$ni, 
        main="",
        xlab = "",
        ylab = "",
        names.arg = TDF_Enteros$MC,
        col = "#EED5D2",
        space = 0,
        cex.names = 0.7,
        las = 2,
        ylim = c(0, sum(TDF_Enteros$ni))) 
mtext("Cantidad", side = 2, line = 4.5, cex = 1, font = 1)
mtext("Distancia de la via (m)", side = 1, line = 4)

mtext("Gráfica N°2: Distribución de Cantidad de Plantas Solares por Distancia de la via", 
      side = 3, 
      line = 2, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

4.2 Histogramas Porcentuales

# Percentage histograms built from the integer-limit table. Both figures share
# the same drawing recipe and differ only in the y-axis upper limit and in the
# caption, so the recipe lives in one local helper.
#
# limite_y: upper limit of the y axis.
# titulo:   caption written above the plot.
# Returns the bar midpoints given by barplot().
dibujar_hist_pct <- function(limite_y, titulo) {
  par(mar = c(8, 5, 5, 2))
  centros <- barplot(
    TDF_Enteros$hi,
    names.arg = TDF_Enteros$MC,
    col = "#EED5D2",
    space = 0,
    las = 2,
    cex.names = 0.7,
    main = "",
    xlab = "",
    ylab = "Porcentaje (%)",
    ylim = c(0, limite_y)
  )
  mtext("Distancia de la via (m)", side = 1, line = 4)
  mtext(titulo, side = 3, line = 2, adj = 0.5, cex = 0.9, font = 2)
  # Percentage label just above each bar
  text(
    x = centros,
    y = TDF_Enteros$hi,
    labels = paste0(round(TDF_Enteros$hi, 2), "%"),
    pos = 3, cex = 0.6, col = "black"
  )
  centros
}

# Graph 3: y axis scaled to the tallest bar plus 20 % headroom
bp3 <- dibujar_hist_pct(
  max(TDF_Enteros$hi) * 1.2,
  "Gráfica N°3: Distribución Porcentual de Plantas Solares por Distancia de la via"
)

# Graph 4: y axis fixed at 0-110
bp4 <- dibujar_hist_pct(
  110,
  "Gráfica N°4: Distribución Porcentual de Plantas Solares por Distancia de la via"
)

4.3 Diagrama de Cajas (Boxplot)

# Horizontal boxplot of the NA-free values of the variable
par(mar = c(5, 5, 4, 2))
titulo_caja <- "Gráfica N°5: Distribución de la Distancia de la via en las Plantas Solares"
boxplot(
  Variable,
  main = titulo_caja,
  cex.main = 0.9,
  xlab = "Distancia de la via (m)",
  col = "#EED5D2",
  horizontal = TRUE
)

4.4 Ojivas

# Ogives (cumulative-frequency curves) from the integer-limit table.
# Clipping is no longer disabled globally: with par(xpd = TRUE) the grid lines
# were drawn across the figure margins. Only the legend is allowed outside the
# plot region (xpd = TRUE passed to legend()).
par(mar = c(5, 5, 7, 10))

# Coordinates: the ascending curve is plotted against the upper class limits,
# the descending curve against the lower class limits
x_asc <- TDF_Enteros$Ls
x_desc <- TDF_Enteros$Li
y_asc <- TDF_Enteros$Ni_asc
y_desc <- TDF_Enteros$Ni_desc

# 1. Ascending ogive. The grid is drawn through panel.first so it sits behind
#    the points and lines instead of being painted over them.
plot(x_asc, y_asc,
     type = "b", 
     main = "",
     xlab = "Distancia de la via (m)",
     ylab = "Frecuencia acumulada",
     col = "black",
     pch = 19, 
     xlim = c(min(TDF_Enteros$Li), max(x_asc)), 
     ylim = c(0, sum(TDF_Enteros$ni)),
     bty = "l",
     panel.first = grid()
)

# 2. Descending ogive
lines(x_desc, y_desc, col = "#CDB7B5", type = "b", pch = 19)

mtext("Gráfica N°6: Ojivas Ascendentes y Descendentes de la\nDistribución de la Distancia de la via en las Plantas Solares", 
      side = 3, 
      line = 3, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

# Legend symbols use pch = 19 so they match the filled points of both curves
legend("right", 
       legend = c("Ascendente", "Descendente"), 
       col = c("black", "#CDB7B5"), 
       lty = 1, 
       pch = 19, 
       cex = 0.6, 
       inset = c(0.05, 0.05),
       bty = "n",
       xpd = TRUE)

5 Indicadores Estadísticos

# Summary indicators of the variable (central tendency, dispersion, shape and
# outliers), collected in a one-row table and rendered with gt.

## CENTRAL TENDENCY
# Arithmetic mean (rounded for display)
media <- round(mean(Variable), 2)

# Median (rounded for display)
mediana <- round(median(Variable), 2)

# Mode: class mark(s) of the most frequent class(es) of the integer-limit table
max_frecuencia <- max(TDF_Enteros$ni)
moda_vals <- TDF_Enteros$MC[TDF_Enteros$ni == max_frecuencia]
moda_txt <- paste(round(moda_vals, 2), collapse = ", ")

## DISPERSION
# Variance
varianza <- var(Variable)

# Standard deviation
sd_val <- sd(Variable)

# Coefficient of variation (%). It is computed from the unrounded mean: using
# the 2-decimal rounded mean loses precision and yields Inf whenever the mean
# rounds to 0.
cv <- round((sd_val / abs(mean(Variable))) * 100, 2)

## SHAPE
# Skewness (e1071, estimator type 2)
asimetria <- skewness(Variable, type = 2)

# Excess kurtosis, using the same estimator type as the skewness so both
# shape indicators follow one convention (e1071 defaults to type 3)
curtosis <- kurtosis(Variable, type = 2)

# Outliers by the 1.5 * IQR rule
Q1 <- quantile(Variable, 0.25)
Q3 <- quantile(Variable, 0.75)
IQR_val <- Q3 - Q1
lim_inf <- Q1 - 1.5 * IQR_val
lim_sup <- Q3 + 1.5 * IQR_val

outliers_data <- Variable[Variable < lim_inf | Variable > lim_sup]
num_outliers <- length(outliers_data)

# Text summary: "<count> [<smallest outlier>; <largest outlier>]"
if (num_outliers > 0) {
  rango_outliers <- paste0(num_outliers, " [", round(min(outliers_data), 2), "; ", round(max(outliers_data), 2), "]")
} else {
  rango_outliers <- "0 [Sin Outliers]"
}

tabla_indicadores <- data.frame(
 "Variable" = c("Distancia de la via (m)"),
 "Rango_MinMax" = paste0("[", round(min(Variable), 2), "; ", round(max(Variable), 2), "]"),
 "X" = c(media),
 "Me" = c(mediana),
 "Mo" = c(moda_txt),
 "V" = c(varianza),
 "Sd" = c(sd_val),
 "Cv" = c(cv),
 "As" = c(asimetria),
 "K" = c(curtosis),
 "Outliers" = rango_outliers)

# Render the summary table with gt
tabla_conclusiones_gt <- tabla_indicadores %>%
 gt() %>%
 tab_header(title = md("**Tabla N°3 de Conclusiones de Distancia de la via de las Plantas Solares**")) %>%
 tab_source_note(source_note = "Autor: Martin Sarmiento") %>%
 cols_label(
  Variable = "Variable",
  Rango_MinMax = "Rango",
  X = "Media (X)",
  Me = "Mediana (Me)",
  Mo = "Moda (Mo)",
  V = "Varianza (V)",
  Sd = "Desv. Est. (Sd)",
  Cv = "C.V. (%)",
  As = "Asimetría (As)",
  K = "Curtosis (K)",
  Outliers = "Outliers [Intervalo]"
 ) %>%
 tab_options(
  heading.title.font.size = px(16),
  column_labels.background.color = "#F0F0F0"
 )

tabla_conclusiones_gt
Tabla N°3 de Conclusiones de Distancia de la via de las Plantas Solares
Variable Rango Media (X) Mediana (Me) Moda (Mo) Varianza (V) Desv. Est. (Sd) C.V. (%) Asimetría (As) Curtosis (K) Outliers [Intervalo]
Distancia de la via (m) [0; 2295655.26] 3821.72 724.38 71740 1169016110 34190.88 894.65 48.74419 2617.162 7539 [6309.67; 2295655.26]
Autor: Martin Sarmiento

6 Conclusiones

La variable “Distancia a la vía” fluctúa entre 0 y 2295655.26 m y sus valores se encuentran alrededor de 3821.72 m (media), con una desviación estándar de 34190.88 m, siendo una variable muy heterogénea (C.V. = 894.65 %), cuyos valores se concentran en la parte baja de la variable (el 99.94 % de las plantas se ubica en el primer intervalo), con la presencia de 7539 valores atípicos (outliers); por todo lo anterior, el comportamiento de la variable es muy perjudicial.