1 Configuración y Carga de Datos

##### UNIVERSIDAD CENTRAL DEL ECUADOR #####
#### AUTOR: MARTIN SARMIENTO ####
### CARRERA: INGENIERÍA EN PETRÓLEOS #####


#### VARIABLE AREA ####
## DATASET ##
setwd("~/R/AREA")
# Cargar dataset
Datos <- read.csv("Dataset_Mundial_Final.csv", sep = ";", dec = ",", fileEncoding = "latin1")
# Estructura de los datos
str(Datos)
## 'data.frame':    58978 obs. of  29 variables:
##  $ ï..OBJECTID           : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ code                  : chr  "00001-AFG-P" "00002-AFG-P" "00003-AFG-P" "00004-AFG-P" ...
##  $ plant_name            : chr  "Badghis Solar Power Plant" "Balkh solar farm" "Behsood solar farm" "Dab Pal 4 solar farm" ...
##  $ country               : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ operational_status    : chr  "cancelled - inferred 4 y" "cancelled - inferred 4 y" "cancelled - inferred 4 y" "shelved - inferred 2 y" ...
##  $ longitude             : num  62.9 67.1 70.4 66.2 65.7 ...
##  $ latitude              : num  35.1 36.7 34.4 33.8 31.7 ...
##  $ elevation             : int  918 359 629 2288 1060 1060 1392 398 410 1012 ...
##  $ area                  : num  6.74 10.72 487.73 111.8 1929.96 ...
##  $ size                  : chr  "Small" "Small" "Small" "Small" ...
##  $ slope                 : num  7.38 0.49 1.1 6.16 1.23 ...
##  $ slope_type            : chr  "Moderado" "Plano o casi plano" "Plano o casi plano" "Moderado" ...
##  $ curvature             : num  -0.024 0 0 0.045 -0.005 -0.005 -0.015 0 0 -0.009 ...
##  $ curvature_type        : chr  "Superficies cóncavas / Valles" "Superficies planas o intermedias" "Superficies planas o intermedias" "Superficies convexas / Crestas" ...
##  $ aspect                : num  96.8 358.5 36.2 305.8 248.4 ...
##  $ aspect_type           : chr  "East" "North" "Northeast" "Northwest" ...
##  $ dist_to_road          : num  7037.1 92.7 112.1 1705.3 115.8 ...
##  $ ambient_temperature   : num  14.4 17.88 21.32 8.86 19.64 ...
##  $ ghi                   : num  5.82 5.58 5.8 6.75 6.62 ...
##  $ humidity              : num  47.7 42.3 36.4 37.3 24.2 ...
##  $ wind_speed            : num  0.039 0.954 0.234 0.943 0.37 ...
##  $ wind_direction        : num  187.5 207.4 255.6 160.3 97.7 ...
##  $ dt_wind               : chr  "South" "Southwest" "West" "South" ...
##  $ solar_aptitude        : num  0.72 0.635 0.685 0.659 0.819 0.819 0.818 0.642 0.63 0.374 ...
##  $ solar_aptitude_rounded: int  7 6 7 7 8 8 8 6 6 4 ...
##  $ solar_aptittude_class : chr  "Alta" "Alta" "Alta" "Alta" ...
##  $ capacity              : num  32 40 60 3000 100 100 36 50 25 100 ...
##  $ optimal_tilt          : num  30 31 31.1 33 31 ...
##  $ pv_potential          : num  4.61 4.41 4.57 5.42 5.17 ...
# Cargamos las librerias
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gt)
library(e1071)

2 Cálculo de Intervalos y Frecuencias

# Extraer variable
Variable <- na.omit(Datos$area)
N <- length(Variable)

# CÁLCULO LÍMITES ENTEROS
# Definimos los puntos de corte
cortes_personalizados <- c(
  0,     
  30,        
  100,       
  500,
  1000,
  5000,
  10000,
  50000,
  100000,
  500000,
  1000000,
  5000000,
  10000000,
  50000000,
  100000000,
  250000000)

# Verificamos que el último corte sea mayor que el valor máximo
if (max(cortes_personalizados) <= max(Variable)) {
    cortes_personalizados[length(cortes_personalizados)] <- max(Variable) + 1
}

K_real <- length(cortes_personalizados) - 1
lim_inf_int <- cortes_personalizados[1:K_real]
lim_sup_int <- cortes_personalizados[2:(K_real+1)]

# Frecuencias (usando los nuevos cortes)
inter_int <- cut(Variable, breaks = cortes_personalizados, include.lowest = TRUE, right = FALSE)
ni_int <- as.vector(table(inter_int))

# CÁLCULOS MATEMÁTICOS 
hi_int <- (ni_int / N) * 100
Ni_asc_int <- cumsum(ni_int)
Hi_asc_int <- cumsum(hi_int)
Ni_desc_int <- rev(cumsum(rev(ni_int)))
Hi_desc_int <- rev(cumsum(rev(hi_int)))

# Dataframe Entero 
TDF_Enteros <- data.frame(
  Li = lim_inf_int,
  Ls = lim_sup_int,
  MC = (lim_inf_int + lim_sup_int) / 2,
  ni = ni_int,
  hi = hi_int,
  Ni_asc = Ni_asc_int,
  Ni_desc = Ni_desc_int,
  Hi_asc = Hi_asc_int,
  Hi_desc = Hi_desc_int)

# CÁLCULO LÍMITES DECIMALES
# Cortes exactos
cortes_dec <- cortes_personalizados
cortes_dec[length(cortes_dec)] <- max(Variable) + 0.0001

# Frecuencias 
inter_dec <- cut(Variable, breaks = cortes_dec, include.lowest = TRUE, right = FALSE)
ni_dec <- as.vector(table(inter_dec))

# CÁLCULOS MATEMÁTICOS 
hi_dec <- (ni_dec / N) * 100
Ni_asc_dec <- cumsum(ni_dec)
Hi_asc_dec <- cumsum(hi_dec)
Ni_desc_dec <- rev(cumsum(rev(ni_dec)))
Hi_desc_dec <- rev(cumsum(rev(hi_dec)))

# Dataframe Decimal
TDF_Decimal <- data.frame(
  Li = cortes_dec[1:K_real],
  Ls = cortes_dec[2:(K_real+1)],
  MC = (cortes_dec[1:K_real] + cortes_dec[2:(K_real+1)]) / 2,
  ni = ni_dec,
  hi = hi_dec,
  Ni_asc = Ni_asc_dec,
  Ni_desc = Ni_desc_dec,
  Hi_asc = Hi_asc_dec,
  Hi_desc = Hi_desc_dec)

3 Tabla de Distribución de Frecuencias

3.1 Tabla con Límites Decimales

# Crear Dataframe 
TDF_Dec_Final <- data.frame(
  Li      = format(round(TDF_Decimal$Li, 2), scientific = FALSE, trim = TRUE),
  Ls      = format(round(TDF_Decimal$Ls, 2), scientific = FALSE, trim = TRUE),
  MC      = format(round(TDF_Decimal$MC, 2), scientific = FALSE, trim = TRUE),
  ni      = format(TDF_Decimal$ni, scientific = FALSE, trim = TRUE),
  hi      = format(round(TDF_Decimal$hi, 2), scientific = FALSE, trim = TRUE),
  Ni_asc  = format(TDF_Decimal$Ni_asc, scientific = FALSE, trim = TRUE),
  Ni_desc = format(TDF_Decimal$Ni_desc, scientific = FALSE, trim = TRUE),
  Hi_asc  = format(round(TDF_Decimal$Hi_asc, 2), scientific = FALSE, trim = TRUE),
  Hi_desc = format(round(TDF_Decimal$Hi_desc, 2), scientific = FALSE, trim = TRUE),
  stringsAsFactors = FALSE
)

# Calcular Totales
totales_dec <- c("TOTAL", "-", "-", sum(TDF_Decimal$ni), round(sum(TDF_Decimal$hi), 2), "-", "-", "-", "-")
TDF_Dec_Final <- rbind(TDF_Dec_Final, totales_dec)

# Generar GT
TDF_Dec_Final %>%
 gt() %>%
 tab_header(title = md("**Tabla N°1 de Distribución de Frecuencias del Área (m²) de las Plantas Solares**")) %>%
  tab_source_note(source_note = "Autor: Martin Sarmiento") %>%
 cols_label(
  Li = "Lim. Inf", 
  Ls = "Lim. Sup", 
  MC = "Marca Clase",
  ni = "Frec. Abs (ni)", 
  hi = "Frec. Rel (%)",
  Ni_asc = "Ni (Asc)", 
  Ni_desc = "Ni (Desc)",
  Hi_asc = "Hi Asc (%)", 
  Hi_desc = "Hi Desc (%)"
 ) %>%
 cols_align(align = "center", columns = everything()) %>%
 tab_options(heading.title.font.size = px(14), column_labels.background.color = "#F0F0F0")
Tabla N°1 de Distribución de Frecuencias del Área (m²) de las Plantas Solares
Lim. Inf Lim. Sup Marca Clase Frec. Abs (ni) Frec. Rel (%) Ni (Asc) Ni (Desc) Hi Asc (%) Hi Desc (%)
0 30 15 4312 10.11 4312 42649 10.11 100.00
30 100 65 2107 4.94 6419 38337 15.05 89.89
100 500 300 2573 6.03 8992 36230 21.08 84.95
500 1000 750 1394 3.27 10386 33657 24.35 78.92
1000 5000 3000 9922 23.26 20308 32263 47.62 75.65
5000 10000 7500 1585 3.72 21893 22341 51.33 52.38
10000 50000 30000 8757 20.53 30650 20756 71.87 48.67
50000 100000 75000 4064 9.53 34714 11999 81.39 28.13
100000 500000 300000 5665 13.28 40379 7935 94.68 18.61
500000 1000000 750000 1009 2.37 41388 2270 97.04 5.32
1000000 5000000 3000000 1043 2.45 42431 1261 99.49 2.96
5000000 10000000 7500000 135 0.32 42566 218 99.81 0.51
10000000 50000000 30000000 76 0.18 42642 83 99.98 0.19
50000000 100000000 75000000 4 0.01 42646 7 99.99 0.02
100000000 234700000 167350000 3 0.01 42649 3 100.00 0.01
TOTAL - - 42649 100 - - - -
Autor: Martin Sarmiento

3.2 Tabla con Límites Enteros

# Crear Dataframe 
TDF_Int_Final <- data.frame(
  Li      = format(TDF_Enteros$Li, scientific = FALSE, trim = TRUE),
  Ls      = format(TDF_Enteros$Ls, scientific = FALSE, trim = TRUE),
  MC      = format(TDF_Enteros$MC, scientific = FALSE, trim = TRUE),
  ni      = format(TDF_Enteros$ni, scientific = FALSE, trim = TRUE),
  hi      = format(round(TDF_Enteros$hi, 2), scientific = FALSE, trim = TRUE),
  Ni_asc  = format(TDF_Enteros$Ni_asc, scientific = FALSE, trim = TRUE),
  Ni_desc = format(TDF_Enteros$Ni_desc, scientific = FALSE, trim = TRUE),
  Hi_asc  = format(round(TDF_Enteros$Hi_asc, 2), scientific = FALSE, trim = TRUE),
  Hi_desc = format(round(TDF_Enteros$Hi_desc, 2), scientific = FALSE, trim = TRUE),
  stringsAsFactors = FALSE
)

# Calcular Totales
totales_int <- c("TOTAL", "-", "-", sum(TDF_Enteros$ni), round(sum(TDF_Enteros$hi), 2), "-", "-", "-", "-")
TDF_Int_Final <- rbind(TDF_Int_Final, totales_int)

# Generar GT
TDF_Int_Final %>%
 gt() %>%
 tab_header(title = md("**Tabla N°2 de Distribución de Frecuencias del Área (m²) de las Plantas Solares**")) %>%
  tab_source_note(source_note = "Autor: Martin Sarmiento") %>%
 cols_label(
  Li = "Lim. Inf", 
  Ls = "Lim. Sup", 
  MC = "Marca Clase",
  ni = "Frec. Abs (ni)", 
  hi = "Frec. Rel (%)",
  Ni_asc = "Ni (Asc)", 
  Ni_desc = "Ni (Desc)",
  Hi_asc = "Hi Asc (%)",
  Hi_desc = "Hi Desc (%)"
 ) %>%
 cols_align(align = "center", columns = everything()) %>%
 tab_options(heading.title.font.size = px(14), column_labels.background.color = "#F0F0F0")
Tabla N°2 de Distribución de Frecuencias del Área (m²) de las Plantas Solares
Lim. Inf Lim. Sup Marca Clase Frec. Abs (ni) Frec. Rel (%) Ni (Asc) Ni (Desc) Hi Asc (%) Hi Desc (%)
0 30 15 4312 10.11 4312 42649 10.11 100.00
30 100 65 2107 4.94 6419 38337 15.05 89.89
100 500 300 2573 6.03 8992 36230 21.08 84.95
500 1000 750 1394 3.27 10386 33657 24.35 78.92
1000 5000 3000 9922 23.26 20308 32263 47.62 75.65
5000 10000 7500 1585 3.72 21893 22341 51.33 52.38
10000 50000 30000 8757 20.53 30650 20756 71.87 48.67
50000 100000 75000 4064 9.53 34714 11999 81.39 28.13
100000 500000 300000 5665 13.28 40379 7935 94.68 18.61
500000 1000000 750000 1009 2.37 41388 2270 97.04 5.32
1000000 5000000 3000000 1043 2.45 42431 1261 99.49 2.96
5000000 10000000 7500000 135 0.32 42566 218 99.81 0.51
10000000 50000000 30000000 76 0.18 42642 83 99.98 0.19
50000000 100000000 75000000 4 0.01 42646 7 99.99 0.02
100000000 250000000 175000000 3 0.01 42649 3 100.00 0.01
TOTAL - - 42649 100 - - - -
Autor: Martin Sarmiento

4 Análisis Gráfico

4.1 Histogramas de Cantidad

par(mar = c(8, 7, 5, 2)) 
barplot(TDF_Enteros$ni, 
        names.arg = TDF_Enteros$MC,
        main = "",
        xlab = "", 
        ylab = "",
        col = "#E3E0AC",
        ylim = c(0, max(TDF_Enteros$ni) * 1.2),
        space = 0, 
        las = 2, 
        cex.names = 0.7)
mtext("Cantidad", side = 2, line = 4.5, cex = 1, font = 1)
mtext("Área (m²)", side = 1, line = 6)

mtext("Gráfica N°1: Distribución de Cantidad de Plantas Solares por el Área", 
      side = 3, 
      line = 2, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

par(mar = c(8, 7, 5, 2))
barplot(TDF_Enteros$ni, 
        main="",
        xlab = "",
        ylab = "",
        names.arg = TDF_Enteros$MC,
        col = "#E3E0AC",
        ylim = c(0, 58771),
        space = 0,
        cex.names = 0.7,
        las = 2) 
mtext("Cantidad", side = 2, line = 4.5, cex = 1, font = 1)
mtext("Área (m²)", side = 1, line = 6)

mtext("Gráfica N°2: Distribución de Cantidad de Plantas Solares por el Área", 
      side = 3, 
      line = 2, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

4.2 Histogramas Porcentuales

par(mar = c(8, 5, 5, 2))
bp3 <- barplot(TDF_Enteros$hi, 
        main = "",
        xlab = "",
        ylab = "Porcentaje (%)",
        col = "#E3E0AC",
        space = 0,
        names.arg = TDF_Enteros$MC,
        cex.names = 0.7,
        las = 2,
        ylim = c(0, max(TDF_Enteros$hi) * 1.2))
mtext("Área (m²)", side = 1, line = 6)

mtext("Gráfica N°3: Distribución Porcentual de las Plantas Solares por el Área", 
      side = 3, 
      line = 2, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

text(x = bp3, 
     y = TDF_Enteros$hi, 
     labels = paste0(round(TDF_Enteros$hi, 2), "%"), 
     pos = 3, cex = 0.6, col = "black")

par(mar = c(8, 5, 5, 2))
bp4 <- barplot(TDF_Enteros$hi, 
        main = "",
        xlab = "",
        ylab = "Porcentaje (%)",
        col = "#E3E0AC",
        space = 0,
        names.arg = TDF_Enteros$MC,
        las = 2,
        cex.names = 0.7,
        ylim = c(0, 110)) 
mtext("Área (m²)", side = 1, line = 6)

mtext("Gráfica N°4: Distribución Porcentual de las Plantas Solares por el Área", 
      side = 3, 
      line = 2, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

text(x = bp4, 
     y = TDF_Enteros$hi, 
     labels = paste0(round(TDF_Enteros$hi, 2), "%"), 
     pos = 3, cex = 0.6, col = "black")

4.3 Diagrama de Cajas (Boxplot)

par(mar = c(5, 5, 4, 2))
boxplot(Variable, 
        horizontal = TRUE,
        col = "#E3E0AC",
        xlab = "Área (m²)",
        cex.main = 0.9,
         main = "Gráfica N°5: Distribución del Área en las Plantas Solares")

4.4 Ojivas

par(mar = c(5, 5, 7, 10), xpd = TRUE)

# Coordenadas
x_asc <- TDF_Enteros$Ls
x_desc <- TDF_Enteros$Li
y_asc <- TDF_Enteros$Ni_asc
y_desc <- TDF_Enteros$Ni_desc

# 1. Dibujar la Ascendente 
plot(x_asc, y_asc,
     type = "b", 
     main = "",
     xlab = "Área (m²)",
     ylab = "Frecuencia acumulada",
     col = "black",
     pch = 19, 
     xlim = c(min(x_desc), max(x_asc)), 
     ylim = c(0, sum(TDF_Enteros$ni)),
     bty = "l"
)

# 2. Agregar la Descendente 
lines(x_desc, y_desc, col = "#BDB76B", type = "b", pch = 19)

grid()
mtext("Gráfica N°6: Ojivas Ascendentes y Descendentes de la\nDistribución del Área en las Plantas Solares", 
      side = 3, 
      line = 3, 
      adj = 0.5, 
      cex = 0.9, 
      font = 2)

legend("right", 
       legend = c("Ascendente", "Descendente"), 
       col = c("black", "#BDB76B"), 
       lty = 1, 
       pch = 1, 
       cex = 0.6, 
       inset = c(0.05, 0.05),
       bty = "n")

5 Indicadores Estadísticos

## INDICADORES DE TENDENCIA CENTRAL
# Media aritmética
media <- round(mean(Variable), 2)

# Mediana
mediana <- round(median(Variable), 2)

# Moda
max_frecuencia <- max(TDF_Enteros$ni)
moda_vals <- TDF_Enteros$MC[TDF_Enteros$ni == max_frecuencia]
moda_txt <- paste(round(moda_vals, 2), collapse = ", ")

## INDICADORES DE DISPERSIÓN
# Varianza
varianza <- var(Variable)

# Desviación Estándar
sd_val <- sd(Variable)

# Coeficiente de Variación
cv <- round((sd_val / abs(media)) * 100, 2)

## INDICADORES DE FORMA
# Coeficiente de Asimetría
asimetria <- skewness(Variable, type = 2)

# Curtosis
curtosis <- kurtosis(Variable)

# Outliers
Q1 <- quantile(Variable, 0.25)
Q3 <- quantile(Variable, 0.75)
IQR_val <- Q3 - Q1
lim_inf <- Q1 - 1.5 * IQR_val
lim_sup <- Q3 + 1.5 * IQR_val

outliers_data <- Variable[Variable < lim_inf | Variable > lim_sup]
num_outliers <- length(outliers_data)

if(num_outliers > 0){
  rango_outliers <- paste0(num_outliers, " [", round(min(outliers_data), 2), "; ", round(max(outliers_data), 2), "]")
} else {
  rango_outliers <- "0 [Sin Outliers]"
}


tabla_indicadores <- data.frame(
 "Variable" = c("Área (m²)"),
 "Rango_MinMax" = paste0("[", round(min(Variable), 2), "; ", round(max(Variable), 2), "]"),
 "X" = c(media),
 "Me" = c(mediana),
 "Mo" = c(moda_txt),
 "V" = c(varianza),
 "Sd" = c(sd_val),
 "Cv" = c(cv),
 "As" = c(asimetria),
 "K" = c(curtosis),
 "Outliers" = rango_outliers)

# Generar Tabla GT
tabla_conclusiones_gt <- tabla_indicadores %>%
 gt() %>%
 tab_header(title = md("**Tabla N°3 de Conclusiones de Área de las Plantas Solares**")) %>%
 tab_source_note(source_note = "Autor: Martin Sarmiento") %>%
 cols_label(
  Variable = "Variable",
  Rango_MinMax = "Rango",
  X = "Media (X)",
  Me = "Mediana (Me)",
  Mo = "Moda (Mo)",
  V = "Varianza (V)",
  Sd = "Desv. Est. (Sd)",
  Cv = "C.V. (%)",
  As = "Asimetría (As)",
  K = "Curtosis (K)",
  Outliers = "Outliers [Intervalo]"
 ) %>%
 tab_options(
  heading.title.font.size = px(16),
  column_labels.background.color = "#F0F0F0"
 )

tabla_conclusiones_gt
Tabla N°3 de Conclusiones de Área de las Plantas Solares
Variable Rango Media (X) Mediana (Me) Moda (Mo) Varianza (V) Desv. Est. (Sd) C.V. (%) Asimetría (As) Curtosis (K) Outliers [Intervalo]
Área (m²) [0.01; 234700000] 180723.2 8067.38 3000 3.999744e+12 1999936 1106.63 72.62166 7167.555 5770 [153488; 234700000]
Autor: Martin Sarmiento

6 Conclusiones

La variable “Área” fluctúa entre 0 y 234700000 m² y sus valores se encuentran alrededor de 2357.98 m², con una desviación estándar de 1620196, siendo una variable muy heterogénea, cuyos valores se concentran en la parte baja de la variable con la agregación de presencia de valores atípicos de 8934 outliers; por todo lo anterior, el comportamiento de la variable es muy perjudicial.