1 Configuración y Carga de Datos

##### UNIVERSIDAD CENTRAL DEL ECUADOR #####
#### AUTOR: MARTIN SARMIENTO ####
### CARRERA: INGENIERÍA EN PETRÓLEOS #####


#### VARIABLE LONGITUD ####
## DATASET ##
# Load the dataset from its project folder. The path is built explicitly so
# the session's working directory is not changed as a side effect.
# NOTE(review): the str() output shows the first column imported as
# "ï..OBJECTID", which looks like a byte-order mark decoded as latin1 --
# confirm the file's real encoding before relying on that column name.
Datos <- read.csv(
  file.path("~/R/LONGITUD", "Dataset_Mundial_Final.csv"),
  sep = ";",
  dec = ",",
  fileEncoding = "latin1"
)
# Inspect the structure of the imported data
str(Datos)
## 'data.frame':    58978 obs. of  29 variables:
##  $ ï..OBJECTID           : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ code                  : chr  "00001-AFG-P" "00002-AFG-P" "00003-AFG-P" "00004-AFG-P" ...
##  $ plant_name            : chr  "Badghis Solar Power Plant" "Balkh solar farm" "Behsood solar farm" "Dab Pal 4 solar farm" ...
##  $ country               : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ operational_status    : chr  "cancelled - inferred 4 y" "cancelled - inferred 4 y" "cancelled - inferred 4 y" "shelved - inferred 2 y" ...
##  $ longitude             : num  62.9 67.1 70.4 66.2 65.7 ...
##  $ latitude              : num  35.1 36.7 34.4 33.8 31.7 ...
##  $ elevation             : int  918 359 629 2288 1060 1060 1392 398 410 1012 ...
##  $ area                  : num  6.74 10.72 487.73 111.8 1929.96 ...
##  $ size                  : chr  "Small" "Small" "Small" "Small" ...
##  $ slope                 : num  7.38 0.49 1.1 6.16 1.23 ...
##  $ slope_type            : chr  "Moderado" "Plano o casi plano" "Plano o casi plano" "Moderado" ...
##  $ curvature             : num  -0.024 0 0 0.045 -0.005 -0.005 -0.015 0 0 -0.009 ...
##  $ curvature_type        : chr  "Superficies cóncavas / Valles" "Superficies planas o intermedias" "Superficies planas o intermedias" "Superficies convexas / Crestas" ...
##  $ aspect                : num  96.8 358.5 36.2 305.8 248.4 ...
##  $ aspect_type           : chr  "East" "North" "Northeast" "Northwest" ...
##  $ dist_to_road          : num  7037.1 92.7 112.1 1705.3 115.8 ...
##  $ ambient_temperature   : num  14.4 17.88 21.32 8.86 19.64 ...
##  $ ghi                   : num  5.82 5.58 5.8 6.75 6.62 ...
##  $ humidity              : num  47.7 42.3 36.4 37.3 24.2 ...
##  $ wind_speed            : num  0.039 0.954 0.234 0.943 0.37 ...
##  $ wind_direction        : num  187.5 207.4 255.6 160.3 97.7 ...
##  $ dt_wind               : chr  "South" "Southwest" "West" "South" ...
##  $ solar_aptitude        : num  0.72 0.635 0.685 0.659 0.819 0.819 0.818 0.642 0.63 0.374 ...
##  $ solar_aptitude_rounded: int  7 6 7 7 8 8 8 6 6 4 ...
##  $ solar_aptittude_class : chr  "Alta" "Alta" "Alta" "Alta" ...
##  $ capacity              : num  32 40 60 3000 100 100 36 50 25 100 ...
##  $ optimal_tilt          : num  30 31 31.1 33 31 ...
##  $ pv_potential          : num  4.61 4.41 4.57 5.42 5.17 ...
# Cargamos las librerias
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gt)
library(e1071)

2 Cálculo de Intervalos y Frecuencias

# Longitude values with missing observations dropped, and the sample size
Variable <- na.omit(Datos$longitude)
N <- length(Variable)

# CLASSES WITH DECIMAL LIMITS
# Extremes, number of classes (1 + 3.322 * log10(N), rounded down),
# range and class width
min_dec <- min(Variable)
max_dec <- max(Variable)
k_dec <- floor(1 + 3.322 * log10(N))
rango_dec <- max_dec - min_dec
amplitud_dec <- rango_dec / k_dec

# k_dec + 1 equally spaced break points from the minimum to the maximum;
# the last break is then moved 0.0001 past the maximum
cortes_dec <- seq(min_dec, max_dec, length.out = k_dec + 1)
cortes_dec[length(cortes_dec)] <- max_dec + 0.0001

# Absolute frequency of each left-closed class
inter_dec <- cut(
  Variable,
  breaks = cortes_dec,
  right = FALSE,
  include.lowest = TRUE
)
ni_dec <- as.vector(table(inter_dec))

# Relative frequency (%) plus ascending and descending cumulative series
hi_dec <- ni_dec / N * 100
Ni_asc_dec <- cumsum(ni_dec)
Ni_desc_dec <- rev(cumsum(rev(ni_dec)))
Hi_asc_dec <- cumsum(hi_dec)
Hi_desc_dec <- rev(cumsum(rev(hi_dec)))

# Frequency table: lower limit, upper limit, class mark and frequencies
TDF_Decimal <- data.frame(
  Li = cortes_dec[-length(cortes_dec)],
  Ls = cortes_dec[-1],
  MC = (cortes_dec[-length(cortes_dec)] + cortes_dec[-1]) / 2,
  ni = ni_dec,
  hi = hi_dec,
  Ni_asc = Ni_asc_dec,
  Ni_desc = Ni_desc_dec,
  Hi_asc = Hi_asc_dec,
  Hi_desc = Hi_desc_dec
)


# CLASSES WITH INTEGER LIMITS
# BASE is the rounding unit: the outer limits and the class width are all
# multiples of BASE.
BASE <- 10
min_int <- floor(min(Variable) / BASE) * BASE
max_int <- ceiling(max(Variable) / BASE) * BASE
k_int_sug <- floor(1 + 3.322 * log10(N))
Rango_int <- max_int - min_int

# Class width: range divided by the suggested number of classes, rounded up
# to the next multiple of BASE (BASE itself when the range is zero)
Amplitud_int <- ceiling((Rango_int / k_int_sug) / BASE) * BASE
if (Amplitud_int == 0) {
  Amplitud_int <- BASE
}

# Break points starting at the rounded minimum, one class width apart
cortes_int <- seq(from = min_int, by = Amplitud_int, length.out = k_int_sug + 1)

# Add one more break if the sequence stops short of the largest observation
if (max(cortes_int) < max(Variable)) {
  cortes_int <- c(cortes_int, max(cortes_int) + Amplitud_int)
}

# Drop trailing breaks while the previous break already reaches the maximum,
# so no class lies entirely above the data
while (length(cortes_int) > 2 &&
       cortes_int[length(cortes_int) - 1] >= max(Variable)) {
  cortes_int <- cortes_int[-length(cortes_int)]
}

# Number of classes actually used, with their lower and upper limits
K_real <- length(cortes_int) - 1
lim_inf_int <- cortes_int[seq_len(K_real)]
lim_sup_int <- cortes_int[seq_len(K_real) + 1]

# Absolute frequency of each left-closed class
inter_int <- cut(Variable, breaks = cortes_int, include.lowest = TRUE, right = FALSE)
ni_int <- as.vector(table(inter_int))

# Relative frequency (%) plus ascending and descending cumulative series
hi_int <- (ni_int / N) * 100
Ni_asc_int <- cumsum(ni_int)
Hi_asc_int <- cumsum(hi_int)
Ni_desc_int <- rev(cumsum(rev(ni_int)))
Hi_desc_int <- rev(cumsum(rev(hi_int)))

# Frequency table with integer limits
TDF_Enteros <- data.frame(
  Li = lim_inf_int,
  Ls = lim_sup_int,
  MC = (lim_inf_int + lim_sup_int) / 2,
  ni = ni_int,
  hi = hi_int,
  Ni_asc = Ni_asc_int,
  Ni_desc = Ni_desc_int,
  Hi_asc = Hi_asc_int,
  Hi_desc = Hi_desc_int
)

3 Tabla de Distribución de Frecuencias

3.1 Tabla con Límites Decimales

# Display copy of the decimal-limits table. Every column is converted to
# text so a TOTAL row can be appended; limits, class marks and percentages
# are rounded to 2 decimals.
TDF_Dec_Final <- with(TDF_Decimal, data.frame(
  Li      = as.character(round(Li, 2)),
  Ls      = as.character(round(Ls, 2)),
  MC      = as.character(round(MC, 2)),
  ni      = as.character(ni),
  hi      = as.character(round(hi, 2)),
  Ni_asc  = as.character(Ni_asc),
  Ni_desc = as.character(Ni_desc),
  Hi_asc  = as.character(round(Hi_asc, 2)),
  Hi_desc = as.character(round(Hi_desc, 2))
))

# TOTAL row: sums for ni and hi, dashes in every other column
totales_dec <- c(
  "TOTAL",
  rep("-", 2),
  sum(TDF_Decimal$ni),
  round(sum(TDF_Decimal$hi), 2),
  rep("-", 4)
)
TDF_Dec_Final <- rbind(TDF_Dec_Final, totales_dec)

# Render the table with gt: title, source note, column labels and alignment
gt(TDF_Dec_Final) %>%
  tab_header(
    title = md("**Tabla N°1 de Distribución de Frecuencias de Longitud (°) de las Plantas Solares**")
  ) %>%
  tab_source_note(source_note = "Autor: Martin Sarmiento") %>%
  cols_label(
    Li = "Lim. Inf",
    Ls = "Lim. Sup",
    MC = "Marca Clase",
    ni = "Frec. Abs (ni)",
    hi = "Frec. Rel (%)",
    Ni_asc = "Ni (Asc)",
    Ni_desc = "Ni (Desc)",
    Hi_asc = "Hi Asc (%)",
    Hi_desc = "Hi Desc (%)"
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  tab_options(
    heading.title.font.size = px(14),
    column_labels.background.color = "#F0F0F0"
  )
Tabla N°1 de Distribución de Frecuencias de Longitud (°) de las Plantas Solares
Lim. Inf Lim. Sup Marca Clase Frec. Abs (ni) Frec. Rel (%) Ni (Asc) Ni (Desc) Hi Asc (%) Hi Desc (%)
-124.1 -105.22 -114.66 1693 2.87 1693 58978 2.87 100
-105.22 -86.34 -95.78 2014 3.41 3707 57285 6.29 97.13
-86.34 -67.46 -76.9 5219 8.85 8926 55271 15.13 93.71
-67.46 -48.58 -58.02 592 1 9518 50052 16.14 84.87
-48.58 -29.7 -39.14 990 1.68 10508 49460 17.82 83.86
-29.7 -10.82 -20.26 74 0.13 10582 48470 17.94 82.18
-10.82 8.06 -1.38 7990 13.55 18572 48396 31.49 82.06
8.06 26.94 17.5 10953 18.57 29525 40406 50.06 68.51
26.94 45.82 36.38 1535 2.6 31060 29453 52.66 49.94
45.82 64.7 55.26 370 0.63 31430 27918 53.29 47.34
64.7 83.58 74.14 3271 5.55 34701 27548 58.84 46.71
83.58 102.46 93.02 2401 4.07 37102 24277 62.91 41.16
102.46 121.34 111.9 10336 17.53 47438 21876 80.43 37.09
121.34 140.22 130.78 9300 15.77 56738 11540 96.2 19.57
140.22 159.1 149.66 2216 3.76 58954 2240 99.96 3.8
159.1 177.98 168.54 24 0.04 58978 24 100 0.04
TOTAL - - 58978 100 - - - -
Autor: Martin Sarmiento

3.2 Tabla con Límites Enteros

# Display copy of the integer-limits table. Every column is converted to
# text so a TOTAL row can be appended; only the percentage columns are
# rounded (to 2 decimals).
TDF_Int_Final <- with(TDF_Enteros, data.frame(
  Li      = as.character(Li),
  Ls      = as.character(Ls),
  MC      = as.character(MC),
  ni      = as.character(ni),
  hi      = as.character(round(hi, 2)),
  Ni_asc  = as.character(Ni_asc),
  Ni_desc = as.character(Ni_desc),
  Hi_asc  = as.character(round(Hi_asc, 2)),
  Hi_desc = as.character(round(Hi_desc, 2))
))

# TOTAL row: sums for ni and hi, dashes in every other column
totales_int <- c(
  "TOTAL",
  rep("-", 2),
  sum(TDF_Enteros$ni),
  round(sum(TDF_Enteros$hi), 2),
  rep("-", 4)
)
TDF_Int_Final <- rbind(TDF_Int_Final, totales_int)

# Render the table with gt: title, source note, column labels and alignment
gt(TDF_Int_Final) %>%
  tab_header(
    title = md("**Tabla N°2 de Distribución de Frecuencias de Longitud (°) de las Plantas Solares**")
  ) %>%
  tab_source_note(source_note = "Autor: Martin Sarmiento") %>%
  cols_label(
    Li = "Lim. Inf",
    Ls = "Lim. Sup",
    MC = "Marca Clase",
    ni = "Frec. Abs (ni)",
    hi = "Frec. Rel (%)",
    Ni_asc = "Ni (Asc)",
    Ni_desc = "Ni (Desc)",
    Hi_asc = "Hi Asc (%)",
    Hi_desc = "Hi Desc (%)"
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  tab_options(
    heading.title.font.size = px(14),
    column_labels.background.color = "#F0F0F0"
  )
Tabla N°2 de Distribución de Frecuencias de Longitud (°) de las Plantas Solares
Lim. Inf Lim. Sup Marca Clase Frec. Abs (ni) Frec. Rel (%) Ni (Asc) Ni (Desc) Hi Asc (%) Hi Desc (%)
-130 -110 -120 1501 2.55 1501 58978 2.55 100
-110 -90 -100 1645 2.79 3146 57477 5.33 97.45
-90 -70 -80 5523 9.36 8669 55832 14.7 94.67
-70 -50 -60 740 1.25 9409 50309 15.95 85.3
-50 -30 -40 1099 1.86 10508 49569 17.82 84.05
-30 -10 -20 79 0.13 10587 48470 17.95 82.18
-10 10 0 9571 16.23 20158 48391 34.18 82.05
10 30 20 9681 16.41 29839 38820 50.59 65.82
30 50 40 1316 2.23 31155 29139 52.82 49.41
50 70 60 434 0.74 31589 27823 53.56 47.18
70 90 80 3598 6.1 35187 27389 59.66 46.44
90 110 100 4780 8.1 39967 23791 67.77 40.34
110 130 120 10391 17.62 50358 19011 85.38 32.23
130 150 140 8568 14.53 58926 8620 99.91 14.62
150 170 160 47 0.08 58973 52 99.99 0.09
170 190 180 5 0.01 58978 5 100 0.01
TOTAL - - 58978 100 - - - -
Autor: Martin Sarmiento

4 Análisis Gráfico

4.1 Histogramas de Cantidad

# Chart N°1: absolute frequency per integer-limit class, with the y axis
# scaled to 120 % of the tallest bar. Bars touch (space = 0) and are labelled
# with the class marks, written perpendicular to the axis (las = 2).
par(mar = c(8, 7, 5, 2))
barplot(TDF_Enteros$ni,
        names.arg = TDF_Enteros$MC,
        main = "",
        xlab = "",
        ylab = "",
        col = "#FF6961",
        ylim = c(0, max(TDF_Enteros$ni) * 1.2),
        space = 0,
        las = 2,
        cex.names = 0.7)
# Axis titles are placed in the margins instead of through xlab / ylab
mtext("Cantidad", side = 2, line = 4.5, cex = 1, font = 1)
mtext("Longitud (°)", side = 1, line = 4)

# Chart title, centred above the plot in bold
mtext("Gráfica N°1: Distribución de Cantidad de Plantas Solares por Longitud",
      side = 3,
      line = 2,
      adj = 0.5,
      cex = 0.9,
      font = 2)

# Chart N°2: the same absolute frequencies, with the y axis running up to
# the total number of observations. The upper limit is taken from the table
# rather than typed in, so it always matches the data being plotted.
par(mar = c(8, 7, 5, 2))
barplot(TDF_Enteros$ni,
        main = "",
        xlab = "",
        ylab = "",
        names.arg = TDF_Enteros$MC,
        col = "#FF6961",
        ylim = c(0, sum(TDF_Enteros$ni)),
        space = 0,
        cex.names = 0.7,
        las = 2)
# Axis titles are placed in the margins instead of through xlab / ylab
mtext("Cantidad", side = 2, line = 4.5, cex = 1, font = 1)
mtext("Longitud (°)", side = 1, line = 4)

# Chart title, centred above the plot in bold
mtext("Gráfica N°2: Distribución de Cantidad de Plantas Solares por Longitud",
      side = 3,
      line = 2,
      adj = 0.5,
      cex = 0.9,
      font = 2)

4.2 Histogramas Porcentuales

# Chart N°3: relative frequency (%) per integer-limit class, with the y axis
# scaled to 130 % of the tallest bar. bp3 keeps the bar midpoints returned
# by barplot() so the value labels can be placed over each bar.
par(mar = c(8, 5, 5, 2))
bp3 <- with(
  TDF_Enteros,
  barplot(
    hi,
    names.arg = MC,
    cex.names = 0.7,
    las = 2,
    space = 0,
    col = "#FF6961",
    ylim = c(0, max(hi) * 1.3),
    main = "",
    xlab = "",
    ylab = "Porcentaje (%)"
  )
)

# Percentage label above each bar
text(
  bp3, TDF_Enteros$hi,
  labels = paste0(round(TDF_Enteros$hi, 2), "%"),
  pos = 3, cex = 0.6, col = "black"
)

# x-axis title and chart title, both written in the margins
mtext("Longitud (°)", side = 1, line = 4)
mtext(
  "Gráfica N°3: Distribución Porcentual de las Plantas Solares por Longitud",
  side = 3, line = 2, adj = 0.5, cex = 0.9, font = 2
)

# Chart N°4: the same relative frequencies on a fixed 0-100 % axis. bp4
# keeps the bar midpoints returned by barplot() for the value labels.
par(mar = c(8, 5, 5, 2))
bp4 <- with(
  TDF_Enteros,
  barplot(
    hi,
    names.arg = MC,
    cex.names = 0.7,
    las = 2,
    space = 0,
    col = "#FF6961",
    ylim = c(0, 100),
    main = "",
    xlab = "",
    ylab = "Porcentaje (%)"
  )
)

# Percentage label above each bar
text(
  bp4, TDF_Enteros$hi,
  labels = paste0(round(TDF_Enteros$hi, 2), "%"),
  pos = 3, cex = 0.6, col = "black"
)

# x-axis title and chart title, both written in the margins
mtext("Longitud (°)", side = 1, line = 4)
mtext(
  "Gráfica N°4: Distribución Porcentual de las Plantas Solares por Longitud",
  side = 3, line = 2, adj = 0.5, cex = 0.9, font = 2
)

4.3 Diagrama de Cajas (Boxplot)

# Chart N°5: horizontal boxplot of the raw longitude values
par(mar = c(5, 5, 4, 2))
boxplot(
  Variable,
  main = "Gráfica N°5: Distribución de la Longitud en las Plantas Solares",
  cex.main = 0.9,
  xlab = "Longitud (°)",
  col = "#FF6961",
  horizontal = TRUE
)

4.4 Ojivas

# Chart N°6: ascending and descending ogives built from the integer-limit
# frequency table.
par(mar = c(5, 5, 7, 10), xpd = TRUE)

# Coordinates: ascending cumulative counts are plotted at the upper class
# limits, descending cumulative counts at the lower class limits
x_asc <- TDF_Enteros$Ls
x_desc <- TDF_Enteros$Li
y_asc <- TDF_Enteros$Ni_asc
y_desc <- TDF_Enteros$Ni_desc

# 1. Ascending ogive; the x axis starts at the first lower limit so the
#    descending curve fits in the same frame
plot(x_asc, y_asc,
     type = "b",
     main = "",
     xlab = "Longitud (°)",
     ylab = "Frecuencia Acumulada",
     col = "black",
     pch = 19,
     xlim = c(min(TDF_Enteros$Li), max(x_asc)),
     ylim = c(0, sum(TDF_Enteros$ni)))

# 2. Descending ogive
lines(x_desc, y_desc, col = "red", type = "b", pch = 19)

# Reference grid. Clipping is switched back on only for this call: while
# xpd = TRUE the grid lines are not confined to the plot region.
par(xpd = FALSE)
grid()
par(xpd = TRUE)

mtext("Gráfica N°6: Ojivas Ascendentes y Descendentes de la\nDistribución de la Longitud en las Plantas Solares",
      side = 3,
      line = 3,
      adj = 0.5,
      cex = 0.9,
      font = 2)

# Legend symbols use the same filled point (pch = 19) as the plotted curves
legend("left",
       legend = c("Ascendente", "Descendente"),
       col = c("black", "red"),
       lty = 1,
       pch = 19,
       cex = 0.6,
       inset = c(0.05, 0.05),
       bty = "n")

5 Indicadores Estadísticos

## CENTRAL TENDENCY
# Arithmetic mean and median, rounded to 2 decimals for reporting
media <- round(mean(Variable), 2)
mediana <- round(median(Variable), 2)

# Mode: class mark(s) of the integer-limit class(es) with the highest
# absolute frequency, joined into one string
max_frecuencia <- max(TDF_Enteros$ni)
moda_vals <- TDF_Enteros$MC[TDF_Enteros$ni == max_frecuencia]
moda_txt <- paste(round(moda_vals, 2), collapse = ", ")

## DISPERSION
# Sample variance and standard deviation
varianza <- var(Variable)
sd_val <- sd(Variable)

# Coefficient of variation (%). It is computed from the unrounded mean so the
# 2-decimal rounding of `media` does not carry over into the ratio.
cv <- round((sd_val / abs(mean(Variable))) * 100, 2)

## SHAPE
# NOTE(review): skewness is requested with type = 2 while kurtosis is called
# without a `type` argument -- confirm both are meant to use the same
# estimator.
asimetria <- skewness(Variable, type = 2)
curtosis <- kurtosis(Variable)

## OUTLIERS
# Fences at 1.5 * IQR below the first quartile and above the third quartile
Q1 <- quantile(Variable, 0.25)
Q3 <- quantile(Variable, 0.75)
IQR_val <- Q3 - Q1
lim_inf <- Q1 - 1.5 * IQR_val
lim_sup <- Q3 + 1.5 * IQR_val

outliers_data <- Variable[Variable < lim_inf | Variable > lim_sup]
num_outliers <- length(outliers_data)

# Summary text: number of outliers followed by their [min; max] interval
if (num_outliers > 0) {
  rango_outliers <- paste0(num_outliers, " [", round(min(outliers_data), 2), "; ", round(max(outliers_data), 2), "]")
} else {
  rango_outliers <- "0 [Sin Outliers]"
}

# One-row summary of the indicators computed for the variable; the range is
# shown as "[min; max]" rounded to 2 decimals
tabla_indicadores <- data.frame(
  Variable = "Longitud (°)",
  Rango_MinMax = paste0(
    "[", paste(round(range(Variable), 2), collapse = "; "), "]"
  ),
  X = media,
  Me = mediana,
  Mo = moda_txt,
  V = varianza,
  Sd = sd_val,
  Cv = cv,
  As = asimetria,
  K = curtosis,
  Outliers = rango_outliers
)

# Build the gt table: title, source note, readable column labels and styling
tabla_conclusiones_gt <- gt(tabla_indicadores) %>%
  tab_header(
    title = md("**Tabla N°3 de Conclusiones de Longitud de las Plantas Solares**")
  ) %>%
  tab_source_note(source_note = "Autor: Martin Sarmiento") %>%
  cols_label(
    Variable = "Variable",
    Rango_MinMax = "Rango",
    X = "Media (X)",
    Me = "Mediana (Me)",
    Mo = "Moda (Mo)",
    V = "Varianza (V)",
    Sd = "Desv. Est. (Sd)",
    Cv = "C.V. (%)",
    As = "Asimetría (As)",
    K = "Curtosis (K)",
    Outliers = "Outliers [Intervalo]"
  ) %>%
  tab_options(
    heading.title.font.size = px(16),
    column_labels.background.color = "#f0f0f0"
  )

# Print the table
tabla_conclusiones_gt
Tabla N°3 de Conclusiones de Longitud de las Plantas Solares
Variable Rango Media (X) Mediana (Me) Moda (Mo) Varianza (V) Desv. Est. (Sd) C.V. (%) Asimetría (As) Curtosis (K) Outliers [Intervalo]
Longitud (°) [-124.1; 177.98] 43.68 26.76 120 5977.611 77.31501 177 -0.3934365 -1.015861 0 [Sin Outliers]
Autor: Martin Sarmiento

6 Conclusiones

La variable “Longitud” fluctúa entre -124.1° y 177.98°, y sus valores se encuentran alrededor de la mediana de 26.76°, con una desviación estándar de 77.31501°, por lo que es una variable muy heterogénea. Sus valores se concentran en la parte media-alta del rango de la variable, sin la presencia de valores atípicos; por todo lo anterior, el comportamiento de la variable es regular.