#==============================CARGA DE DATOS Y LIBRERIAS========================
library(gt)
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Intentar cargar e1071, si no existe, se instala automaticamente
if(!require(e1071)) {
  install.packages("e1071")
  library(e1071)
}
## Cargando paquete requerido: e1071
# Funcion personalizada para calcular la moda
get_mode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

# Configuracion de ruta
setwd("C:/Users/HP/Documents/PROYECTO ESTADISTICA/RStudio")

# Carga de datos
datos <- read.csv2("tablap.csv", header = TRUE)

# Limpieza de datos
gas_production <- datos$Total.gas.production.by.2023
gas_production <- na.omit(gas_production)

#==============================REGLA DE STURGES=================================
R <- max(gas_production) - min(gas_production)
k <- 1 + (3.3 * log10(length(gas_production)))
k <- floor(k)
A <- R / k

liminf <- seq(from = min(gas_production), by = A, length.out = k)
limsup <- liminf + A
limsup[k] <- max(gas_production)

MC <- (liminf + limsup) / 2

# Conteo de frecuencias (ni)
ni <- c()
for (i in 1:k) {
  if (i == k)
    ni[i] <- length((subset(gas_production, gas_production >= liminf[i] & gas_production <= limsup[i])))
  else
    ni[i] <- length(subset(gas_production, gas_production >= liminf[i] & gas_production < limsup[i]))
}

# Calculos de frecuencias relativas y acumuladas
hi <- (ni / length(gas_production)) * 100
Niasc <- cumsum(ni)
Nidsc <- rev(cumsum(rev(ni)))
Hiasc <- cumsum(hi)
Hidsc <- rev(cumsum(rev(hi)))

# Creacion de la Tabla de Distribucion de Frecuencias (TDF)
TDFcu_gas_production <- data.frame(
  liminf = round(liminf, 2),
  limsup = round(limsup, 2),
  MC = round(MC, 2),
  ni = ni,
  hi_perc = round(hi, 2),
  Niasc = Niasc,
  Nidsc = Nidsc,
  Hiasc_perc = round(Hiasc, 2),
  Hidsc_perc = round(Hidsc, 2)
)

print("Distribucion de la Produccion Total de Gas hacia 2023")
## [1] "Distribucion de la Produccion Total de Gas hacia 2023"
print(TDFcu_gas_production)
##      liminf   limsup       MC    ni hi_perc Niasc Nidsc Hiasc_perc Hidsc_perc
## 1     25946  5024793  2525369 11815   94.06 11815 12561      94.06     100.00
## 2   5024793 10023639  7524216   552    4.39 12367   746      98.46       5.94
## 3  10023639 15022486 12523063   107    0.85 12474   194      99.31       1.54
## 4  15022486 20021333 17521910    38    0.30 12512    87      99.61       0.69
## 5  20021333 25020180 22520756    23    0.18 12535    49      99.79       0.39
## 6  25020180 30019026 27519603    13    0.10 12548    26      99.90       0.21
## 7  30019026 35017873 32518450     4    0.03 12552    13      99.93       0.10
## 8  35017873 40016720 37517296     1    0.01 12553     9      99.94       0.07
## 9  40016720 45015566 42516143     1    0.01 12554     8      99.94       0.06
## 10 45015566 50014413 47514990     1    0.01 12555     7      99.95       0.06
## 11 50014413 55013260 52513837     3    0.02 12558     6      99.98       0.05
## 12 55013260 60012107 57512683     2    0.02 12560     3      99.99       0.02
## 13 60012107 65010953 62511530     0    0.00 12560     1      99.99       0.01
## 14 65010953 70009800 67510377     1    0.01 12561     1     100.00       0.01
#=========================GRAFICAS==========================
# 1. Histograma
Histostur_gas <- hist(gas_production, 
                      main = "Distribucion de la Produccion Total de Gas hacia 2023",
                      breaks = seq(min(gas_production), max(gas_production), length.out = k + 1),
                      xlab = "PRODUCCION DE GAS (M3)",
                      ylab = "CANTIDAD", 
                      col = "darkgoldenrod", 
                      plot = TRUE)

# 2. Ojivas
x_asc_gas <- c(min(TDFcu_gas_production$liminf), TDFcu_gas_production$limsup)
y_asc_gas <- c(0, TDFcu_gas_production$Niasc)
x_desc_gas <- c(TDFcu_gas_production$liminf, max(TDFcu_gas_production$limsup))
y_desc_gas <- c(TDFcu_gas_production$Nidsc, 0)

y_plot_range_gas <- c(0, max(c(y_asc_gas, y_desc_gas), na.rm = TRUE))
x_plot_range_gas <- range(c(x_asc_gas, x_desc_gas), na.rm = TRUE)

plot(x_asc_gas, y_asc_gas, type = "o",
     main = "Distribucion de la Produccion Total de Gas hacia 2023",
     xlab = "PRODUCCION DE GAS", ylab = "CANTIDAD",
     col = "darkgreen",
     xlim = x_plot_range_gas,
     ylim = y_plot_range_gas)
lines(x_desc_gas, y_desc_gas, col = "darkred", type = "o")

# 3. Boxplot
boxplot(gas_production, horizontal = TRUE, col = "darkorchid",
        main = "Distribucion de la Produccion Total de Gas hacia 2023",
        xlab = "PRODUCCION DE GAS")

#======================INDICADORES ESTADISTICOS===============================
# Convertir a numerico para evitar errores de tipo de dato
gas_num <- as.numeric(gas_production)

mean_val <- mean(gas_num)
median_val <- median(gas_num)
sd_val <- sd(gas_num)
var_val <- var(gas_num)
skew_val <- skewness(gas_num)
kurt_val <- kurtosis(gas_num)
cv_val <- (sd_val / mean_val) * 100

indicadores_gas_production <- data.frame(
  Indicador = c("Moda","Mediana", "Media(x)", "Desviacion Estandar", "Varianza", "Coef Variacion (%)", "Asimetria", "Curtosis"),
  Valor = c(
    round(get_mode(gas_num), 2),
    round(median_val, 2),
    round(mean_val, 2),
    round(sd_val, 2),
    round(var_val, 2),
    round(cv_val, 2),
    round(skew_val, 2),
    round(kurt_val, 2)
  )
)

print("Distribucion de la Produccion Total de Gas hacia 2023")
## [1] "Distribucion de la Produccion Total de Gas hacia 2023"
print(indicadores_gas_production)
##             Indicador        Valor
## 1                Moda 1.725565e+06
## 2             Mediana 1.001777e+06
## 3            Media(x) 1.740013e+06
## 4 Desviacion Estandar 2.784581e+06
## 5            Varianza 7.753889e+12
## 6  Coef Variacion (%) 1.600300e+02
## 7           Asimetria 7.960000e+00
## 8            Curtosis 1.124600e+02