#Tema:Variable cualitativa Formacion geologica #Autor:Edison Arteaga #Fecha:14/4/2025 #Importar datos y omitimos los na
library(readr)
setwd("/cloud/project")
datos <- read_csv("point_oil-gas-other-regulated-wells-beginning-1860.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 42045 Columns: 52
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (38): Well Name, Company Name, Well Type, Map Symbol, Well Status, Stat...
## dbl (12): API Well Number, County Code, API Hole Number, Sidetrack, Complet...
## lgl (1): Financial Security
## dttm (1): Date Last Modified
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Espacio_acres <- as.numeric(datos$`Spacing Acres`)
## Warning: NAs introduced by coercion
dimensiones <- na.omit(Espacio_acres)
max(dimensiones)
## [1] 691.78
max
## function (..., na.rm = FALSE) .Primitive("max")
min(dimensiones)
## [1] 5
min
## function (..., na.rm = FALSE) .Primitive("min")
R <- max(dimensiones) - min(dimensiones)
R
## [1] 686.78
k <- 1 + (3.3 * log10(length(dimensiones)))
k <- floor(k)
k
## [1] 11
A <- R / k
A
## [1] 62.43455
LimiteInf <- seq(from = min(dimensiones), to = max(dimensiones) - A, by = A)
LimiteInf
## [1] 5.00000 67.43455 129.86909 192.30364 254.73818 317.17273 379.60727
## [8] 442.04182 504.47636 566.91091 629.34545
LimiteSup <- seq(from = min(dimensiones) + A, to = max(dimensiones), by = A)
LimiteSup
## [1] 67.43455 129.86909 192.30364 254.73818 317.17273 379.60727 442.04182
## [8] 504.47636 566.91091 629.34545 691.78000
MC <- (LimiteInf + LimiteSup) / 2
MC
## [1] 36.21727 98.65182 161.08636 223.52091 285.95545 348.39000 410.82455
## [8] 473.25909 535.69364 598.12818 660.56273
n <- c()
for (i in 1:k) {
if (i == k)
n[i] <- length(subset(dimensiones, dimensiones >= LimiteInf[i] & dimensiones <= LimiteSup[i]))
else
n[i] <- length(subset(dimensiones, dimensiones >= LimiteInf[i] & dimensiones < LimiteSup[i]))
}
sum(n)
## [1] 1163
hi <- (n / length(dimensiones)) * 100
sum(hi)
## [1] 100
Ni_asc <- cumsum(n)
Hi_asc <- cumsum(hi)
Ni_dsc <- rev(cumsum(rev(n)))
Hi_dsc <- rev(cumsum(rev(hi)))
Tabla_dimensiones <- data.frame(LimiteInf, LimiteSup, MC, n, round(hi,2), Ni_asc, round(Hi_asc,2), Ni_dsc, round(Hi_dsc,2))
colnames(Tabla_dimensiones) <- c("LimiteInf", "LimiteSup", "MC", "ni", "hi (%)", "Ni_asc", "Hi_asc(%)", "Ni_dsc", "Hi_dsc(%)")
Tabla_dimensiones
## LimiteInf LimiteSup MC ni hi (%) Ni_asc Hi_asc(%) Ni_dsc Hi_dsc(%)
## 1 5.00000 67.43455 36.21727 503 43.25 503 43.25 1163 100.00
## 2 67.43455 129.86909 98.65182 243 20.89 746 64.14 660 56.75
## 3 129.86909 192.30364 161.08636 107 9.20 853 73.34 417 35.86
## 4 192.30364 254.73818 223.52091 35 3.01 888 76.35 310 26.66
## 5 254.73818 317.17273 285.95545 25 2.15 913 78.50 275 23.65
## 6 317.17273 379.60727 348.39000 46 3.96 959 82.46 250 21.50
## 7 379.60727 442.04182 410.82455 7 0.60 966 83.06 204 17.54
## 8 442.04182 504.47636 473.25909 1 0.09 967 83.15 197 16.94
## 9 504.47636 566.91091 535.69364 1 0.09 968 83.23 196 16.85
## 10 566.91091 629.34545 598.12818 55 4.73 1023 87.96 195 16.77
## 11 629.34545 691.78000 660.56273 140 12.04 1163 100.00 140 12.04
hist(dimensiones, main="Gráfica 8.1: Distribución de la frecuencia del espacio (acres)",
xlab = "Espacio (acres)", ylab = "Cantidad", col="lightblue", breaks=seq(min(dimensiones), max(dimensiones), A))
# HISTOGRAMA GLOBAL
hist(dimensiones, main="Gráfica 8.2: Distribución Global de frecuencia del espacio (acres)",
xlab = "Espacio (acres)", ylab = "Cantidad", col="lightblue", breaks=seq(min(dimensiones), max(dimensiones), A), ylim=c(0,length(dimensiones)))
# BARRAS LOCAL PORCENTUAL
barplot(hi, space=0, main="Gráfica 8.3: Distribución Local Porcentual del espacio (acres)",
xlab="Espacio (MC)", ylab="Porcentaje (%)", names.arg=round(MC,1), col="darkgreen", las=2)
# BARRAS GLOBAL PORCENTUAL
barplot(hi, space=0, main="Gráfica 8.4: Distribución Global Porcentual del espacio (acres)",
xlab="Espacio (MC)", ylab="Porcentaje (%)", names.arg=round(MC,1), col="darkgreen", las=2, ylim=c(0,100))
# DIAGRAMA DE CAJAS
boxplot(dimensiones, horizontal=TRUE, col="lightgreen", main="Gráfica 8.5: Diagrama de Cajas del espacio (acres)", xlab="Espacio (acres)")
# OJIVAS
plot(LimiteInf, Ni_dsc, main="Gráfica 8.6: Ojivas del Espacio (acres)", xlab="Espacio (acres)", ylab="Cantidad", type="b", col="red")
lines(LimiteSup, Ni_asc, type="b", col="blue")
legend("topright", legend=c("Ojiva descendente", "Ojiva ascendente"), col=c("red","blue"), lty=1, pch=1)
plot(LimiteInf, Hi_dsc, main="Gráfica 8.7: Ojivas Porcentuales: Espacio (acres)", xlab="Espacio (acres)", ylab="Porcentaje (%)", type="b", col="red")
lines(LimiteSup, Hi_asc, type="b", col="blue")
legend("topright", legend=c("Ojiva descendente", "Ojiva ascendente"), col=c("red","blue"), lty=1, pch=1)
# Calcular outliers con boxplot.stats
outliers <- boxplot.stats(dimensiones)$out
num_outliers <- length(outliers)
num_outliers
## [1] 197
minimooutliers <- min(outliers)
minimooutliers
## [1] 479.268
maximooutliers <- max(outliers)
maximooutliers
## [1] 691.78
media_aritmetica <- mean(dimensiones)
mediana <- median(dimensiones)
desviación_estandar <- sd(dimensiones)
coeficiente_variabilidad <- (desviación_estandar / media_aritmetica) * 100
ri <- min(dimensiones)
rs <- max(dimensiones)
library(e1071)
As <- skewness(dimensiones)
As
## [1] 1.363688
curtosis <- kurtosis(dimensiones)
curtosis
## [1] 0.2336894
Variable <- c("Espacio (acres)")
Tabla_indicadores <- data.frame(Variable, ri, rs, round(media_aritmetica,2), mediana,
round(desviación_estandar,2), round(coeficiente_variabilidad,2),
round(As,2), round(curtosis,2))
colnames(Tabla_indicadores) <- c("Variable","minimo","máximo","x̄","Me","S","Cv (%)","As","K")
Tabla_indicadores
## Variable minimo máximo x̄ Me S Cv (%) As K
## 1 Espacio (acres) 5 691.78 183.5 80 218.27 118.94 1.36 0.23
Outliers_Espacio <- outliers
min_out <- min(Outliers_Espacio)
min_out
## [1] 479.268
max_out <- max(Outliers_Espacio)
max_out
## [1] 691.78
R_out <- max_out - min_out
k_out <- floor(1 + (3.3 * log10(length(Outliers_Espacio))))
A_out <- R_out / k_out
LimiteInf_out <- seq(from = min_out, to = max_out - A_out, by = A_out)
LimiteInf_out
## [1] 479.268 505.832 532.396 558.960 585.524 612.088 638.652 665.216
LimiteSup_out <- seq(from = min_out + A_out, to = max_out, by = A_out)
LimiteSup_out
## [1] 505.832 532.396 558.960 585.524 612.088 638.652 665.216 691.780
MC_out <- (LimiteInf_out + LimiteSup_out) / 2
n_out <- c()
for (i in 1:k_out) {
if (i == k_out)
n_out[i] <- length(subset(Outliers_Espacio, Outliers_Espacio >= LimiteInf_out[i] & Outliers_Espacio <= LimiteSup_out[i]))
else
n_out[i] <- length(subset(Outliers_Espacio, Outliers_Espacio >= LimiteInf_out[i] & Outliers_Espacio < LimiteSup_out[i]))
}
hi_out <- (n_out / length(Outliers_Espacio)) * 100
Ni_asc_out <- cumsum(n_out)
Hi_asc_out <- cumsum(hi_out)
Ni_dsc_out <- rev(cumsum(rev(n_out)))
Hi_dsc_out <- rev(cumsum(rev(hi_out)))
Tabla_Outliers_DF <- data.frame(LimiteInf_out, LimiteSup_out, MC_out, n_out, round(hi_out,2),
Ni_asc_out, round(Hi_asc_out,2), Ni_dsc_out, round(Hi_dsc_out,2))
colnames(Tabla_Outliers_DF) <- c("LimiteInf", "LimiteSup", "MC", "ni", "hi (%)", "Ni_asc", "Hi_asc(%)", "Ni_dsc", "Hi_dsc(%)")
Tabla_Outliers_DF
## LimiteInf LimiteSup MC ni hi (%) Ni_asc Hi_asc(%) Ni_dsc Hi_dsc(%)
## 1 479.268 505.832 492.550 1 0.51 1 0.51 197 100.00
## 2 505.832 532.396 519.114 0 0.00 1 0.51 196 99.49
## 3 532.396 558.960 545.678 1 0.51 2 1.02 196 99.49
## 4 558.960 585.524 572.242 7 3.55 9 4.57 195 98.98
## 5 585.524 612.088 598.806 6 3.05 15 7.61 188 95.43
## 6 612.088 638.652 625.370 82 41.62 97 49.24 182 92.39
## 7 638.652 665.216 651.934 92 46.70 189 95.94 100 50.76
## 8 665.216 691.780 678.498 8 4.06 197 100.00 8 4.06
hist(Outliers_Espacio, main="Gráfica 8.8: Histograma de valores atípicos del espacio (acres)",
xlab = "Espacio (acres)", ylab = "Cantidad", col="orange", breaks=seq(min_out, max_out, A_out))
# Barras de porcentaje
barplot(hi_out, space=0, main="Gráfica 8.9: Barras porcentuales de valores atípicos del espacio (acres)",
xlab="Espacio (MC)", ylab="Porcentaje (%)", names.arg=round(MC_out,1), col="darkred", las=2)
# Diagrama de cajas para los outliers
boxplot(Outliers_Espacio, horizontal=TRUE, col="tomato",
main="Gráfica 8.10: Diagrama de cajas de valores atípicos del espacio (acres)",
xlab="Espacio (acres)")
# Ojiva en cantidad
plot(LimiteInf_out, Ni_dsc_out, main="Gráfica 8.11: Ojivas (cantidad) de valores atípicos del espacio (acres)",
xlab="Espacio (acres)", ylab="Cantidad", type="b", col="red")
lines(LimiteSup_out, Ni_asc_out, type="b", col="blue")
legend("topright", legend=c("Ojiva descendente", "Ojiva ascendente"),
col=c("red","blue"), lty=1, pch=1)
# Ojiva en porcentaje
plot(LimiteInf_out, Hi_dsc_out, main="Gráfica 8.12: Ojivas porcentuales de valores atípicos del espacio (acres)",
xlab="Espacio (acres)", ylab="Porcentaje (%)", type="b", col="red")
lines(LimiteSup_out, Hi_asc_out, type="b", col="blue")
legend("topright", legend=c("Ojiva descendente", "Ojiva ascendente"),
col=c("red","blue"), lty=1, pch=1)
# Mínimo y máximo
ri_out <- min(Outliers_Espacio)
rs_out <- max(Outliers_Espacio)
media_out <- mean(Outliers_Espacio)
mediana_out <- median(Outliers_Espacio)
sd_out <- sd(Outliers_Espacio)
cv_out <- (sd_out / media_out) * 100
As_out <- skewness(Outliers_Espacio)
curtosis_out <- kurtosis(Outliers_Espacio)
Variable_out <- c("Valores atípicos del espacio (acres)")
Tabla_indicadores_out <- data.frame(Variable_out, ri_out, rs_out, round(media_out,2),
mediana_out, round(sd_out,2),
round(cv_out,2), round(As_out,2), round(curtosis_out,2))
colnames(Tabla_indicadores_out) <- c("Variable", "Mínimo", "Máximo", "Media",
"Mediana", "Desv.Est.", "CV (%)", "Asimetría", "Curtosis")
Tabla_indicadores_out
## Variable Mínimo Máximo Media Mediana Desv.Est.
## 1 Valores atípicos del espacio (acres) 479.268 691.78 633.79 639.12 22.48
## CV (%) Asimetría Curtosis
## 1 3.55 -2.13 11.83