###UNIVERSIDAD CENTRAL DEL ECUADOR ###

##Petróleos##

#Tema: Estadistica inferencial de variables cualitativas

#grupo 2

#2025-2026

##Cargar Librería
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)

## Load data

# NOTE(review): the working directory is switched to a machine-specific
# folder so the CSV below can be opened by its bare file name.
setwd(dir = "C:/Users/ronal/OneDrive/Desktop")

# Comma-separated file with a header row and "." as the decimal mark.
datos <- read.csv(
  file = "database (1).csv",
  header = TRUE,
  sep = ",",
  dec = "."
)

# Variable under study: cause subcategory
cause_sub <- datos$Cause.Subcategory

# Absolute frequency (ni) of every subcategory
freq_abs2 <- table(cause_sub)

# One row per subcategory: x = label, ni = count
Tabla2 <- as.data.frame(freq_abs2)
names(Tabla2) <- c("x", "ni")

# Relative frequency (hi): share of each count over the total, 4 decimals
Tabla2$hi <- round(prop.table(Tabla2$ni), 4)

# Closing TOTAL row; its relative frequency is fixed at 1 rather than
# computed from the rounded shares
fila_total2 <- data.frame(x = "TOTAL", ni = sum(Tabla2$ni), hi = 1)

# Final table: subcategory rows followed by the TOTAL row
TablaFinal2 <- rbind(Tabla2, fila_total2)

TablaFinal2
##                                                    x   ni     hi
## 1  CONSTRUCTION, INSTALLATION OR FABRICATION-RELATED  112 0.0401
## 2        DAMAGE BY OPERATOR OR OPERATOR'S CONTRACTOR   20 0.0072
## 3                  DEFECTIVE OR LOOSE TUBING/FITTING   62 0.0222
## 4                                     EARTH MOVEMENT    8 0.0029
## 5    ELECTRICAL ARCING FROM OTHER EQUIPMENT/FACILITY   11 0.0039
## 6                     ENVIRONMENTAL CRACKING-RELATED   16 0.0057
## 7                                           EXTERNAL  230 0.0823
## 8                          FAILURE OF EQUIPMENT BODY   67 0.0240
## 9                    FIRE/EXPLOSION AS PRIMARY CAUSE    4 0.0014
## 10                      FISHING OR MARITIME ACTIVITY    1 0.0004
## 11                                HEAVY RAINS/FLOODS   25 0.0089
## 12                                        HIGH WINDS    1 0.0004
## 13                               INCORRECT EQUIPMENT   10 0.0036
## 14                            INCORRECT INSTALLATION   72 0.0258
## 15                          INCORRECT VALVE POSITION   84 0.0301
## 16                                INTENTIONAL DAMAGE    3 0.0011
## 17                                          INTERNAL  362 0.1295
## 18                                         LIGHTNING   19 0.0068
## 19           MALFUNCTION OF CONTROL/RELIEF EQUIPMENT  171 0.0612
## 20                             MANUFACTURING-RELATED   70 0.0250
## 21               MARITIME EQUIPMENT OR VESSEL ADRIFT    1 0.0004
## 22                                     MISCELLANEOUS   76 0.0272
## 23                   NON-THREADED CONNECTION FAILURE  286 0.1023
## 24             OPERATOR/CONTRACTOR EXCAVATION DAMAGE   28 0.0100
## 25                           OTHER EQUIPMENT FAILURE  204 0.0730
## 26                         OTHER INCORRECT OPERATION   86 0.0308
## 27                        OTHER NATURAL FORCE DAMAGE    8 0.0029
## 28                        OTHER OUTSIDE FORCE DAMAGE   15 0.0054
## 29             OVERFILL/OVERFLOW OF TANK/VESSEL/SUMP   69 0.0247
## 30                  PIPELINE/EQUIPMENT OVERPRESSURED   37 0.0132
## 31                 PREVIOUS DAMAGE DUE TO EXCAVATION   12 0.0043
## 32                        PREVIOUS MECHANICAL DAMAGE    1 0.0004
## 33                    PUMP OR PUMP-RELATED EQUIPMENT  296 0.1059
## 34                                       TEMPERATURE   57 0.0204
## 35                     THIRD PARTY EXCAVATION DAMAGE   57 0.0204
## 36              THREADED CONNECTION/COUPLING FAILURE  151 0.0540
## 37                                           UNKNOWN   42 0.0150
## 38                 VEHICLE NOT ENGAGED IN EXCAVATION   21 0.0075
## 39                                             TOTAL 2795 1.0000
## Group the less frequent subcategories under "OTRAS" ----

# Number of subcategories that keep their own row; the rest are pooled.
top_n <- 7

# Subcategories ranked from most to least frequent.
orden <- sort(table(cause_sub), decreasing = TRUE)

# head() returns at most top_n names and never pads with NA when the data
# has fewer than top_n distinct subcategories (names(orden)[1:top_n] would).
principales <- head(names(orden), top_n)

# Relabel on a character copy: applied to a factor, ifelse() would return
# the integer level codes instead of the labels. Missing values stay NA so
# table() drops them here exactly as it does for the ungrouped variable,
# instead of silently counting them as "OTRAS".
cause_sub_grouped <- as.character(cause_sub)
cause_sub_grouped[
  !is.na(cause_sub_grouped) & !(cause_sub_grouped %in% principales)
] <- "OTRAS"

# Absolute frequency (ni) of the grouped variable
freq_abs2 <- table(cause_sub_grouped)

Tabla2 <- as.data.frame(freq_abs2)
colnames(Tabla2) <- c("x", "ni")

# Move the pooled "OTRAS" row to the bottom; order() is stable, so the
# remaining rows keep the order produced by table().
Tabla2 <- Tabla2[order(Tabla2$x == "OTRAS"), ]

# Relative frequency (hi), rounded to 4 decimals
Tabla2$hi <- round(Tabla2$ni / sum(Tabla2$ni), 4)

# Closing TOTAL row; its relative frequency is fixed at 1
fila_total2 <- data.frame(
  x = "TOTAL",
  ni = sum(Tabla2$ni),
  hi = 1.00
)

TablaFinal2 <- rbind(Tabla2, fila_total2)

TablaFinal2
##                                          x   ni     hi
## 1                                 EXTERNAL  230 0.0823
## 2                                 INTERNAL  362 0.1295
## 3  MALFUNCTION OF CONTROL/RELIEF EQUIPMENT  171 0.0612
## 4          NON-THREADED CONNECTION FAILURE  286 0.1023
## 5                  OTHER EQUIPMENT FAILURE  204 0.0730
## 7           PUMP OR PUMP-RELATED EQUIPMENT  296 0.1059
## 8     THREADED CONNECTION/COUPLING FAILURE  151 0.0540
## 6                                    OTRAS 1095 0.3918
## 11                                   TOTAL 2795 1.0000
## Plot No.1: absolute frequency per grouped subcategory ----

# Rows to plot: the grouped table without its TOTAL row. (The original
# script ran this same assignment twice in a row; once is enough.)
Tabla2_graf <- subset(TablaFinal2, x != "TOTAL")

# Enlarge the bottom margin so the rotated labels fit. par() returns the
# previous setting, which is kept so it can be restored after plotting.
par_previo <- par(mar = c(10, 4, 4, 2))

# Draw the bars with blank axis names; bp holds the bar midpoints, used
# below to position the labels.
bp <- barplot(
  Tabla2_graf$ni,
  names.arg = rep("", nrow(Tabla2_graf)),
  col = "#4ECDC4",
  main = "Grafica No.1: Distribucion por Categoria de Causa",
  ylab = "Cantidad",
  las = 1,
  cex.axis = 0.9,
  width = 1.2,
  space = 0.4,
  ylim = c(0, max(Tabla2_graf$ni) * 1.25)
)

# Category labels rotated 45 degrees, right-aligned under each bar.
# xpd = TRUE lets them be drawn in the margin, below the plot region.
text(
  x = bp,
  y = -max(Tabla2_graf$ni) * 0.07,
  labels = Tabla2_graf$x,
  srt = 45,
  adj = 1,
  xpd = TRUE,
  cex = 0.7,
  col = "black"
)

# Put the margins back so this plot does not alter later graphics.
par(par_previo)

## Plot No.2: relative frequency per grouped subcategory ----

# Enlarge the bottom margin for the rotated labels and keep the previous
# setting (returned by par()) so it can be restored afterwards.
par_previo <- par(mar = c(10, 4, 4, 2))

# bp holds the bar midpoints, used below to position the labels.
bp <- barplot(
  Tabla2_graf$hi,
  col = "#4ECDC4",
  main = "Grafica No.2: Frecuencia Relativa SubCause Category",
  ylab = "Frecuencia Relativa",
  ylim = c(0, max(Tabla2_graf$hi) * 1.3),
  las = 1
)

# Category labels rotated 45 degrees, right-aligned under each bar.
# xpd = TRUE lets them be drawn in the margin, below the plot region.
text(
  x = bp,
  y = -max(Tabla2_graf$hi) * 0.07,
  labels = Tabla2_graf$x,
  srt = 45,
  adj = 1,
  xpd = TRUE,
  cex = 0.7
)

# Put the margins back so this plot does not alter later graphics.
par(par_previo)

## Plot No.3: percentage per grouped subcategory ----

# Enlarge the bottom margin for the rotated labels and keep the previous
# setting (returned by par()) so it can be restored afterwards.
par_previo <- par(mar = c(10, 4, 4, 2))

# Relative frequencies are scaled by 100 to read as percentages.
# bp holds the bar midpoints, used below to position the labels.
bp <- barplot(
  Tabla2_graf$hi * 100,
  col = "#4ECDC4",
  main = "Grafica No.3: Porcentaje SubCause Category",
  ylab = "Porcentaje (%)",
  ylim = c(0, max(Tabla2_graf$hi * 100) * 1.3),
  las = 1
)

# Category labels rotated 45 degrees, right-aligned under each bar.
# xpd = TRUE lets them be drawn in the margin, below the plot region.
text(
  x = bp,
  y = -max(Tabla2_graf$hi * 100) * 0.08,
  labels = Tabla2_graf$x,
  srt = 45,
  adj = 1,
  xpd = TRUE,
  cex = 0.7
)

# Put the margins back so this plot does not alter later graphics.
par(par_previo)

## Plot No.4: pie chart of the grouped subcategories ----

# Wide right margin leaves room for the legend drawn beside the pie.
# par() returns the previous setting, kept so it can be restored below.
par_previo <- par(mar = c(4, 4, 4, 22))

# One shade per plotted row, interpolated between the two blues.
azules <- colorRampPalette(c("#1f77b4", "#d4f1f9"))(nrow(Tabla2_graf))

# Slices carry no labels (labels = NA); the legend identifies them instead.
pie(
  Tabla2_graf$hi,
  labels = NA,
  col = azules,
  main = "Grafica No.4: Distribucion de SubCause Category (Porcentaje)",
  cex = 1.0,
  radius = 1.0
)

# Legend entries: "<category> - <percent>% (<count> casos)".
# xpd = TRUE allows drawing outside the plot region, in the right margin.
legend(
  x = 1.5,
  y = 0.5,
  legend = paste0(
    Tabla2_graf$x, " - ",
    round(Tabla2_graf$hi * 100, 1), "% (", Tabla2_graf$ni, " casos)"
  ),
  fill = azules,
  cex = 0.85,
  bty = "n",
  xpd = TRUE
)

# Put the margins back so the wide right margin does not leak into any
# plot drawn afterwards.
par(par_previo)