library(readxl)
setwd("C:/Users/Usuario/Downloads")
datos <- read_excel("database.xlsx")
## Warning: Expecting numeric in C2189 / R2189C3: got 'Accident Year'
## Warning: Expecting numeric in C2215 / R2215C3: got 'Accident Year'
#   VARIABLE 2: LIQUID SUBTYPE
# ===============================

# Extraer la variable correctamente
liquid_subtype <- datos$`Liquid Subtype`

# Tablas
tabla_freq_sub <- table(liquid_subtype)
View(tabla_freq_sub)

tabla_rel_sub <- prop.table(tabla_freq_sub)
View(tabla_rel_sub)

tabla_porcent_sub <- prop.table(tabla_freq_sub) * 100
View(tabla_porcent_sub)

# Unir en un solo data frame (CORREGIDO)
tabla_completa_sub <- data.frame(
  Liquid_Subtype = names(tabla_freq_sub),
  Frecuencia = as.vector(tabla_freq_sub),
  Frec_Relativa = round(as.vector(tabla_rel_sub), 4),
  Porcentaje = round(as.vector(tabla_porcent_sub), 2)
)

View(tabla_completa_sub)
tabla_completa_sub
##                                             Liquid_Subtype Frecuencia
## 1                                        ANHYDROUS AMMONIA         55
## 2                                                BIODIESEL          2
## 3                     DIESEL, FUEL OIL, KEROSENE, JET FUEL        408
## 4                                   GASOLINE (NON-ETHANOL)        376
## 5 LPG (LIQUEFIED PETROLEUM GAS) / NGL (NATURAL GAS LIQUID)        188
## 6  MIXTURE OF REFINED PRODUCTS (TRANSMIX OR OTHER MIXTURE)         98
## 7                                                    OTHER         51
## 8                                                OTHER HVL        171
##   Frec_Relativa Porcentaje
## 1        0.0408       4.08
## 2        0.0015       0.15
## 3        0.3024      30.24
## 4        0.2787      27.87
## 5        0.1394      13.94
## 6        0.0726       7.26
## 7        0.0378       3.78
## 8        0.1268      12.68
library(ggplot2)

library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

tabla_freq_sub_df <- data.frame(
  Liquid_Subtype = names(tabla_freq_sub),
  Frecuencia = as.vector(tabla_freq_sub)
)

tabla_freq_sub_grouped <- tabla_freq_sub_df %>%
  arrange(desc(Frecuencia)) %>%
  mutate(Liquid_Subtype = ifelse(row_number() > 10, "OTROS", Liquid_Subtype)) %>%
  group_by(Liquid_Subtype) %>%
  summarise(Frecuencia = sum(Frecuencia))

ggplot(tabla_freq_sub_grouped,
       aes(x = reorder(Liquid_Subtype, Frecuencia), y = Frecuencia)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Frecuencia de Liquid Subtype ",
       x = "Subtype",
       y = "Frecuencia") +
  theme_minimal() +
  coord_flip()

tabla_segunda_sub <- data.frame(
  Liquid_Subtype = names(tabla_rel_sub),
  Frec_Relativa = as.vector(tabla_rel_sub)
)

tabla_segunda_sub_grouped <- tabla_segunda_sub %>%
  arrange(desc(Frec_Relativa)) %>%
  mutate(Liquid_Subtype = ifelse(row_number() > 10, "OTROS", Liquid_Subtype)) %>%
  group_by(Liquid_Subtype) %>%
  summarise(Frec_Relativa = sum(Frec_Relativa))

ggplot(tabla_segunda_sub_grouped,
       aes(x = reorder(Liquid_Subtype, Frec_Relativa), y = Frec_Relativa)) +
  geom_bar(stat = "identity", fill = "lightgreen") +
  labs(title = "Frecuencia Relativa de Liquid Subtype ",
       x = "Subtype",
       y = "Frecuencia Relativa") +
  theme_minimal() +
  coord_flip()

tabla_por_sub <- data.frame(
  Liquid_Subtype = names(tabla_porcent_sub),
  Porcentaje = as.vector(tabla_porcent_sub)
)

tabla_por_sub_grouped <- tabla_por_sub %>%
  arrange(desc(Porcentaje)) %>%
  mutate(Liquid_Subtype = ifelse(row_number() > 10, "OTROS", Liquid_Subtype)) %>%
  group_by(Liquid_Subtype) %>%
  summarise(Porcentaje = sum(Porcentaje))

ggplot(tabla_por_sub_grouped,
       aes(x = reorder(Liquid_Subtype, Porcentaje), y = Porcentaje)) +
  geom_bar(stat = "identity", fill = "orange") +
  labs(title = "Porcentaje de Liquid Subtype ",
       x = "Subtype",
       y = "Porcentaje (%)") +
  theme_minimal() +
  coord_flip()