#==============================ENCABEZADO===================================
# TEMA: EI Variables Nominal - ECORREGION
# AUTOR: GRUPO 4
# FECHA: 08-02-2026
#=====================CARGA DE DATOS============
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the source table WITHOUT changing the session's working directory:
# setwd() mutates global state and breaks silently on any other machine.
# Point `dir_datos` at the folder that contains "tablap.csv".
dir_datos <- "C:/Users/HP/Documents/PROYECTO ESTADISTICA/RStudio"
ruta_csv <- file.path(dir_datos, "tablap.csv")
if (!file.exists(ruta_csv)) {
  stop("No se encuentra el archivo de datos: ", ruta_csv, call. = FALSE)
}
# The file is read as semicolon-separated with a decimal comma.
datos <- read.csv(ruta_csv, header = TRUE, dec = ",", sep = ";")
#================== ORIGINAL FULL CHART ==========
# Relative-frequency bar chart of every category found in datos$Ecoregion.
df_total <- as.data.frame(table(datos$Ecoregion))
names(df_total) <- c("Variable", "ni")         # ni: absolute frequency
df_total$hi <- df_total$ni / sum(df_total$ni)  # hi: relative frequency
# The accented letter is written as a Unicode escape so the title renders
# correctly whatever encoding the script file is saved or sourced with
# (the literal previously contained the garbled sequence "??").
barplot(
  df_total$hi,
  names.arg = df_total$Variable,
  col = "steelblue",
  main = "Grafica Nro.1: Distribuci\u00f3n de Ecorregiones",
  las = 2,
  cex.names = 0.6
)

#========= PART A =========
# Sector A: the first four rows of the ecoregion frequency table, in the
# order produced by table().
eco_data <- datos$Ecoregion
# Rebuilt here as a two-column table (category, count); later sections slice
# this object again, so it must not carry the extra `hi` column.
df_total <- as.data.frame(table(eco_data))
# Fail loudly instead of silently creating NA rows when the data holds fewer
# categories than the hard-coded slice below expects.
stopifnot(nrow(df_total) >= 4)
df_A <- df_total[1:4, ]
names(df_A) <- c("Variable", "ni")
# 1. JUSTIFICATION (nominal variable - Sector A)
# 2. FREQUENCY DISTRIBUTION TABLE (ni: absolute, hi: relative within sector)
df_A$hi <- df_A$ni / sum(df_A$ni)
cat("\n--- TDF SECTOR A ---\n")
print(df_A)
# Console output recorded from a previous run:
## Variable ni hi
## 1 Chihuahuan Deserts 2150 0.3119558909
## 2 Colorado Plateaus 3792 0.5502031341
## 3 High Plains 946 0.1372605920
## 4 New Mexico Mountains 4 0.0005803831
# 3. CHART 1: OBSERVED DISTRIBUTION (hi)
# The title uses a Unicode escape for the accented letter; the literal
# previously contained the garbled sequence "??".
barplot(
  df_A$hi,
  names.arg = df_A$Variable,
  col = "darkgreen",
  main = "Grafica Nro.2: Distribuci\u00f3n de Ecorregiones",
  las = 2,
  cex.names = 0.7,
  ylim = c(0, max(df_A$hi) + 0.1)  # headroom above the tallest bar
)

# 4. CONJECTURE: BINOMIAL model (bell shape)
# 5. MODEL AND COMPARISON
# Categories are coded 0..n_A by row position, so a binomial with size n_A
# has exactly one outcome per category.
n_A <- nrow(df_A) - 1
xi_A <- 0:n_A
# Success probability chosen so that the binomial mean (size * prob) equals
# the hi-weighted mean of the category codes.
prob_exito <- sum(xi_A * df_A$hi) / n_A
p_bin <- dbinom(xi_A, size = n_A, prob = prob_exito)

#--- CHART 2: THEORETICAL MODEL ---
# Titles use a Unicode escape for the accented letter; the literals
# previously contained the garbled sequence "??".
barplot(
  p_bin,
  names.arg = df_A$Variable,
  col = "skyblue",
  main = "Grafica Nro.3: Distribuci\u00f3n de Ecorregiones",
  las = 2,
  cex.names = 0.7,
  ylim = c(0, max(p_bin) + 0.1)
)

#--- CHART 3: COMPARISON (observed vs. model, side by side) ---
# max() already reduces over all of its arguments, so no nesting is needed.
max_pA <- max(df_A$hi, p_bin) + 0.1
barplot(
  rbind(df_A$hi, p_bin),
  beside = TRUE,
  col = c("darkgreen", "skyblue"),
  main = "Grafica Nro.4: Distribuci\u00f3n de Ecorregiones",
  names.arg = df_A$Variable,
  las = 2,
  cex.names = 0.7,
  ylim = c(0, max_pA)
)
legend(
  "topright",
  legend = c("Realidad", "Modelo"),
  fill = c("darkgreen", "skyblue"),
  bty = "n"
)

#--- VERDICT ---
# Pearson correlation between observed relative frequencies and the model.
r_A <- cor(df_A$hi, p_bin)
# Pearson's chi-square statistic is defined on COUNTS. Computing it on
# relative frequencies yields the true statistic divided by the sample size,
# which is not comparable to a chi-square quantile and makes the test pass
# almost regardless of fit. Expected counts = sector total * model probability.
esperado_A <- sum(df_A$ni) * p_bin
chi_A <- sum((df_A$ni - esperado_A)^2 / esperado_A)
# NOTE(review): prob_exito was estimated from the same data, so the textbook
# degrees of freedom would be n_A - 1; the original criterion (df = n_A,
# 0.85 quantile) is kept unchanged here - confirm with the course rubric.
crit_A <- qchisq(0.85, df = n_A)
# One logical per test, mapped to its label in a single vectorised step.
aprobado_A <- c(r_A >= 0.7, chi_A < crit_A)
ver_A <- data.frame(
  Prueba = c("Pearson (r)", "Chi2"),
  Valor = round(c(r_A, chi_A), 4),
  Criterio = c("r >= 0.70", paste("X2 <", round(crit_A, 2))),
  Resultado = ifelse(aprobado_A, "APROBADO", "REPROBADO")
)
print(ver_A, row.names = FALSE)
# 6. PROBABILITY CALCULATION
# p_bin[2] is the model probability of the second category (code 1).
# A previous run printed 43.38 % here.
cat("\n6. RESPUESTA PROB. BINOMIAL:", round(p_bin[2]*100, 2), "%\n")
#========== PART B ============
#--- DATA PREPARATION, SECTOR B ---
# Sector B: rows 5 to 7 of the ecoregion frequency table.
# Fail loudly instead of silently creating NA rows when the table holds fewer
# categories than the hard-coded slice below expects.
stopifnot(nrow(df_total) >= 7)
df_B <- df_total[5:7, ]
names(df_B) <- c("Variable", "ni")
# 1. JUSTIFICATION (nominal variable - Sector B)
# 2. FREQUENCY DISTRIBUTION TABLE (ni: absolute, hi: relative within sector)
df_B$hi <- df_B$ni / sum(df_B$ni)
cat("\n--- TDF SECTOR B ---\n")
print(df_B)
# Console output recorded from a previous run:
## Variable ni hi
## 5 New Mexico Plateau 3745 0.66061034
## 6 Southern Rockies 1747 0.30816723
## 7 Southwestern Tablelands 177 0.03122244
# 3. CHART 1: OBSERVED DISTRIBUTION (hi)
# The title uses a Unicode escape for the accented letter; the literal
# previously contained the garbled sequence "??".
barplot(
  df_B$hi,
  names.arg = df_B$Variable,
  col = "darkgreen",
  main = "Grafica Nro.5: Distribuci\u00f3n de Ecorregiones",
  las = 2,
  cex.names = 0.7,
  ylim = c(0, max(df_B$hi) + 0.1)  # headroom above the tallest bar
)

# 4. CONJECTURE: GEOMETRIC model (fast decay)
# 5. MODEL AND COMPARISON
# Categories are coded 0, 1, 2, ... by row position.
xi_B <- 0:(nrow(df_B) - 1)
# dgeom() counts failures before the first success, whose mean is
# (1 - p) / p; solving for p with the hi-weighted mean of the codes gives
# p = 1 / (mean + 1).
p_param <- 1 / (sum(xi_B * df_B$hi) + 1)
p_geom <- dgeom(xi_B, prob = p_param)

#--- CHART 2: THEORETICAL MODEL ---
# Titles use a Unicode escape for the accented letter; the literals
# previously contained the garbled sequence "??".
barplot(
  p_geom,
  names.arg = df_B$Variable,
  col = "purple",
  main = "Grafica Nro.6: Distribuci\u00f3n de Ecorregiones",
  las = 2,
  cex.names = 0.7,
  ylim = c(0, max(p_geom) + 0.1)
)

#--- CHART 3: COMPARISON (observed vs. model, side by side) ---
# max() already reduces over all of its arguments, so no nesting is needed.
max_pB <- max(df_B$hi, p_geom) + 0.1
barplot(
  rbind(df_B$hi, p_geom),
  beside = TRUE,
  col = c("darkgreen", "purple"),
  main = "Grafica Nro.7: Distribuci\u00f3n de Ecorregiones",
  names.arg = df_B$Variable,
  las = 2,
  cex.names = 0.7,
  ylim = c(0, max_pB)
)
legend(
  "topright",
  legend = c("Realidad", "Modelo"),
  fill = c("darkgreen", "purple"),
  bty = "n"
)

#--- VERDICT ---
# Pearson correlation between observed relative frequencies and the model.
r_B <- cor(df_B$hi, p_geom)
# Pearson's chi-square statistic is defined on COUNTS. Computing it on
# relative frequencies yields the true statistic divided by the sample size,
# which is not comparable to a chi-square quantile and makes the test pass
# almost regardless of fit. Expected counts = sector total * model probability.
# NOTE(review): the geometric distribution has unbounded support, so the
# model probabilities of the listed categories need not sum to 1 and the
# expected counts need not add up to the observed total - consider pooling
# the tail into the last category.
esperado_B <- sum(df_B$ni) * p_geom
chi_B <- sum((df_B$ni - esperado_B)^2 / esperado_B)
# NOTE(review): p_param was estimated from the same data, so the textbook
# degrees of freedom would be one less; the original criterion
# (df = categories - 1, 0.85 quantile) is kept unchanged here - confirm.
crit_B <- qchisq(0.85, df = nrow(df_B) - 1)
# One logical per test, mapped to its label in a single vectorised step.
aprobado_B <- c(r_B >= 0.7, chi_B < crit_B)
ver_B <- data.frame(
  Prueba = c("Pearson (r)", "Chi2"),
  Valor = round(c(r_B, chi_B), 4),
  Criterio = c("r >= 0.70", paste("X2 <", round(crit_B, 2))),
  Resultado = ifelse(aprobado_B, "APROBADO", "REPROBADO")
)
print(ver_B, row.names = FALSE)
# 6. PROBABILITY CALCULATION
# p_geom[1] is the model probability of the first category (code 0).
# The accented letter in the message is a Unicode escape; the literal
# previously contained the garbled sequence "??".
# A previous run printed 72.96 % here.
cat("\n6. RESPUESTA PROB. GEOM\u00c9TRICA:", round(p_geom[1]*100, 2), "%\n")