Estadistica Multivariable

########## ESTADISTICA Multivariable ############
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = "ESTADÍSTICA MULTIVARIABLE",
     cex = 2, 
     col = "blue", 
     font =6) 

#Carga de datos
datos <- read.csv("C:\\Users\\Usuario\\Downloads\\water_pollution_disease.csv"
                  , encoding = "UTF-8")
head(datos)     
##     Country  Region Year Water.Source.Type Contaminant.Level..ppm. pH.Level
## 1    Mexico   North 2015              Lake                    6.06     7.12
## 2    Brazil    West 2017              Well                    5.24     7.84
## 3 Indonesia Central 2022              Pond                    0.24     6.43
## 4   Nigeria    East 2016              Well                    7.91     6.71
## 5    Mexico   South 2005              Well                    0.12     8.16
## 6  Ethiopia    West 2013               Tap                    2.93     8.21
##   Turbidity..NTU. Dissolved.Oxygen..mg.L. Nitrate.Level..mg.L.
## 1            3.93                    4.28                 8.28
## 2            4.79                    3.86                15.74
## 3            0.79                    3.42                36.67
## 4            1.96                    3.12                36.92
## 5            4.22                    9.15                49.35
## 6            4.03                    8.66                31.35
##   Lead.Concentration..µg.L. Bacteria.Count..CFU.mL. Water.Treatment.Method
## 1                      7.89                    3344             Filtration
## 2                     14.68                    2122                Boiling
## 3                      9.96                    2330                   None
## 4                      6.77                    3779                Boiling
## 5                     12.51                    4182             Filtration
## 6                     16.74                     880                   None
##   Access.to.Clean.Water....of.Population. Diarrheal.Cases.per.100.000.people
## 1                                   33.60                                472
## 2                                   89.54                                122
## 3                                   35.29                                274
## 4                                   57.53                                  3
## 5                                   36.60                                466
## 6                                   69.48                                258
##   Cholera.Cases.per.100.000.people Typhoid.Cases.per.100.000.people
## 1                               33                               44
## 2                               27                                8
## 3                               39                               50
## 4                               33                               13
## 5                               31                               68
## 6                               22                               55
##   Infant.Mortality.Rate..per.1.000.live.births. GDP.per.Capita..USD.
## 1                                         76.16                57057
## 2                                         77.30                17220
## 3                                         48.45                86022
## 4                                         95.66                31166
## 5                                         58.78                25661
## 6                                         70.13                84334
##   Healthcare.Access.Index..0.100. Urbanization.Rate....
## 1                           96.92                 84.61
## 2                           84.73                 73.37
## 3                           58.37                 72.86
## 4                           39.07                 71.07
## 5                           23.03                 55.55
## 6                           53.45                 86.11
##   Sanitation.Coverage....of.Population. Rainfall..mm.per.year. Temperature...C.
## 1                                 63.23                   2800             4.94
## 2                                 29.12                   1572            16.93
## 3                                 93.56                   2074            21.73
## 4                                 94.25                    937             3.79
## 5                                 69.23                   2295            31.44
## 6                                 51.11                   2530             8.01
##   Population.Density..people.per.km..
## 1                                 593
## 2                                 234
## 3                                  57
## 4                                 555
## 5                                 414
## 6                                 775
str(datos)      
## 'data.frame':    3000 obs. of  24 variables:
##  $ Country                                      : chr  "Mexico" "Brazil" "Indonesia" "Nigeria" ...
##  $ Region                                       : chr  "North" "West" "Central" "East" ...
##  $ Year                                         : int  2015 2017 2022 2016 2005 2013 2022 2024 2014 2013 ...
##  $ Water.Source.Type                            : chr  "Lake" "Well" "Pond" "Well" ...
##  $ Contaminant.Level..ppm.                      : num  6.06 5.24 0.24 7.91 0.12 2.93 0.06 3.76 0.63 9.14 ...
##  $ pH.Level                                     : num  7.12 7.84 6.43 6.71 8.16 8.21 6.11 6.42 6.29 6.45 ...
##  $ Turbidity..NTU.                              : num  3.93 4.79 0.79 1.96 4.22 4.03 3.12 1.35 1.42 0.62 ...
##  $ Dissolved.Oxygen..mg.L.                      : num  4.28 3.86 3.42 3.12 9.15 8.66 6.97 9.99 9.67 7.59 ...
##  $ Nitrate.Level..mg.L.                         : num  8.28 15.74 36.67 36.92 49.35 ...
##  $ Lead.Concentration..µg.L.                    : num  7.89 14.68 9.96 6.77 12.51 ...
##  $ Bacteria.Count..CFU.mL.                      : int  3344 2122 2330 3779 4182 880 2977 1172 159 2493 ...
##  $ Water.Treatment.Method                       : chr  "Filtration" "Boiling" "None" "Boiling" ...
##  $ Access.to.Clean.Water....of.Population.      : num  33.6 89.5 35.3 57.5 36.6 ...
##  $ Diarrheal.Cases.per.100.000.people           : int  472 122 274 3 466 258 208 397 265 261 ...
##  $ Cholera.Cases.per.100.000.people             : int  33 27 39 33 31 22 23 0 23 2 ...
##  $ Typhoid.Cases.per.100.000.people             : int  44 8 50 13 68 55 90 10 29 38 ...
##  $ Infant.Mortality.Rate..per.1.000.live.births.: num  76.2 77.3 48.5 95.7 58.8 ...
##  $ GDP.per.Capita..USD.                         : int  57057 17220 86022 31166 25661 84334 6726 76593 5470 72858 ...
##  $ Healthcare.Access.Index..0.100.              : num  96.9 84.7 58.4 39.1 23 ...
##  $ Urbanization.Rate....                        : num  84.6 73.4 72.9 71.1 55.5 ...
##  $ Sanitation.Coverage....of.Population.        : num  63.2 29.1 93.6 94.2 69.2 ...
##  $ Rainfall..mm.per.year.                       : int  2800 1572 2074 937 2295 2530 1573 940 2150 2083 ...
##  $ Temperature...C.                             : num  4.94 16.93 21.73 3.79 31.44 ...
##  $ Population.Density..people.per.km..          : int  593 234 57 555 414 775 584 111 538 250 ...
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Relación entre variables
# Paso 1: Convertir y limpiar datos
pH <- as.numeric(gsub(",", ".", datos$pH.Level))
Plomo <- as.numeric(gsub(",", ".", datos$Lead.Concentration..µg.L.))

length(Plomo); length(pH)
## [1] 3000
## [1] 3000
# Tabla pares de valores
tabla <- data.frame(Plomo, pH) %>%
  filter(!is.na(Plomo), !is.na(pH), Plomo > 0, pH > 0)
# Eliminar outliers antes de agrupar
remove_outliers <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  x >= (Q1 - 1.5 * IQR) & x <= (Q3 + 1.5 * IQR)
}
tabla_filtrada <- tabla %>%
  filter(remove_outliers(Plomo), remove_outliers(pH))

# Paso 2: Agrupar por rangos (menos rangos: 5 o 6 mejor que 10)
tabla_promedios <- tabla_filtrada %>%
  mutate(RangoPlomo = cut(Plomo, breaks = 6)) %>%
  group_by(RangoPlomo) %>%
  summarise(
    Plomo = mean(Plomo),
    pH = mean(pH)
  ) %>%
  ungroup()

# Conjetura de modelo matemático
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = "Modelo Potencial",
     cex = 2, # Tamaño del texto
     col = "blue",
     font =6)

# Paso 3: Ajustar modelo potencial con los puntos promedio
x <- tabla_promedios$Plomo
y <- tabla_promedios$pH

x_log <- log(x)
y_log <- log(y)
# Cálculo de parámetros a y b
regresion_potencial <- lm(y_log ~ x_log)
regresion_potencial
## 
## Call:
## lm(formula = y_log ~ x_log)
## 
## Coefficients:
## (Intercept)        x_log  
##    1.976478     0.002563
a <- exp(coef(regresion_potencial)[1])
b <- coef(regresion_potencial)[2]
a
## (Intercept) 
##    7.217279
b
##       x_log 
## 0.002563461
#Formamos la ecuación
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = " Ecuación Potencial \n y = a x ˆ b \n y = 7.2172 x ˆ 0.00253 ",
     cex = 2, 
     col = "blue", 
     font =6) 

# Grafica
plot(x, y,
     pch = 19,
     col = "skyblue",
     xlab = "Concentración de Plomo (µg/L)",
     ylab = "pH",
     main = " Gráfica 1. Regresión Potencial: Nivel de ph en función de 
     Concentración de Plomo (µg/L)")
# Curva roja ajustada a los promedios
x_vals <- seq(min(x), max(x), length.out = 1000)
y_pred <- a * x_vals^b
lines(x_vals, y_pred, col = "red", lwd = 2)

# Test de Pearson
r <- cor(x, y)
cat("Correlación de Pearson:", round(cor(x, y), 4), "\n")
## Correlación de Pearson: 0.6703
#Restricciones
# Coeficientes de tu modelo
a <- 7.272  # o el valor que obtuviste
b <- 0.00253  # tu exponente

# RESTRICCIONES Y > 0
y_rango <- a * x_vals^b
indices_validos <- which(y_rango > 0)
x_validos <- x_vals[indices_validos]

cat("Restricción: Y > 0 se cumple en el rango:\n")
## Restricción: Y > 0 se cumple en el rango:
cat(round(min(x_validos), 2), "< x <", round(max(x_validos), 2), "\n")
## 1.62 < x < 18.33
# Restricciones
# x > 0
# 14 =4.79*xˆ0.102
Y=exp(1)^((log(14/7.217)/0.002))
Y
## [1] 7.685377e+143
# Estimaciones
pH_15 <- 7.217*15^0.002
pH_15
## [1] 7.256194
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = "¿Que cantidad de pH se espera
cuando se tenga 15(µg/L) de Plomo?
\n R= pH de 7.25 ",
     cex = 2, # Tamaño del texto (ajustable)
     col = "blue", # Color del texto
     font = 6)

Resumen del Modelo Potencial
Variables Nombres Restricciones Coef. Pearson Estimación
x Plomo (µg/L) 1.62 < x < 18.33 0.67 15 (µg/L)
y pH 1 < y <14 7.25