Estadistica Multivariable
########## ESTADISTICA Multivariable ############
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
text(x = 1, y = 1,
labels = "ESTADÍSTICA MULTIVARIABLE",
cex = 2,
col = "blue",
font =6)

#Carga de datos
datos <- read.csv("C:\\Users\\Usuario\\Downloads\\water_pollution_disease.csv"
, encoding = "UTF-8")
head(datos)
## Country Region Year Water.Source.Type Contaminant.Level..ppm. pH.Level
## 1 Mexico North 2015 Lake 6.06 7.12
## 2 Brazil West 2017 Well 5.24 7.84
## 3 Indonesia Central 2022 Pond 0.24 6.43
## 4 Nigeria East 2016 Well 7.91 6.71
## 5 Mexico South 2005 Well 0.12 8.16
## 6 Ethiopia West 2013 Tap 2.93 8.21
## Turbidity..NTU. Dissolved.Oxygen..mg.L. Nitrate.Level..mg.L.
## 1 3.93 4.28 8.28
## 2 4.79 3.86 15.74
## 3 0.79 3.42 36.67
## 4 1.96 3.12 36.92
## 5 4.22 9.15 49.35
## 6 4.03 8.66 31.35
## Lead.Concentration..µg.L. Bacteria.Count..CFU.mL. Water.Treatment.Method
## 1 7.89 3344 Filtration
## 2 14.68 2122 Boiling
## 3 9.96 2330 None
## 4 6.77 3779 Boiling
## 5 12.51 4182 Filtration
## 6 16.74 880 None
## Access.to.Clean.Water....of.Population. Diarrheal.Cases.per.100.000.people
## 1 33.60 472
## 2 89.54 122
## 3 35.29 274
## 4 57.53 3
## 5 36.60 466
## 6 69.48 258
## Cholera.Cases.per.100.000.people Typhoid.Cases.per.100.000.people
## 1 33 44
## 2 27 8
## 3 39 50
## 4 33 13
## 5 31 68
## 6 22 55
## Infant.Mortality.Rate..per.1.000.live.births. GDP.per.Capita..USD.
## 1 76.16 57057
## 2 77.30 17220
## 3 48.45 86022
## 4 95.66 31166
## 5 58.78 25661
## 6 70.13 84334
## Healthcare.Access.Index..0.100. Urbanization.Rate....
## 1 96.92 84.61
## 2 84.73 73.37
## 3 58.37 72.86
## 4 39.07 71.07
## 5 23.03 55.55
## 6 53.45 86.11
## Sanitation.Coverage....of.Population. Rainfall..mm.per.year. Temperature...C.
## 1 63.23 2800 4.94
## 2 29.12 1572 16.93
## 3 93.56 2074 21.73
## 4 94.25 937 3.79
## 5 69.23 2295 31.44
## 6 51.11 2530 8.01
## Population.Density..people.per.km..
## 1 593
## 2 234
## 3 57
## 4 555
## 5 414
## 6 775
str(datos)
## 'data.frame': 3000 obs. of 24 variables:
## $ Country : chr "Mexico" "Brazil" "Indonesia" "Nigeria" ...
## $ Region : chr "North" "West" "Central" "East" ...
## $ Year : int 2015 2017 2022 2016 2005 2013 2022 2024 2014 2013 ...
## $ Water.Source.Type : chr "Lake" "Well" "Pond" "Well" ...
## $ Contaminant.Level..ppm. : num 6.06 5.24 0.24 7.91 0.12 2.93 0.06 3.76 0.63 9.14 ...
## $ pH.Level : num 7.12 7.84 6.43 6.71 8.16 8.21 6.11 6.42 6.29 6.45 ...
## $ Turbidity..NTU. : num 3.93 4.79 0.79 1.96 4.22 4.03 3.12 1.35 1.42 0.62 ...
## $ Dissolved.Oxygen..mg.L. : num 4.28 3.86 3.42 3.12 9.15 8.66 6.97 9.99 9.67 7.59 ...
## $ Nitrate.Level..mg.L. : num 8.28 15.74 36.67 36.92 49.35 ...
## $ Lead.Concentration..µg.L. : num 7.89 14.68 9.96 6.77 12.51 ...
## $ Bacteria.Count..CFU.mL. : int 3344 2122 2330 3779 4182 880 2977 1172 159 2493 ...
## $ Water.Treatment.Method : chr "Filtration" "Boiling" "None" "Boiling" ...
## $ Access.to.Clean.Water....of.Population. : num 33.6 89.5 35.3 57.5 36.6 ...
## $ Diarrheal.Cases.per.100.000.people : int 472 122 274 3 466 258 208 397 265 261 ...
## $ Cholera.Cases.per.100.000.people : int 33 27 39 33 31 22 23 0 23 2 ...
## $ Typhoid.Cases.per.100.000.people : int 44 8 50 13 68 55 90 10 29 38 ...
## $ Infant.Mortality.Rate..per.1.000.live.births.: num 76.2 77.3 48.5 95.7 58.8 ...
## $ GDP.per.Capita..USD. : int 57057 17220 86022 31166 25661 84334 6726 76593 5470 72858 ...
## $ Healthcare.Access.Index..0.100. : num 96.9 84.7 58.4 39.1 23 ...
## $ Urbanization.Rate.... : num 84.6 73.4 72.9 71.1 55.5 ...
## $ Sanitation.Coverage....of.Population. : num 63.2 29.1 93.6 94.2 69.2 ...
## $ Rainfall..mm.per.year. : int 2800 1572 2074 937 2295 2530 1573 940 2150 2083 ...
## $ Temperature...C. : num 4.94 16.93 21.73 3.79 31.44 ...
## $ Population.Density..people.per.km.. : int 593 234 57 555 414 775 584 111 538 250 ...
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Relación entre variables
# Paso 1: Convertir y limpiar datos
pH <- as.numeric(gsub(",", ".", datos$pH.Level))
Plomo <- as.numeric(gsub(",", ".", datos$Lead.Concentration..µg.L.))
length(Plomo); length(pH)
## [1] 3000
## [1] 3000
# Tabla pares de valores
tabla <- data.frame(Plomo, pH) %>%
filter(!is.na(Plomo), !is.na(pH), Plomo > 0, pH > 0)
# Eliminar outliers antes de agrupar
remove_outliers <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
x >= (Q1 - 1.5 * IQR) & x <= (Q3 + 1.5 * IQR)
}
tabla_filtrada <- tabla %>%
filter(remove_outliers(Plomo), remove_outliers(pH))
# Paso 2: Agrupar por rangos (menos rangos: 5 o 6 mejor que 10)
tabla_promedios <- tabla_filtrada %>%
mutate(RangoPlomo = cut(Plomo, breaks = 6)) %>%
group_by(RangoPlomo) %>%
summarise(
Plomo = mean(Plomo),
pH = mean(pH)
) %>%
ungroup()
# Conjetura de modelo matemático
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
text(x = 1, y = 1,
labels = "Modelo Potencial",
cex = 2, # Tamaño del texto
col = "blue",
font =6)

# Paso 3: Ajustar modelo potencial con los puntos promedio
x <- tabla_promedios$Plomo
y <- tabla_promedios$pH
x_log <- log(x)
y_log <- log(y)
# Cálculo de parámetros a y b
regresion_potencial <- lm(y_log ~ x_log)
regresion_potencial
##
## Call:
## lm(formula = y_log ~ x_log)
##
## Coefficients:
## (Intercept) x_log
## 1.976478 0.002563
a <- exp(coef(regresion_potencial)[1])
b <- coef(regresion_potencial)[2]
a
## (Intercept)
## 7.217279
b
## x_log
## 0.002563461
#Formamos la ecuación
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
text(x = 1, y = 1,
labels = " Ecuación Potencial \n y = a x ˆ b \n y = 7.2172 x ˆ 0.00253 ",
cex = 2,
col = "blue",
font =6)

# Grafica
plot(x, y,
pch = 19,
col = "skyblue",
xlab = "Concentración de Plomo (µg/L)",
ylab = "pH",
main = " Gráfica 1. Regresión Potencial: Nivel de ph en función de
Concentración de Plomo (µg/L)")
# Curva roja ajustada a los promedios
x_vals <- seq(min(x), max(x), length.out = 1000)
y_pred <- a * x_vals^b
lines(x_vals, y_pred, col = "red", lwd = 2)

# Test de Pearson
r <- cor(x, y)
cat("Correlación de Pearson:", round(cor(x, y), 4), "\n")
## Correlación de Pearson: 0.6703
#Restricciones
# Coeficientes de tu modelo
a <- 7.272 # o el valor que obtuviste
b <- 0.00253 # tu exponente
# RESTRICCIONES Y > 0
y_rango <- a * x_vals^b
indices_validos <- which(y_rango > 0)
x_validos <- x_vals[indices_validos]
cat("Restricción: Y > 0 se cumple en el rango:\n")
## Restricción: Y > 0 se cumple en el rango:
cat(round(min(x_validos), 2), "< x <", round(max(x_validos), 2), "\n")
## 1.62 < x < 18.33
# Restricciones
# x > 0
# 14 =4.79*xˆ0.102
Y=exp(1)^((log(14/7.217)/0.002))
Y
## [1] 7.685377e+143
# Estimaciones
pH_15 <- 7.217*15^0.002
pH_15
## [1] 7.256194
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
text(x = 1, y = 1,
labels = "¿Que cantidad de pH se espera
cuando se tenga 15(µg/L) de Plomo?
\n R= pH de 7.25 ",
cex = 2, # Tamaño del texto (ajustable)
col = "blue", # Color del texto
font = 6)


Resumen del Modelo Potencial
x |
Plomo (µg/L) |
1.62 < x < 18.33 |
0.67 |
15 (µg/L) |
y |
pH |
1 < y <14 |
|
7.25 |