###################### ESTADISTICA Multivariable ###############################
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
text(x = 1, y = 1,
     labels = "ESTADÍSTICA MULTIVARIABLE",
     cex = 2, 
     col = "blue", 
     font =6) 

#Carga de datos
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.1
datos <- read.csv("C:\\Users\\Usuario\\Downloads\\water_pollution_disease.csv", encoding = "UTF-8")
head(datos)     # Muestra las primeras filas
##     Country  Region Year Water.Source.Type Contaminant.Level..ppm. pH.Level
## 1    Mexico   North 2015              Lake                    6.06     7.12
## 2    Brazil    West 2017              Well                    5.24     7.84
## 3 Indonesia Central 2022              Pond                    0.24     6.43
## 4   Nigeria    East 2016              Well                    7.91     6.71
## 5    Mexico   South 2005              Well                    0.12     8.16
## 6  Ethiopia    West 2013               Tap                    2.93     8.21
##   Turbidity..NTU. Dissolved.Oxygen..mg.L. Nitrate.Level..mg.L.
## 1            3.93                    4.28                 8.28
## 2            4.79                    3.86                15.74
## 3            0.79                    3.42                36.67
## 4            1.96                    3.12                36.92
## 5            4.22                    9.15                49.35
## 6            4.03                    8.66                31.35
##   Lead.Concentration..µg.L. Bacteria.Count..CFU.mL. Water.Treatment.Method
## 1                      7.89                    3344             Filtration
## 2                     14.68                    2122                Boiling
## 3                      9.96                    2330                   None
## 4                      6.77                    3779                Boiling
## 5                     12.51                    4182             Filtration
## 6                     16.74                     880                   None
##   Access.to.Clean.Water....of.Population. Diarrheal.Cases.per.100.000.people
## 1                                   33.60                                472
## 2                                   89.54                                122
## 3                                   35.29                                274
## 4                                   57.53                                  3
## 5                                   36.60                                466
## 6                                   69.48                                258
##   Cholera.Cases.per.100.000.people Typhoid.Cases.per.100.000.people
## 1                               33                               44
## 2                               27                                8
## 3                               39                               50
## 4                               33                               13
## 5                               31                               68
## 6                               22                               55
##   Infant.Mortality.Rate..per.1.000.live.births. GDP.per.Capita..USD.
## 1                                         76.16                57057
## 2                                         77.30                17220
## 3                                         48.45                86022
## 4                                         95.66                31166
## 5                                         58.78                25661
## 6                                         70.13                84334
##   Healthcare.Access.Index..0.100. Urbanization.Rate....
## 1                           96.92                 84.61
## 2                           84.73                 73.37
## 3                           58.37                 72.86
## 4                           39.07                 71.07
## 5                           23.03                 55.55
## 6                           53.45                 86.11
##   Sanitation.Coverage....of.Population. Rainfall..mm.per.year. Temperature...C.
## 1                                 63.23                   2800             4.94
## 2                                 29.12                   1572            16.93
## 3                                 93.56                   2074            21.73
## 4                                 94.25                    937             3.79
## 5                                 69.23                   2295            31.44
## 6                                 51.11                   2530             8.01
##   Population.Density..people.per.km..
## 1                                 593
## 2                                 234
## 3                                  57
## 4                                 555
## 5                                 414
## 6                                 775
str(datos)      # Muestra la estructura del data frame
## 'data.frame':    3000 obs. of  24 variables:
##  $ Country                                      : chr  "Mexico" "Brazil" "Indonesia" "Nigeria" ...
##  $ Region                                       : chr  "North" "West" "Central" "East" ...
##  $ Year                                         : int  2015 2017 2022 2016 2005 2013 2022 2024 2014 2013 ...
##  $ Water.Source.Type                            : chr  "Lake" "Well" "Pond" "Well" ...
##  $ Contaminant.Level..ppm.                      : num  6.06 5.24 0.24 7.91 0.12 2.93 0.06 3.76 0.63 9.14 ...
##  $ pH.Level                                     : num  7.12 7.84 6.43 6.71 8.16 8.21 6.11 6.42 6.29 6.45 ...
##  $ Turbidity..NTU.                              : num  3.93 4.79 0.79 1.96 4.22 4.03 3.12 1.35 1.42 0.62 ...
##  $ Dissolved.Oxygen..mg.L.                      : num  4.28 3.86 3.42 3.12 9.15 8.66 6.97 9.99 9.67 7.59 ...
##  $ Nitrate.Level..mg.L.                         : num  8.28 15.74 36.67 36.92 49.35 ...
##  $ Lead.Concentration..µg.L.                    : num  7.89 14.68 9.96 6.77 12.51 ...
##  $ Bacteria.Count..CFU.mL.                      : int  3344 2122 2330 3779 4182 880 2977 1172 159 2493 ...
##  $ Water.Treatment.Method                       : chr  "Filtration" "Boiling" "None" "Boiling" ...
##  $ Access.to.Clean.Water....of.Population.      : num  33.6 89.5 35.3 57.5 36.6 ...
##  $ Diarrheal.Cases.per.100.000.people           : int  472 122 274 3 466 258 208 397 265 261 ...
##  $ Cholera.Cases.per.100.000.people             : int  33 27 39 33 31 22 23 0 23 2 ...
##  $ Typhoid.Cases.per.100.000.people             : int  44 8 50 13 68 55 90 10 29 38 ...
##  $ Infant.Mortality.Rate..per.1.000.live.births.: num  76.2 77.3 48.5 95.7 58.8 ...
##  $ GDP.per.Capita..USD.                         : int  57057 17220 86022 31166 25661 84334 6726 76593 5470 72858 ...
##  $ Healthcare.Access.Index..0.100.              : num  96.9 84.7 58.4 39.1 23 ...
##  $ Urbanization.Rate....                        : num  84.6 73.4 72.9 71.1 55.5 ...
##  $ Sanitation.Coverage....of.Population.        : num  63.2 29.1 93.6 94.2 69.2 ...
##  $ Rainfall..mm.per.year.                       : int  2800 1572 2074 937 2295 2530 1573 940 2150 2083 ...
##  $ Temperature...C.                             : num  4.94 16.93 21.73 3.79 31.44 ...
##  $ Population.Density..people.per.km..          : int  593 234 57 555 414 775 584 111 538 250 ...
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Relación entre variables
Bacterias <- as.numeric(gsub(",", ".", datos$Bacteria.Count..CFU.mL.))
Temperatura <- as.numeric(gsub(",", ".", datos$Temperature...C.))
datos$Bacterias <- Bacterias
datos$Temperatura <- Temperatura

datos_limpios <- subset(datos, !is.na(Bacterias) & !is.na(Temperatura) &   
                          Bacterias > 0 & Temperatura > 0)


datos_limpios_prom <- datos_limpios %>%
  group_by(Temperatura) %>%
  summarise(Bacterias_promedio = mean(Bacterias, na.rm = TRUE)) %>%
  ungroup()

# Verificar que las longitudes sean iguales
cat("Longitud Bacterias:", length(Temperatura), "\n")
## Longitud Bacterias: 3000
cat("Longitud Diarrea:", length(Bacterias), "\n")
## Longitud Diarrea: 3000
# Conjetura de modelo matemático
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = "Modelo Logarítmico",
     cex = 2, 
     col = "blue",
     font =6)

# Variables limpias para modelo
x_prom <- datos_limpios_prom$Temperatura
y_prom <- datos_limpios_prom$Bacterias_promedio
# Ajustar modelo logarítmico con promedios
modelo_log_prom <- lm(Bacterias_promedio ~ log(Temperatura), data =    
                        datos_limpios_prom)
# Coeficientes
beta0 <- coef(modelo_log_prom)[1]
beta1 <- coef(modelo_log_prom)[2]
# Mostrar ecuación del modelo
cat("Ecuación del modelo logarítmico:\n")
## Ecuación del modelo logarítmico:
cat("Bacterias_promedio =", round(beta0, 4), "+", round(beta1, 4), "*  
    ln(Temperatura)\n")
## Bacterias_promedio = 2576.077 + -41.0375 *  
##     ln(Temperatura)
# Mostrar resumen completo del modelo para evaluar
summary(modelo_log_prom)
## 
## Call:
## lm(formula = Bacterias_promedio ~ log(Temperatura), data = datos_limpios_prom)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2577.80  -993.43    20.63   966.47  2561.19 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2576.08      81.44  31.631   <2e-16 ***
## log(Temperatura)   -41.04      28.42  -1.444    0.149    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1288 on 2092 degrees of freedom
## Multiple R-squared:  0.0009954,  Adjusted R-squared:  0.0005179 
## F-statistic: 2.084 on 1 and 2092 DF,  p-value: 0.149
# Predecir para cada temperatura
datos_limpios_prom$pred <- predict(modelo_log_prom, newdata =  
                                     datos_limpios_prom)

# Calcular residual absoluto
datos_limpios_prom <- datos_limpios_prom %>%
  mutate(residual = abs(Bacterias_promedio - pred))

# Filtrar puntos cuya diferencia residual sea menor a un umbral
umbral <- 0.01 * (max(datos_limpios_prom$Bacterias_promedio) - 
                    min(datos_limpios_prom$Bacterias_promedio))
datos_filtrados <- filter(datos_limpios_prom, residual <= umbral)

#Formamos la ecuación
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = " Ecuación logarítmica \n Y = a + b*ln(x) \n 
     Y = 2576.007 + (-41.0375)*ln(x)",
     cex = 2, 
     col = "blue", 
     font =6) 

# Gráfico
plot(datos_filtrados$Temperatura, datos_filtrados$Bacterias_promedio,
     pch = 19, col = "forestgreen",
     main = "Gráfica 1. Regresión logaritmica: Bacterias(CFU/mL)en función 
     de Temperatura (°C)",
     xlab = "Temperatura (°C)", ylab = " Bacterias(CFU/mL)",
     ylim = c(0,4500),
     xlim = c(0,30))

#Agregar curva del modelo
x_seq <- seq(min(datos_filtrados$Temperatura), 
             max(datos_filtrados$Temperatura), length.out = 300)
y_pred <- predict(modelo_log_prom, newdata = data.frame(Temperatura = x_seq))
lines(x_seq, y_pred, col = "blue", lwd = 2)

r_con_log <- cor(datos_filtrados$Temperatura, 
                 datos_filtrados$Bacterias_promedio)
cat("Correlación de Pearson (datos filtrados):", round(r_con_log, 4), "\n")
## Correlación de Pearson (datos filtrados): -0.7197
# Calcular y del modelo logarítmico
y_log <- beta0 + beta1 * log(x_seq)

# Filtrar solo valores donde y > 0
indices_validos <- which(y_log > 0)
x_validos <- x_seq[indices_validos]
y_validos <- y_log[indices_validos]

cat("Restricción: Y > 0 se cumple en el rango:\n")
## Restricción: Y > 0 se cumple en el rango:
cat(round(min(x_validos), 2), "< x <", round(max(x_validos), 2), "\n")
## 1.04 < x < 39.43
# Coeficientes del modelo logarítmico con intercepto
a <- 2576.007   # Intercepto
b <- -41.0375     # Pendiente

# Valores de bacterias para estimar
x_temp <- c( 15)

# Calcular estimaciones
y_estimado <- a + b * log(x_temp)

# Verificar resultados
print("Estimaciones de bacterias para diferentes temperaturas:")
## [1] "Estimaciones de bacterias para diferentes temperaturas:"
for (i in 1:length(x_temp)) {
  mensaje <- paste("Para", x_temp[i], " Temperatura(°C) -> Y estimado =", 
                   round(y_estimado[i], 2), "bacterias")
  print(mensaje)
}
## [1] "Para 15  Temperatura(°C) -> Y estimado = 2464.88 bacterias"
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = "Cuantas bacterias se esperaria 
cuando se tenga una temperatura de  15°C?
\n R= 2464,88 bacterias",
     cex = 2, # Tamaño del texto (ajustable)
     col = "blue", # Color del texto
     font = 6)

Conclusiones

Resumen del Modelo Logaritmico
Variables Nombres Restricciones Coef. Pearson Estimación
x Temperatura (°C) 1.04 < x <39.43 -0.71 15°C Temperatura
y Bacterias (CFU/ml) y > 0 2464.88 bacterias