Estadistica Multivariable

############ ESTADISTICA Multivariable ############
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = "ESTADÍSTICA MULTIVARIABLE",
     cex = 2, 
     col = "blue", 
     font =6) 

#Carga de datos
datos <- read.csv("C:\\Users\\Usuario\\Downloads\\water_pollution_disease.csv"
                  , encoding = "UTF-8")
head(datos)     # Muestra las primeras filas
##     Country  Region Year Water.Source.Type Contaminant.Level..ppm. pH.Level
## 1    Mexico   North 2015              Lake                    6.06     7.12
## 2    Brazil    West 2017              Well                    5.24     7.84
## 3 Indonesia Central 2022              Pond                    0.24     6.43
## 4   Nigeria    East 2016              Well                    7.91     6.71
## 5    Mexico   South 2005              Well                    0.12     8.16
## 6  Ethiopia    West 2013               Tap                    2.93     8.21
##   Turbidity..NTU. Dissolved.Oxygen..mg.L. Nitrate.Level..mg.L.
## 1            3.93                    4.28                 8.28
## 2            4.79                    3.86                15.74
## 3            0.79                    3.42                36.67
## 4            1.96                    3.12                36.92
## 5            4.22                    9.15                49.35
## 6            4.03                    8.66                31.35
##   Lead.Concentration..µg.L. Bacteria.Count..CFU.mL. Water.Treatment.Method
## 1                      7.89                    3344             Filtration
## 2                     14.68                    2122                Boiling
## 3                      9.96                    2330                   None
## 4                      6.77                    3779                Boiling
## 5                     12.51                    4182             Filtration
## 6                     16.74                     880                   None
##   Access.to.Clean.Water....of.Population. Diarrheal.Cases.per.100.000.people
## 1                                   33.60                                472
## 2                                   89.54                                122
## 3                                   35.29                                274
## 4                                   57.53                                  3
## 5                                   36.60                                466
## 6                                   69.48                                258
##   Cholera.Cases.per.100.000.people Typhoid.Cases.per.100.000.people
## 1                               33                               44
## 2                               27                                8
## 3                               39                               50
## 4                               33                               13
## 5                               31                               68
## 6                               22                               55
##   Infant.Mortality.Rate..per.1.000.live.births. GDP.per.Capita..USD.
## 1                                         76.16                57057
## 2                                         77.30                17220
## 3                                         48.45                86022
## 4                                         95.66                31166
## 5                                         58.78                25661
## 6                                         70.13                84334
##   Healthcare.Access.Index..0.100. Urbanization.Rate....
## 1                           96.92                 84.61
## 2                           84.73                 73.37
## 3                           58.37                 72.86
## 4                           39.07                 71.07
## 5                           23.03                 55.55
## 6                           53.45                 86.11
##   Sanitation.Coverage....of.Population. Rainfall..mm.per.year. Temperature...C.
## 1                                 63.23                   2800             4.94
## 2                                 29.12                   1572            16.93
## 3                                 93.56                   2074            21.73
## 4                                 94.25                    937             3.79
## 5                                 69.23                   2295            31.44
## 6                                 51.11                   2530             8.01
##   Population.Density..people.per.km..
## 1                                 593
## 2                                 234
## 3                                  57
## 4                                 555
## 5                                 414
## 6                                 775
str(datos)      # Muestra la estructura del data frame
## 'data.frame':    3000 obs. of  24 variables:
##  $ Country                                      : chr  "Mexico" "Brazil" "Indonesia" "Nigeria" ...
##  $ Region                                       : chr  "North" "West" "Central" "East" ...
##  $ Year                                         : int  2015 2017 2022 2016 2005 2013 2022 2024 2014 2013 ...
##  $ Water.Source.Type                            : chr  "Lake" "Well" "Pond" "Well" ...
##  $ Contaminant.Level..ppm.                      : num  6.06 5.24 0.24 7.91 0.12 2.93 0.06 3.76 0.63 9.14 ...
##  $ pH.Level                                     : num  7.12 7.84 6.43 6.71 8.16 8.21 6.11 6.42 6.29 6.45 ...
##  $ Turbidity..NTU.                              : num  3.93 4.79 0.79 1.96 4.22 4.03 3.12 1.35 1.42 0.62 ...
##  $ Dissolved.Oxygen..mg.L.                      : num  4.28 3.86 3.42 3.12 9.15 8.66 6.97 9.99 9.67 7.59 ...
##  $ Nitrate.Level..mg.L.                         : num  8.28 15.74 36.67 36.92 49.35 ...
##  $ Lead.Concentration..µg.L.                    : num  7.89 14.68 9.96 6.77 12.51 ...
##  $ Bacteria.Count..CFU.mL.                      : int  3344 2122 2330 3779 4182 880 2977 1172 159 2493 ...
##  $ Water.Treatment.Method                       : chr  "Filtration" "Boiling" "None" "Boiling" ...
##  $ Access.to.Clean.Water....of.Population.      : num  33.6 89.5 35.3 57.5 36.6 ...
##  $ Diarrheal.Cases.per.100.000.people           : int  472 122 274 3 466 258 208 397 265 261 ...
##  $ Cholera.Cases.per.100.000.people             : int  33 27 39 33 31 22 23 0 23 2 ...
##  $ Typhoid.Cases.per.100.000.people             : int  44 8 50 13 68 55 90 10 29 38 ...
##  $ Infant.Mortality.Rate..per.1.000.live.births.: num  76.2 77.3 48.5 95.7 58.8 ...
##  $ GDP.per.Capita..USD.                         : int  57057 17220 86022 31166 25661 84334 6726 76593 5470 72858 ...
##  $ Healthcare.Access.Index..0.100.              : num  96.9 84.7 58.4 39.1 23 ...
##  $ Urbanization.Rate....                        : num  84.6 73.4 72.9 71.1 55.5 ...
##  $ Sanitation.Coverage....of.Population.        : num  63.2 29.1 93.6 94.2 69.2 ...
##  $ Rainfall..mm.per.year.                       : int  2800 1572 2074 937 2295 2530 1573 940 2150 2083 ...
##  $ Temperature...C.                             : num  4.94 16.93 21.73 3.79 31.44 ...
##  $ Population.Density..people.per.km..          : int  593 234 57 555 414 775 584 111 538 250 ...
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Limpiar y resumir los datos por año
datos_limpios <- datos %>%
  group_by(Year) %>%
  summarise(Nivel_promedio =
              mean(as.numeric(as.character(Contaminant.Level..ppm.)), na.rm = 
                     TRUE))
# Conjetura de modelo matemático
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = "Modelo Polinómico",
     cex = 2, # Tamaño del texto
     col = "blue",
     font =6)

# Variables para el modelo
x <- datos_limpios$Year
y <- datos_limpios$Nivel_promedio

# Guardar los años originales (x_real)
x_real <- datos_limpios$Year
x_esc <- scale(x_real)
x_esc_vector <- as.vector(x_esc)
y <- datos_limpios$Nivel_promedio
#Escalado
x_esc <- scale(datos_limpios$Year)
x_esc_vector <- as.vector(x_esc) 
y <- datos_limpios$Nivel_promedio
x <- scale(datos_limpios$Year)
# Crear términos polinómicos hasta grado 5
x2 <- x_esc_vector^2
x3 <- x_esc_vector^3
x4 <- x_esc_vector^4
x5 <- x_esc_vector^5

# Ajustar modelo de regresión polinómica
modelo_poli <- lm(y ~ x_esc_vector + x2 + x3 + x4 + x5)
modelo_poli
## 
## Call:
## lm(formula = y ~ x_esc_vector + x2 + x3 + x4 + x5)
## 
## Coefficients:
##  (Intercept)  x_esc_vector            x2            x3            x4  
##      5.02507       0.40061       0.03423      -0.48725      -0.06455  
##           x5  
##      0.11528
# Obtener coeficientes
beta0 <- modelo_poli$coefficients[1]
beta1 <- modelo_poli$coefficients[2]
beta2 <- modelo_poli$coefficients[3]
beta3 <- modelo_poli$coefficients[4]
beta4 <- modelo_poli$coefficients[5]
beta5 <- modelo_poli$coefficients[6]

a<-beta0
b<-beta1
c<-beta2
d<-beta3
e<-beta4
f<-beta5

# Mostrar ecuación
cat("Ecuación del modelo polinómico (grado 5):\n")
## Ecuación del modelo polinómico (grado 5):
cat("Y =", round(a, 4), "+", round(b, 4), "*x +", 
    round(c, 4), "*x^2 +", round(d, 4), "*x^3 +", round(e, 4), "*x^4 +",
    round(f, 4), "*x^5")
## Y = 5.0251 + 0.4006 *x + 0.0342 *x^2 + -0.4872 *x^3 + -0.0646 *x^4 + 0.1153 *x^5
#Formamos la ecuación
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
text(x = 1, y = 1,
     labels = paste("Y =", round(a, 4), "+", round(b, 4), "*x +", 
                    round(c, 4), "*x^2 +", round(d, 4), "*x^3 +", 
                    round(e, 4),"*x^4 +",
                    round(f, 4), "*x^5"),
     cex = 1, col = "blue", font = 6)

# Resumen del modelo
summary(modelo_poli)
## 
## Call:
## lm(formula = y ~ x_esc_vector + x2 + x3 + x4 + x5)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34211 -0.15720 -0.04067  0.17172  0.36241 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.02507    0.09003  55.818   <2e-16 ***
## x_esc_vector  0.40061    0.21675   1.848   0.0802 .  
## x2            0.03423    0.19784   0.173   0.8645    
## x3           -0.48725    0.30014  -1.623   0.1210    
## x4           -0.06455    0.07730  -0.835   0.4140    
## x5            0.11528    0.09243   1.247   0.2275    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2392 on 19 degrees of freedom
## Multiple R-squared:  0.3694, Adjusted R-squared:  0.2034 
## F-statistic: 2.226 on 5 and 19 DF,  p-value: 0.09394
# Graficar puntos con eje real
plot(x_real, y, col = "red", pch = 19,
     main = "Gráfica 1 .Regresión Polinómica: Nivel de contaminante (ppm)
     en función de Año",
     xlab = "Año", ylab = "Nivel de contaminante (ppm)")
# Mostrar el eje X forzando todos los años hasta 2024
axis(1, at = seq(min(x_real), 2024, by = 3))


# Graficar la curva
# Secuencia para curva ajustada
x_seq_real <- seq(min(x_real), max(x_real), length.out = 300)
x_seq_esc <- (x_seq_real - mean(x_real)) / sd(x_real)
x_seq2 <- x_seq_esc^2
x_seq3 <- x_seq_esc^3
x_seq4 <- x_seq_esc^4
x_seq5 <- x_seq_esc^5

datos_pred <- data.frame(
  x_esc_vector = x_seq_esc,
  x2 = x_seq2,
  x3 = x_seq3,
  x4 = x_seq4,
  x5 = x_seq5
)
# Predecir
y_pred <- predict(modelo_poli, newdata = datos_pred)

lines(x_seq_real, y_pred, col = "blue", lwd = 2)

# Coeficiente de Pearson
cor(y, fitted(modelo_poli))
## [1] 0.6077691
# RESTRICCIONES Y > 0
y_rango <- a + b * x_seq_real + c * x_seq2^2 + d * x_seq3^3 + e * x_seq4^4 
+ f * x_seq5^5
##   [1] -2.342316e+04 -1.980510e+04 -1.672693e+04 -1.411095e+04 -1.189024e+04
##   [6] -1.000718e+04 -8.412270e+03 -7.062969e+03 -5.922795e+03 -4.960481e+03
##  [11] -4.149262e+03 -3.466251e+03 -2.891899e+03 -2.409528e+03 -2.004925e+03
##  [16] -1.665992e+03 -1.382443e+03 -1.145546e+03 -9.478934e+02 -7.832105e+02
##  [21] -6.461895e+02 -5.323459e+02 -4.378953e+02 -3.596490e+02 -2.949231e+02
##  [26] -2.414624e+02 -1.973739e+02 -1.610712e+02 -1.312268e+02 -1.067313e+02
##  [31] -8.665893e+01 -7.023838e+01 -5.682800e+01 -4.589484e+01 -3.699690e+01
##  [36] -2.976822e+01 -2.390629e+01 -1.916147e+01 -1.532807e+01 -1.223693e+01
##  [41] -9.749193e+00 -7.751025e+00 -6.149323e+00 -4.868060e+00 -3.845273e+00
##  [46] -3.030561e+00 -2.383002e+00 -1.869443e+00 -1.463076e+00 -1.142266e+00
##  [51] -8.895958e-01 -6.910691e-01 -5.354650e-01 -4.138077e-01 -3.189334e-01
##  [56] -2.451382e-01 -1.878910e-01 -1.436015e-01 -1.094317e-01 -8.314386e-02
##  [61] -6.297839e-02 -4.755514e-02 -3.579456e-02 -2.685458e-02 -2.008022e-02
##  [66] -1.496351e-02 -1.111163e-02 -8.221753e-03 -6.061162e-03 -4.451565e-03
##  [71] -3.256821e-03 -2.373324e-03 -1.722496e-03 -1.244949e-03 -8.959618e-04
##  [76] -6.419815e-04 -4.579301e-04 -3.251366e-04 -2.297559e-04 -1.615636e-04
##  [81] -1.130410e-04 -7.868283e-05 -5.447660e-05 -3.751084e-05 -2.568314e-05
##  [86] -1.748270e-05 -1.182929e-05 -7.954560e-06 -5.314882e-06 -3.527768e-06
##  [91] -2.325626e-06 -1.522342e-06 -9.892631e-07 -6.380091e-07 -4.082632e-07
##  [96] -2.591363e-07 -1.631024e-07 -1.017650e-07 -6.292112e-08 -3.853897e-08
## [101] -2.337468e-08 -1.403328e-08 -8.335966e-09 -4.897106e-09 -2.843810e-09
## [106] -1.631609e-09 -9.243727e-10 -5.168196e-10 -2.849825e-10 -1.548786e-10
## [111] -8.289782e-11 -4.366505e-11 -2.261504e-11 -1.150634e-11 -5.745456e-12
## [116] -2.812504e-12 -1.348143e-12 -6.319747e-13 -2.893207e-13 -1.291551e-13
## [121] -5.612626e-14 -2.369939e-14 -9.703604e-15 -3.843816e-15 -1.469333e-15
## [126] -5.404653e-16 -1.906831e-16 -6.429522e-17 -2.063370e-17 -6.272811e-18
## [131] -1.796722e-18 -4.818467e-19 -1.201066e-19 -2.758762e-20 -5.779738e-21
## [136] -1.090977e-21 -1.827931e-22 -2.669107e-23 -3.319416e-24 -3.414666e-25
## [141] -2.797086e-26 -1.734159e-27 -7.588349e-29 -2.120555e-30 -3.255989e-32
## [146] -2.157389e-34 -4.030109e-37 -8.956038e-41 -2.546227e-46 -3.005148e-58
## [151]  3.005148e-58  2.546227e-46  8.956038e-41  4.030109e-37  2.157389e-34
## [156]  3.255989e-32  2.120555e-30  7.588349e-29  1.734159e-27  2.797086e-26
## [161]  3.414666e-25  3.319416e-24  2.669107e-23  1.827931e-22  1.090977e-21
## [166]  5.779738e-21  2.758762e-20  1.201066e-19  4.818467e-19  1.796722e-18
## [171]  6.272811e-18  2.063370e-17  6.429522e-17  1.906831e-16  5.404653e-16
## [176]  1.469333e-15  3.843816e-15  9.703604e-15  2.369939e-14  5.612626e-14
## [181]  1.291551e-13  2.893207e-13  6.319747e-13  1.348143e-12  2.812504e-12
## [186]  5.745456e-12  1.150634e-11  2.261504e-11  4.366505e-11  8.289782e-11
## [191]  1.548786e-10  2.849825e-10  5.168196e-10  9.243727e-10  1.631609e-09
## [196]  2.843810e-09  4.897106e-09  8.335966e-09  1.403328e-08  2.337468e-08
## [201]  3.853897e-08  6.292112e-08  1.017650e-07  1.631024e-07  2.591363e-07
## [206]  4.082632e-07  6.380091e-07  9.892631e-07  1.522342e-06  2.325626e-06
## [211]  3.527768e-06  5.314882e-06  7.954560e-06  1.182929e-05  1.748270e-05
## [216]  2.568314e-05  3.751084e-05  5.447660e-05  7.868283e-05  1.130410e-04
## [221]  1.615636e-04  2.297559e-04  3.251366e-04  4.579301e-04  6.419815e-04
## [226]  8.959618e-04  1.244949e-03  1.722496e-03  2.373324e-03  3.256821e-03
## [231]  4.451565e-03  6.061162e-03  8.221753e-03  1.111163e-02  1.496351e-02
## [236]  2.008022e-02  2.685458e-02  3.579456e-02  4.755514e-02  6.297839e-02
## [241]  8.314386e-02  1.094317e-01  1.436015e-01  1.878910e-01  2.451382e-01
## [246]  3.189334e-01  4.138077e-01  5.354650e-01  6.910691e-01  8.895958e-01
## [251]  1.142266e+00  1.463076e+00  1.869443e+00  2.383002e+00  3.030561e+00
## [256]  3.845273e+00  4.868060e+00  6.149323e+00  7.751025e+00  9.749193e+00
## [261]  1.223693e+01  1.532807e+01  1.916147e+01  2.390629e+01  2.976822e+01
## [266]  3.699690e+01  4.589484e+01  5.682800e+01  7.023838e+01  8.665893e+01
## [271]  1.067313e+02  1.312268e+02  1.610712e+02  1.973739e+02  2.414624e+02
## [276]  2.949231e+02  3.596490e+02  4.378953e+02  5.323459e+02  6.461895e+02
## [281]  7.832105e+02  9.478934e+02  1.145546e+03  1.382443e+03  1.665992e+03
## [286]  2.004925e+03  2.409528e+03  2.891899e+03  3.466251e+03  4.149262e+03
## [291]  4.960481e+03  5.922795e+03  7.062969e+03  8.412270e+03  1.000718e+04
## [296]  1.189024e+04  1.411095e+04  1.672693e+04  1.980510e+04  2.342316e+04
indices_validos <- which(y_rango > 0)
x_validos <- x_seq_real[indices_validos]
cat("Restricción: Y > 0 se cumple en el rango:\n")
## Restricción: Y > 0 se cumple en el rango:
cat(round(min(x_validos), 2), "< x <", round(max(x_validos), 2), "\n")
## 2000 < x < 2024
#Estimaciones
modelo_poli <- lm(y ~ x_esc_vector + x2 + x3 + x4 + x5)
# Calcular media y desviación estándar de los años originales
media_x <- mean(x_real)
sd_x <- sd(x_real)
# Año a estimar
año_nuevo <- 2021
x_esc_nuevo <- (año_nuevo - media_x) / sd_x

# Crear todos los términos necesarios con nombres correctos
nuevo_dato <- data.frame(
  x_esc_vector = x_esc_nuevo,
  x2 = x_esc_nuevo^2,
  x3 = x_esc_nuevo^3,
  x4 = x_esc_nuevo^4,
  x5 = x_esc_nuevo^5
)

# Predecir
prediccion_2021 <- predict(modelo_poli, newdata = nuevo_dato)

# Mostrar resultado
cat("Predicción para el año 2021:", round(prediccion_2021, 4), "\n")
## Predicción para el año 2021: 4.846
#
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") 
text(x = 1, y = 1,
     labels = 
     "¿Que cantidad de nivel de 
     contaminante (ppm) se espera para 
     el año 2021?
     R= 4.846 (ppm)",
     cex = 2, 
     col = "blue", 
     font = 6)

Conclusiones

Resumen del Modelo Polinómico
Variables Nombres Restricciones Coef. Pearson Estimación
x Año 2001 < x <2024 0.60 2024
y Nivel_contaminante (ppm) y > 0 4.846 (ppm)