# UNIVERSIDAD CENTRAL DEL ECUADOR
# Facultad de Ingeniería en Geología, Minas, Petróleos y Ambiental
# INGENIERÍA AMBIENTAL
#AUTHOR: SOFIA HEREDIA
#FECHA: 14-05-2025
#carga de datos
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install each package only when it is missing. Calling install.packages()
# unconditionally re-downloads the package on every run and, in the recorded
# run, produced "cannot remove prior installation ... Permission denied"
# warnings because the package being replaced was already installed.
for (paquete in c("readxl", "readr")) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
}
library(readxl)
library(readr)

# Read the water pollution / disease data set.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists at exactly this location.
datos <- read_csv("C:/Users/Usuario/Downloads/water_pollution_disease (2).csv")
## Rows: 3000 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Country, Region, Water Source Type, Water Treatment Method
## dbl (20): Year, Contaminant Level (ppm), pH Level, Turbidity (NTU), Dissolve...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show the first rows as a quick sanity check of the import.
head(datos)
## # A tibble: 6 × 24
## Country Region Year `Water Source Type` Contaminant Level (pp…¹ `pH Level`
## <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 Mexico North 2015 Lake 6.06 7.12
## 2 Brazil West 2017 Well 5.24 7.84
## 3 Indonesia Central 2022 Pond 0.24 6.43
## 4 Nigeria East 2016 Well 7.91 6.71
## 5 Mexico South 2005 Well 0.12 8.16
## 6 Ethiopia West 2013 Tap 2.93 8.21
## # ℹ abbreviated name: ¹`Contaminant Level (ppm)`
## # ℹ 18 more variables: `Turbidity (NTU)` <dbl>,
## # `Dissolved Oxygen (mg/L)` <dbl>, `Nitrate Level (mg/L)` <dbl>,
## # `Lead Concentration (µg/L)` <dbl>, `Bacteria Count (CFU/mL)` <dbl>,
## # `Water Treatment Method` <chr>,
## # `Access to Clean Water (% of Population)` <dbl>,
## # `Diarrheal Cases per 100,000 people` <dbl>, …
# Descriptive statistics for a continuous quantitative variable
# NOTE(review): "ED" in the original heading presumably stands for
# "Estadística Descriptiva" — confirm.
# Print the structure (column names and types) of the loaded data set.
str(datos)
## spc_tbl_ [3,000 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Country : chr [1:3000] "Mexico" "Brazil" "Indonesia" "Nigeria" ...
## $ Region : chr [1:3000] "North" "West" "Central" "East" ...
## $ Year : num [1:3000] 2015 2017 2022 2016 2005 ...
## $ Water Source Type : chr [1:3000] "Lake" "Well" "Pond" "Well" ...
## $ Contaminant Level (ppm) : num [1:3000] 6.06 5.24 0.24 7.91 0.12 2.93 0.06 3.76 0.63 9.14 ...
## $ pH Level : num [1:3000] 7.12 7.84 6.43 6.71 8.16 8.21 6.11 6.42 6.29 6.45 ...
## $ Turbidity (NTU) : num [1:3000] 3.93 4.79 0.79 1.96 4.22 4.03 3.12 1.35 1.42 0.62 ...
## $ Dissolved Oxygen (mg/L) : num [1:3000] 4.28 3.86 3.42 3.12 9.15 8.66 6.97 9.99 9.67 7.59 ...
## $ Nitrate Level (mg/L) : num [1:3000] 8.28 15.74 36.67 36.92 49.35 ...
## $ Lead Concentration (µg/L) : num [1:3000] 7.89 14.68 9.96 6.77 12.51 ...
## $ Bacteria Count (CFU/mL) : num [1:3000] 3344 2122 2330 3779 4182 ...
## $ Water Treatment Method : chr [1:3000] "Filtration" "Boiling" "None" "Boiling" ...
## $ Access to Clean Water (% of Population) : num [1:3000] 33.6 89.5 35.3 57.5 36.6 ...
## $ Diarrheal Cases per 100,000 people : num [1:3000] 472 122 274 3 466 258 208 397 265 261 ...
## $ Cholera Cases per 100,000 people : num [1:3000] 33 27 39 33 31 22 23 0 23 2 ...
## $ Typhoid Cases per 100,000 people : num [1:3000] 44 8 50 13 68 55 90 10 29 38 ...
## $ Infant Mortality Rate (per 1,000 live births): num [1:3000] 76.2 77.3 48.5 95.7 58.8 ...
## $ GDP per Capita (USD) : num [1:3000] 57057 17220 86022 31166 25661 ...
## $ Healthcare Access Index (0-100) : num [1:3000] 96.9 84.7 58.4 39.1 23 ...
## $ Urbanization Rate (%) : num [1:3000] 84.6 73.4 72.9 71.1 55.5 ...
## $ Sanitation Coverage (% of Population) : num [1:3000] 63.2 29.1 93.6 94.2 69.2 ...
## $ Rainfall (mm per year) : num [1:3000] 2800 1572 2074 937 2295 ...
## $ Temperature (°C) : num [1:3000] 4.94 16.93 21.73 3.79 31.44 ...
## $ Population Density (people per km²) : num [1:3000] 593 234 57 555 414 775 584 111 538 250 ...
## - attr(*, "spec")=
## .. cols(
## .. Country = col_character(),
## .. Region = col_character(),
## .. Year = col_double(),
## .. `Water Source Type` = col_character(),
## .. `Contaminant Level (ppm)` = col_double(),
## .. `pH Level` = col_double(),
## .. `Turbidity (NTU)` = col_double(),
## .. `Dissolved Oxygen (mg/L)` = col_double(),
## .. `Nitrate Level (mg/L)` = col_double(),
## .. `Lead Concentration (µg/L)` = col_double(),
## .. `Bacteria Count (CFU/mL)` = col_double(),
## .. `Water Treatment Method` = col_character(),
## .. `Access to Clean Water (% of Population)` = col_double(),
## .. `Diarrheal Cases per 100,000 people` = col_double(),
## .. `Cholera Cases per 100,000 people` = col_double(),
## .. `Typhoid Cases per 100,000 people` = col_double(),
## .. `Infant Mortality Rate (per 1,000 live births)` = col_double(),
## .. `GDP per Capita (USD)` = col_double(),
## .. `Healthcare Access Index (0-100)` = col_double(),
## .. `Urbanization Rate (%)` = col_double(),
## .. `Sanitation Coverage (% of Population)` = col_double(),
## .. `Rainfall (mm per year)` = col_double(),
## .. `Temperature (°C)` = col_double(),
## .. `Population Density (people per km²)` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Extract the GDP per capita column (USD) and drop missing values.
GDP_per_cápita <- datos$`GDP per Capita (USD)`
GDP_per_cápita <- na.omit(GDP_per_cápita)

# MANUAL PROCEDURE
# The extremes get their own names so the base functions min()/max() are not
# shadowed by variables.
gdp_min <- min(GDP_per_cápita)
gdp_max <- max(GDP_per_cápita)
R <- gdp_max - gdp_min                                # range
K <- floor(1 + 3.33 * log10(length(GDP_per_cápita)))  # number of classes
A <- R / K                                            # class width

# Exact class boundaries: K + 1 equally spaced points from the minimum to the
# maximum, which yields exactly K classes.
limites_clase <- seq(from = gdp_min, to = gdp_max, length.out = K + 1)

# Limits rounded to 2 decimals, used for display and for the later charts.
Li <- round(limites_clase[-(K + 1)], 2)
Ls <- round(limites_clase[-1], 2)
Mc <- (Li + Ls) / 2  # class midpoints

# Absolute frequencies counted against the exact (unrounded) boundaries, so
# rounding can never push an observation out of the first or last class.
# right = FALSE gives [Li, Ls) classes; include.lowest = TRUE closes the last
# class on both sides so the maximum is counted.
ni <- as.vector(table(cut(GDP_per_cápita,
                          breaks = limites_clase,
                          right = FALSE,
                          include.lowest = TRUE)))
sum(ni)
## [1] 3000

hi <- ni / sum(ni) * 100          # relative frequency (%)
Ni_asc <- cumsum(ni)              # ascending cumulative absolute frequency
Hi_asc <- cumsum(hi)              # ascending cumulative relative frequency (%)
Ni_desc <- rev(cumsum(rev(ni)))   # descending cumulative absolute frequency
Hi_desc <- rev(cumsum(rev(hi)))   # descending cumulative relative frequency (%)

TDF_GDP <- data.frame(
  Li, Ls, Mc, ni, round(hi, 2), Ni_asc, Ni_desc, round(Hi_asc, 2), round(Hi_desc, 2)
)
# "(%)" belongs to the relative columns (hi, Hi_*), not to the absolute
# cumulative counts (Ni_*).
colnames(TDF_GDP) <- c("Li", "Ls", "Mc", "ni", "hi(%)",
                       "Ni_asc", "Ni_desc", "Hi_asc(%)", "Hi_desc(%)")

# Totals row. rbind() matches this vector to the columns by position and
# turns every column of the result into character, so TDF_GDP_total is a
# display table only.
totales <- c(
  Li = "-",
  Ls = "-",
  Mc = "-",
  ni = sum(ni),
  hi = round(sum(hi), 2),
  Ni_asc = "-",
  Ni_desc = "-",
  Hi_asc = "-",
  Hi_desc = "-")
TDF_GDP_total <- rbind(TDF_GDP, totales)

# View() needs an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(TDF_GDP_total)
}
# Frequency distribution table built from the classes chosen by hist()
Histograma_GDP <- hist(GDP_per_cápita, main="Gráfica N: Distribución de GDP per cápita por persona en las regiones analizadas",
                       xlab = "GDP per cápita",
                       ylab = "cantidad",col = "purple")

limites <- Histograma_GDP$breaks
# Lower and upper class limits derived from however many breaks hist()
# produced, instead of assuming exactly 11 breaks.
liminf <- limites[-length(limites)]
liminsup <- limites[-1]
MC <- Histograma_GDP$mids        # class midpoints
ni <- Histograma_GDP$counts      # absolute frequencies
hi <- ni / sum(ni) * 100         # relative frequencies (%)
Niasc <- cumsum(ni)
Hiasc <- cumsum(hi)
Nides <- rev(cumsum(rev(ni)))
Hides <- rev(cumsum(rev(hi)))
TDF_fuente <- data.frame(liminf, liminsup, MC, ni, round(hi, 2),
                         Niasc, Nides, round(Hiasc, 2),
                         round(Hides, 2))

# Totals row (matched to the columns by position in rbind()).
totales <- c(
  liminf = "-",
  liminsup = "-",
  MC = "-",
  ni = sum(ni),
  hi = round(sum(hi), 2),
  Niasc = "-",
  Nides = "-",
  Hiasc = "-",
  Hides = "-")

# Append the totals to the hist()-based table built above. Previously the
# totals were bound onto the manual table and TDF_fuente was never used.
TDF_GDP <- rbind(TDF_fuente, totales)
# Labels follow the actual column order: Niasc, Nides, Hiasc, Hides.
colnames(TDF_GDP) <- c("Limininf", "Liminsup", "MC", "ni", "hi(%)",
                       "Ni asc", "Ni desc", "Hi asc(%)", "Hi desc(%)")

# View() needs an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(TDF_GDP)
}
# Table styling
# Install the formatting packages only when they are missing, instead of
# re-downloading them on every run.
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  install.packages("kableExtra")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(kableExtra)
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
##
## The following object is masked from 'package:kableExtra':
##
## group_rows
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union

# Render the manual frequency table (with its totals row) as a styled table.
kable(TDF_GDP_total, align = "c",
      caption = "Tabla de Distribución de Frecuencias de GDP per cápita por persona de las regiones seleccionadas") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
## Tabla de Distribución de Frecuencias de GDP per cápita por persona de
## las regiones seleccionadas
##
## | Li       | Ls       | Mc        | ni   | hi   | Ni_asc(%) | Ni_desc(%) | Hi_asc | Hi_desc |
## |:--------:|:--------:|:---------:|:----:|:----:|:---------:|:----------:|:------:|:-------:|
## | 521      | 8806.58  | 4663.79   | 247  | 8.23 | 247       | 3000       | 8.23   | 100     |
## | 8806.58  | 17092.17 | 12949.375 | 232  | 7.73 | 479       | 2753       | 15.97  | 91.77   |
## | 17092.17 | 25377.75 | 21234.96  | 286  | 9.53 | 765       | 2521       | 25.5   | 84.03   |
## | 25377.75 | 33663.33 | 29520.54  | 255  | 8.5  | 1020      | 2235       | 34     | 74.5    |
## | 33663.33 | 41948.92 | 37806.125 | 242  | 8.07 | 1262      | 1980       | 42.07  | 66      |
## | 41948.92 | 50234.5  | 46091.71  | 259  | 8.63 | 1521      | 1738       | 50.7   | 57.93   |
## | 50234.5  | 58520.08 | 54377.29  | 245  | 8.17 | 1766      | 1479       | 58.87  | 49.3    |
## | 58520.08 | 66805.67 | 62662.875 | 251  | 8.37 | 2017      | 1234       | 67.23  | 41.13   |
## | 66805.67 | 75091.25 | 70948.46  | 239  | 7.97 | 2256      | 983        | 75.2   | 32.77   |
## | 75091.25 | 83376.83 | 79234.04  | 235  | 7.83 | 2491      | 744        | 83.03  | 24.8    |
## | 83376.83 | 91662.42 | 87519.625 | 267  | 8.9  | 2758      | 509        | 91.93  | 16.97   |
## | 91662.42 | 99948    | 95805.21  | 242  | 8.07 | 3000      | 242        | 100    | 8.07    |
## |          |          |           | 3000 | 100  |           |            |        |         |
# Re-open the totals table only when a viewer is available (interactive use).
if (interactive()) {
  View(TDF_GDP_total)
}

# CHARTS
# Histogram
# The bars reuse the breaks stored in Histograma_GDP, and ylim uses its
# counts, so the bars, the custom x-axis ticks and the y-range all describe
# the same classes. A single number such as breaks = 10 is only a suggestion
# to hist(), and the global `ni` is reassigned elsewhere in the script.
hist(GDP_per_cápita, breaks = Histograma_GDP$breaks,
     main = "Gráfica N°1: Distribución de GDP per cápita por persona de las regiones analizadas",
     xlab = "GDP per cápita",
     ylab = "Cantidad",
     ylim = c(0, max(Histograma_GDP$counts)),
     col = "purple",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")  # default x-axis suppressed; drawn below at the class limits
axis(1, at = Histograma_GDP$breaks,
     labels = Histograma_GDP$breaks, las = 1,
     cex.axis = 0.9)

# Global view: same histogram with the y-axis scaled to the total sample size
hist(GDP_per_cápita, breaks = Histograma_GDP$breaks,
     main = "Gráfica N°2: Distribución de GDP per cápita por persona de las regiones analizadas",
     xlab = "GDP per cápita",
     ylab = "Cantidad",
     ylim = c(0, length(GDP_per_cápita)),
     col = "purple",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")
axis(1, at = Histograma_GDP$breaks,
     labels = Histograma_GDP$breaks, las = 1,
     cex.axis = 0.9)

# Numeric copy of the percentage column. [[ ]] addresses the column by its
# exact name "hi(%)" instead of relying on `$` partial matching of "hi".
TDF_GDP$hi <- as.numeric(TDF_GDP[["hi(%)"]])

# Keep only the class rows: the totals row appended to TDF_GDP carries "-" in
# MC and must not be drawn as an extra bar.
clases_GDP <- TDF_GDP[TDF_GDP$MC != "-", ]

# Global view: percentage per class on a 0-100 scale.
# The x-axis shows class midpoints, i.e. GDP values in USD, not percentages.
barplot(clases_GDP$hi,
        space = 0,
        col = "skyblue",
        main ="Gráfica N°3: Distribución Porcentual de GDP per cápita por persona de las regiones analizadas ",
        xlab = "GDP per cápita (USD)",
        ylab = "Porcentaje (%)",
        names.arg = clases_GDP$MC,
        ylim = c(0, 100))

# Local view: counts per class, y-axis scaled to the tallest class.
# Breaks and counts both come from Histograma_GDP so bars, ticks and ylim
# agree with each other.
hist(GDP_per_cápita, breaks = Histograma_GDP$breaks,
     main = "Gráfica N°4: Distribución de GDP per cápita por persona de las regiones analizadas",
     xlab = "GDP per cápita (USD)",
     ylab = "Cantidad",
     ylim = c(0, max(Histograma_GDP$counts)),
     col = "purple",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")  # default x-axis suppressed; drawn below at the class limits
axis(1, at = Histograma_GDP$breaks,
     labels = Histograma_GDP$breaks, las = 1,
     cex.axis = 0.9)

# Local view of the percentages (y-axis limited to 0-14 %).
barplot(clases_GDP$hi, space = 0,
        col = "lightblue",
        main ="Gráfica N°5: Distribución Porcentual de GDP per cápita por persona de las regiones analizadas ",
        xlab = "GDP per cápita (USD)",
        ylab = "Porcentaje (%)",
        ylim = c(0, 14),
        names.arg = clases_GDP$MC)

# Ascending and descending ogives (absolute cumulative frequencies).
# Both ogive charts share the same horizontal range and tick positions.
rango_x <- c(min(Li), max(Ls))
marcas_x <- seq(round(min(Li), -2), round(max(Ls), -2), by = 10000)

# Descending curve is drawn against the lower limits ...
plot(x = Li, y = Ni_desc,
     type = "o",
     lwd = 3,
     col = "skyblue",
     xlim = rango_x,
     ylim = c(0, max(Ni_asc) * 1.05),
     main = "Gráfica N°6: Frecuencias Acumuladas Ascendentes y Descendentes del GDP per cápita",
     xlab = "GDP per cápita (USD)",
     ylab = "Frecuencia acumulada absoluta",
     cex.axis = 0.8,
     las = 1,
     xaxt = "n")
# ... and the ascending curve against the upper limits.
lines(x = Ls, y = Ni_asc,
      type = "o",
      lwd = 3,
      col = "pink")
axis(side = 1, at = marcas_x)

# Ascending and descending ogives in percentage terms.
plot(x = Li, y = Hi_desc,
     type = "o",
     lwd = 2,
     col = "red",
     xlim = rango_x,
     ylim = c(0, 100),
     main = "Gráfica N°7: Ojivas Ascendente y Descendente del GDP per cápita",
     xlab = "GDP per cápita (USD)",
     ylab = "Frecuencia acumulada relativa (%)",
     xaxt = "n")
lines(x = Ls, y = Hi_asc,
      type = "o",
      lwd = 2,
      col = "blue")
axis(side = 1, at = marcas_x)

# Box plot of the variable, drawn horizontally.
boxplot(GDP_per_cápita,
        horizontal = TRUE,
        col = "lightblue",
        ylab = "GDP per cápita",
        main = "Gráfica N°: Distribución de frecuencias de de GDP per cápita por persona de las regiones analizadas")

# STATISTICAL INDICATORS
# Measures of central tendency
# Arithmetic mean
media <- round(mean(GDP_per_cápita), 2)
media
## [1] 50036.2
# Mode: midpoint(s) of the class(es) with the highest absolute frequency.
# TDF_GDP_total stores everything as text and ends with a totals row ("-" in
# Mc), so that row is dropped and the values are converted back to numbers
# before comparing. The midpoint column is named "Mc"; the previous lookup of
# "MC" matched no column and returned NULL.
clases_total <- TDF_GDP_total[TDF_GDP_total$Mc != "-", ]
ni_clases <- as.numeric(clases_total$ni)
max_frecuencia <- max(ni_clases)
moda <- as.numeric(clases_total$Mc[ni_clases == max_frecuencia])
moda
# Median
mediana <- median(GDP_per_cápita)
mediana
## [1] 49621.5
# MEASURES OF DISPERSION #
# Variance
varianza <- var(GDP_per_cápita)
varianza
## [1] 817888531
# Standard deviation
# NOTE: the result is stored under the name `sd`, the same name as the
# function; the name is kept because later code reads this variable.
sd <- sd(GDP_per_cápita)
sd
## [1] 28598.75
# Coefficient of variation (%), computed from the already rounded mean
cv <- round((sd / media) * 100, 2)
cv
## [1] 57.16
# MEASURES OF SHAPE #
# Skewness
# Install e1071 only when it is missing. The unconditional reinstall in the
# recorded run failed to replace the existing copy ("cannot remove prior
# installation ... Permission denied").
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)
asimetria <- skewness(GDP_per_cápita, type = 2)
asimetria
## [1] 0.0291647
# Kurtosis
# NOTE(review): skewness is requested with type = 2 while kurtosis uses the
# function's default type — confirm that mixing estimator types is intended.
curtosis <- kurtosis(GDP_per_cápita)
curtosis
## [1] -1.208876
# Summary table of the indicators.
# The range is taken from the data instead of a hard-coded text; the previous
# literal "[10.03 ;89.98]" and the "(%)" unit did not describe this variable,
# which comes from the `GDP per Capita (USD)` column.
rango_gdp <- paste0("[", format(min(GDP_per_cápita), scientific = FALSE),
                    " ;", format(max(GDP_per_cápita), scientific = FALSE), "]")

# Count outliers with boxplot.stats() so the statement in the table reflects
# the data rather than a fixed sentence.
n_atipicos <- length(boxplot.stats(GDP_per_cápita)$out)
texto_atipicos <- if (n_atipicos == 0) {
  "No hay presencia de valores atipicos"
} else {
  paste("Hay", n_atipicos, "valores atipicos")
}

tabla_indicadores <- data.frame("Variable" = c("GDP per cápita por persona (USD)"),
                                "Rango" = c(rango_gdp),
                                "X" = c(media),
                                "Me" = c(round(mediana, 2)),
                                # NOTE(review): the mode is reported as a fixed text, not computed here
                                "Mo" = c("No hay moda"),
                                "V" = c(round(varianza, 2)),
                                "Sd" = c(round(sd, 2)),
                                "Cv" = c(cv),
                                "As" = c(round(asimetria, 4)),
                                "K" = c(round(curtosis, 2)),
                                "Valores Atipicos" = texto_atipicos)
library(knitr)
kable(tabla_indicadores, align = 'c', caption = "Conclusiones de la variable
GDP per cápita por persona de las regiones analizadas ")
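## Conclusiones de la variable GDP per cápita por persona de las
## regiones analizadas
##
## | Variable | Rango | X | Me | Mo | V | Sd | Cv | As | K | Valores.Atipicos |
## |:--------:|:-----:|:-:|:--:|:--:|:-:|:--:|:--:|:--:|:-:|:----------------:|
## | GDP per cápita por persona (%) | [10.03 ;89.98] | 50036.2 | 49621.5 | No hay moda | 817888531 | 28598.75 | 57.16 | 0.0292 | -1.21 | No hay presencia de valores atipicos |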