project2 2.0

Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto, see https://quarto.org.

Running Code

When you click the Render button, a document will be generated that includes both content and the output of embedded code. You can embed code like this:

1 + 1
[1] 2

You can add options to executable code like this:

[1] 4

The echo: false option disables the printing of code (only output is displayed).

library(readr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Read the water pollution / disease CSV into `df`.
# NOTE(review): the path points into the author's home Desktop folder, so the
# document only renders on a machine that has the file there -- consider a
# project-relative path.
df <- read_csv("~/Desktop/water_pollution_disease.csv")
Rows: 3000 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): Country, Region, Water Source Type, Water Treatment Method
dbl (20): Year, Contaminant Level (ppm), pH Level, Turbidity (NTU), Dissolve...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Restrict the data to observations whose water source is a river, then print
# per-column summary statistics for that subset.
river_data <- filter(df, `Water Source Type` == "River")
summary(river_data)
   Country             Region               Year      Water Source Type 
 Length:538         Length:538         Min.   :2000   Length:538        
 Class :character   Class :character   1st Qu.:2005   Class :character  
 Mode  :character   Mode  :character   Median :2012   Mode  :character  
                                       Mean   :2012                     
                                       3rd Qu.:2018                     
                                       Max.   :2024                     
 Contaminant Level (ppm)    pH Level     Turbidity (NTU)
 Min.   :0.010           Min.   :6.010   Min.   :0.010  
 1st Qu.:2.530           1st Qu.:6.640   1st Qu.:1.340  
 Median :4.995           Median :7.280   Median :2.585  
 Mean   :4.953           Mean   :7.265   Mean   :2.547  
 3rd Qu.:7.428           3rd Qu.:7.918   3rd Qu.:3.690  
 Max.   :9.950           Max.   :8.490   Max.   :4.990  
 Dissolved Oxygen (mg/L) Nitrate Level (mg/L) Lead Concentration (µg/L)
 Min.   : 3.000          Min.   : 0.06        Min.   : 0.010           
 1st Qu.: 4.737          1st Qu.:12.78        1st Qu.: 5.320           
 Median : 6.200          Median :25.97        Median : 9.670           
 Mean   : 6.420          Mean   :25.51        Mean   : 9.875           
 3rd Qu.: 8.300          3rd Qu.:38.31        3rd Qu.:14.752           
 Max.   :10.000          Max.   :49.92        Max.   :19.940           
 Bacteria Count (CFU/mL) Water Treatment Method
 Min.   :  10            Length:538            
 1st Qu.:1201            Class :character      
 Median :2380            Mode  :character      
 Mean   :2473                                  
 3rd Qu.:3843                                  
 Max.   :4994                                  
 Access to Clean Water (% of Population) Diarrheal Cases per 100,000 people
 Min.   :30.08                           Min.   :  0.0                     
 1st Qu.:48.24                           1st Qu.:109.2                     
 Median :64.12                           Median :239.0                     
 Mean   :64.58                           Mean   :242.0                     
 3rd Qu.:81.37                           3rd Qu.:371.5                     
 Max.   :99.94                           Max.   :499.0                     
 Cholera Cases per 100,000 people Typhoid Cases per 100,000 people
 Min.   : 0.00                    Min.   : 0.00                   
 1st Qu.:14.00                    1st Qu.:23.00                   
 Median :25.50                    Median :48.00                   
 Mean   :25.27                    Mean   :49.07                   
 3rd Qu.:38.00                    3rd Qu.:73.75                   
 Max.   :49.00                    Max.   :99.00                   
 Infant Mortality Rate (per 1,000 live births) GDP per Capita (USD)
 Min.   : 2.06                                 Min.   :  572       
 1st Qu.:27.94                                 1st Qu.:26855       
 Median :53.01                                 Median :51919       
 Mean   :52.15                                 Mean   :51433       
 3rd Qu.:76.35                                 3rd Qu.:77048       
 Max.   :99.88                                 Max.   :99947       
 Healthcare Access Index (0-100) Urbanization Rate (%)
 Min.   : 0.19                   Min.   :10.10        
 1st Qu.:25.14                   1st Qu.:29.34        
 Median :50.33                   Median :48.45        
 Mean   :50.42                   Mean   :49.25        
 3rd Qu.:76.04                   3rd Qu.:69.06        
 Max.   :99.74                   Max.   :89.96        
 Sanitation Coverage (% of Population) Rainfall (mm per year) Temperature (°C)
 Min.   :20.01                         Min.   : 205.0         Min.   : 0.150  
 1st Qu.:41.34                         1st Qu.: 889.8         1st Qu.: 9.912  
 Median :60.17                         Median :1634.0         Median :21.165  
 Mean   :61.08                         Mean   :1601.0         Mean   :20.562  
 3rd Qu.:81.28                         3rd Qu.:2320.5         3rd Qu.:31.485  
 Max.   :99.87                         Max.   :2993.0         Max.   :39.980  
 Population Density (people per km²)
 Min.   : 10.0                      
 1st Qu.:278.8                      
 Median :529.5                      
 Mean   :521.6                      
 3rd Qu.:765.8                      
 Max.   :999.0                      
library(dplyr)
# Short, syntactic names for the measurement columns used in the regression,
# written as new_name = "original column name".
short_names <- c(
  Nitrate = "Nitrate Level (mg/L)",
  DissolvedOxygen = "Dissolved Oxygen (mg/L)",
  pH = "pH Level",
  Turbidity = "Turbidity (NTU)",
  Temperature = "Temperature (°C)",
  Rainfall = "Rainfall (mm per year)",
  Bacteria = "Bacteria Count (CFU/mL)",
  Lead = "Lead Concentration (µg/L)"
)

# Rename in place (column order is preserved). any_of() skips names that are
# not present, so re-running this chunk after the rename is a no-op; a column
# that is genuinely missing is reported by select() just below.
river_data <- river_data %>%
  rename(any_of(short_names))

# Keep the response plus the seven predictors and drop every row with a
# missing value in any of them, so the model is fitted on complete cases.
multi_data <- river_data %>%
  select(
    DissolvedOxygen, Nitrate, pH, Turbidity, Temperature, Rainfall, Bacteria,
    Lead
  ) %>%
  filter(if_all(everything(), ~ !is.na(.x)))

# Multiple linear regression of dissolved oxygen on the remaining
# water-quality measurements.
model_multi <- lm(
  DissolvedOxygen ~ Nitrate + pH + Turbidity + Temperature + Rainfall +
    Bacteria + Lead,
  data = multi_data
)

summary(model_multi)

Call:
lm(formula = DissolvedOxygen ~ Nitrate + pH + Turbidity + Temperature + 
    Rainfall + Bacteria + Lead, data = multi_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.8303 -1.7388 -0.2013  1.8426  3.9137 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  6.128e+00  9.755e-01   6.282 6.97e-10 ***
Nitrate     -1.770e-02  6.212e-03  -2.849  0.00456 ** 
pH           4.962e-02  1.229e-01   0.404  0.68663    
Turbidity   -2.686e-02  6.247e-02  -0.430  0.66734    
Temperature  5.150e-03  7.508e-03   0.686  0.49306    
Rainfall     6.189e-05  1.092e-04   0.567  0.57110    
Bacteria     5.045e-05  6.062e-05   0.832  0.40562    
Lead         1.230e-02  1.583e-02   0.777  0.43750    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.054 on 530 degrees of freedom
Multiple R-squared:  0.01924,   Adjusted R-squared:  0.006291 
F-statistic: 1.486 on 7 and 530 DF,  p-value: 0.1697
# Diagnostic panel 1 of plot.lm: residuals against fitted values.
plot(model_multi, which = 1)

# Shapiro-Wilk normality test on the residuals of model_multi.
shapiro.test(residuals(model_multi))

    Shapiro-Wilk normality test

data:  residuals(model_multi)
W = 0.95445, p-value = 7.817e-12
# Diagnostic panel 4 of plot.lm: Cook's distance for each observation.
plot(model_multi, which = 4)

# `model` uses exactly the same formula and data as `model_multi` fitted
# earlier, so reuse that fit instead of estimating it a second time; this keeps
# the two objects from drifting apart if the specification is edited later.
model <- model_multi
summary(model)

Call:
lm(formula = DissolvedOxygen ~ Nitrate + pH + Turbidity + Temperature + 
    Rainfall + Bacteria + Lead, data = multi_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.8303 -1.7388 -0.2013  1.8426  3.9137 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  6.128e+00  9.755e-01   6.282 6.97e-10 ***
Nitrate     -1.770e-02  6.212e-03  -2.849  0.00456 ** 
pH           4.962e-02  1.229e-01   0.404  0.68663    
Turbidity   -2.686e-02  6.247e-02  -0.430  0.66734    
Temperature  5.150e-03  7.508e-03   0.686  0.49306    
Rainfall     6.189e-05  1.092e-04   0.567  0.57110    
Bacteria     5.045e-05  6.062e-05   0.832  0.40562    
Lead         1.230e-02  1.583e-02   0.777  0.43750    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.054 on 530 degrees of freedom
Multiple R-squared:  0.01924,   Adjusted R-squared:  0.006291 
F-statistic: 1.486 on 7 and 530 DF,  p-value: 0.1697
# Shapiro-Wilk normality test on the residuals of `model`.
shapiro.test(residuals(model))

    Shapiro-Wilk normality test

data:  residuals(model)
W = 0.95445, p-value = 7.817e-12
# Pull the p-value of the Nitrate coefficient out of the coefficient table.
coef(summary(model_multi))["Nitrate", "Pr(>|t|)"]
[1] 0.004558633
library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
# Variance inflation factor for each predictor of model_multi (car::vif),
# used as a multicollinearity check.
vif(model_multi)
    Nitrate          pH   Turbidity Temperature    Rainfall    Bacteria 
   1.009911    1.018408    1.008851    1.008609    1.006151    1.013103 
       Lead 
   1.009869