project 2

Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto, see https://quarto.org.

Running Code

When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:

1 + 1
[1] 2

You can add options to executable code like this:

[1] 4

The echo: false option disables the printing of code (only output is displayed).

library(readr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Load the water-pollution / disease CSV from the user's Desktop into `df`.
df <- read_csv(file = "~/Desktop/water_pollution_disease.csv")
Rows: 3000 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): Country, Region, Water Source Type, Water Treatment Method
dbl (20): Year, Contaminant Level (ppm), pH Level, Turbidity (NTU), Dissolve...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the rows whose water source type is "River", then print
# per-column summary statistics for that subset.
river_data <- filter(df, `Water Source Type` == "River")
summary(river_data)
   Country             Region               Year      Water Source Type 
 Length:538         Length:538         Min.   :2000   Length:538        
 Class :character   Class :character   1st Qu.:2005   Class :character  
 Mode  :character   Mode  :character   Median :2012   Mode  :character  
                                       Mean   :2012                     
                                       3rd Qu.:2018                     
                                       Max.   :2024                     
 Contaminant Level (ppm)    pH Level     Turbidity (NTU)
 Min.   :0.010           Min.   :6.010   Min.   :0.010  
 1st Qu.:2.530           1st Qu.:6.640   1st Qu.:1.340  
 Median :4.995           Median :7.280   Median :2.585  
 Mean   :4.953           Mean   :7.265   Mean   :2.547  
 3rd Qu.:7.428           3rd Qu.:7.918   3rd Qu.:3.690  
 Max.   :9.950           Max.   :8.490   Max.   :4.990  
 Dissolved Oxygen (mg/L) Nitrate Level (mg/L) Lead Concentration (µg/L)
 Min.   : 3.000          Min.   : 0.06        Min.   : 0.010           
 1st Qu.: 4.737          1st Qu.:12.78        1st Qu.: 5.320           
 Median : 6.200          Median :25.97        Median : 9.670           
 Mean   : 6.420          Mean   :25.51        Mean   : 9.875           
 3rd Qu.: 8.300          3rd Qu.:38.31        3rd Qu.:14.752           
 Max.   :10.000          Max.   :49.92        Max.   :19.940           
 Bacteria Count (CFU/mL) Water Treatment Method
 Min.   :  10            Length:538            
 1st Qu.:1201            Class :character      
 Median :2380            Mode  :character      
 Mean   :2473                                  
 3rd Qu.:3843                                  
 Max.   :4994                                  
 Access to Clean Water (% of Population) Diarrheal Cases per 100,000 people
 Min.   :30.08                           Min.   :  0.0                     
 1st Qu.:48.24                           1st Qu.:109.2                     
 Median :64.12                           Median :239.0                     
 Mean   :64.58                           Mean   :242.0                     
 3rd Qu.:81.37                           3rd Qu.:371.5                     
 Max.   :99.94                           Max.   :499.0                     
 Cholera Cases per 100,000 people Typhoid Cases per 100,000 people
 Min.   : 0.00                    Min.   : 0.00                   
 1st Qu.:14.00                    1st Qu.:23.00                   
 Median :25.50                    Median :48.00                   
 Mean   :25.27                    Mean   :49.07                   
 3rd Qu.:38.00                    3rd Qu.:73.75                   
 Max.   :49.00                    Max.   :99.00                   
 Infant Mortality Rate (per 1,000 live births) GDP per Capita (USD)
 Min.   : 2.06                                 Min.   :  572       
 1st Qu.:27.94                                 1st Qu.:26855       
 Median :53.01                                 Median :51919       
 Mean   :52.15                                 Mean   :51433       
 3rd Qu.:76.35                                 3rd Qu.:77048       
 Max.   :99.88                                 Max.   :99947       
 Healthcare Access Index (0-100) Urbanization Rate (%)
 Min.   : 0.19                   Min.   :10.10        
 1st Qu.:25.14                   1st Qu.:29.34        
 Median :50.33                   Median :48.45        
 Mean   :50.42                   Mean   :49.25        
 3rd Qu.:76.04                   3rd Qu.:69.06        
 Max.   :99.74                   Max.   :89.96        
 Sanitation Coverage (% of Population) Rainfall (mm per year) Temperature (°C)
 Min.   :20.01                         Min.   : 205.0         Min.   : 0.150  
 1st Qu.:41.34                         1st Qu.: 889.8         1st Qu.: 9.912  
 Median :60.17                         Median :1634.0         Median :21.165  
 Mean   :61.08                         Mean   :1601.0         Mean   :20.562  
 3rd Qu.:81.28                         3rd Qu.:2320.5         3rd Qu.:31.485  
 Max.   :99.87                         Max.   :2993.0         Max.   :39.980  
 Population Density (people per km²)
 Min.   : 10.0                      
 1st Qu.:278.8                      
 Median :529.5                      
 Mean   :521.6                      
 3rd Qu.:765.8                      
 Max.   :999.0                      
# List the exact column names of the river subset.
colnames(river_data)
 [1] "Country"                                      
 [2] "Region"                                       
 [3] "Year"                                         
 [4] "Water Source Type"                            
 [5] "Contaminant Level (ppm)"                      
 [6] "pH Level"                                     
 [7] "Turbidity (NTU)"                              
 [8] "Dissolved Oxygen (mg/L)"                      
 [9] "Nitrate Level (mg/L)"                         
[10] "Lead Concentration (µg/L)"                    
[11] "Bacteria Count (CFU/mL)"                      
[12] "Water Treatment Method"                       
[13] "Access to Clean Water (% of Population)"      
[14] "Diarrheal Cases per 100,000 people"           
[15] "Cholera Cases per 100,000 people"             
[16] "Typhoid Cases per 100,000 people"             
[17] "Infant Mortality Rate (per 1,000 live births)"
[18] "GDP per Capita (USD)"                         
[19] "Healthcare Access Index (0-100)"              
[20] "Urbanization Rate (%)"                        
[21] "Sanitation Coverage (% of Population)"        
[22] "Rainfall (mm per year)"                       
[23] "Temperature (°C)"                             
[24] "Population Density (people per km²)"          
# Give the two columns of interest short, syntactic names so they can be
# used directly in formulas and with `$`.
names(river_data)[names(river_data) == "Nitrate Level (mg/L)"] <- "Nitrate"
names(river_data)[names(river_data) == "Dissolved Oxygen (mg/L)"] <-
  "DissolvedOxygen"

# Scatterplot of dissolved oxygen against nitrate, using only the rows where
# both measurements are present.
with(
  subset(river_data, !is.na(Nitrate) & !is.na(DissolvedOxygen)),
  plot(
    Nitrate, DissolvedOxygen,
    xlab = "Nitrate (mg/L)", ylab = "Dissolved Oxygen (mg/L)",
    main = "Scatterplot: Nitrate vs Dissolved Oxygen"
  )
)

# Overlay the least-squares regression line. `col` and `lwd` are graphical
# parameters and belong to abline(), not lm(): when passed to lm() they are
# disregarded with a warning and the line is drawn with default styling.
abline(
  lm(DissolvedOxygen ~ Nitrate, data = river_data),
  col = "blue", lwd = 2
)

# Pearson correlation test between nitrate and dissolved oxygen in the
# river subset.
cor.test(
  x = river_data$Nitrate,
  y = river_data$DissolvedOxygen,
  method = "pearson"
)

    Pearson's product-moment correlation

data:  river_data$Nitrate and river_data$DissolvedOxygen
t = -2.8187, df = 536, p-value = 0.005
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.20331299 -0.03669623
sample estimates:
       cor 
-0.1208557 
# Simple linear regression of dissolved oxygen on nitrate for the river
# subset; print coefficients, residual summary and fit statistics.
model <- lm(formula = DissolvedOxygen ~ Nitrate, data = river_data)
summary(model)

Call:
lm(formula = DissolvedOxygen ~ Nitrate, data = river_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7040 -1.7445 -0.1465  1.7775  3.7577 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  6.862752   0.180264  38.071   <2e-16 ***
Nitrate     -0.017366   0.006161  -2.819    0.005 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.047 on 536 degrees of freedom
Multiple R-squared:  0.01461,   Adjusted R-squared:  0.01277 
F-statistic: 7.945 on 1 and 536 DF,  p-value: 0.005