project 2

Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.

data exploration

library(readr)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

river_data <- read_csv("~/Desktop/water_pollution_disease.csv") %>%
  filter(`Water Source Type` == "River") %>%
  rename(
    Nitrate         = `Nitrate Level (mg/L)`,
    DissolvedOxygen = `Dissolved Oxygen (mg/L)`
  )

Rows: 3000 Columns: 24

── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): Country, Region, Water Source Type, Water Treatment Method
dbl (20): Year, Contaminant Level (ppm), pH Level, Turbidity (NTU), Dissolve...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(dplyr)
library(tidyr)
library(readr)
library(knitr)

summary_tbl <- tibble::tibble(
  Variable = c("Nitrate", "DissolvedOxygen"),
  N        = nrow(river_data),
  Mean     = c(mean(river_data$Nitrate, na.rm=TRUE),
               mean(river_data$DissolvedOxygen, na.rm=TRUE)),
  SD       = c(sd(river_data$Nitrate, na.rm=TRUE),
               sd(river_data$DissolvedOxygen, na.rm=TRUE)),
  Min      = c(min(river_data$Nitrate, na.rm=TRUE),
               min(river_data$DissolvedOxygen, na.rm=TRUE)),
  Max      = c(max(river_data$Nitrate, na.rm=TRUE),
               max(river_data$DissolvedOxygen, na.rm=TRUE))
) %>%
  mutate(across(Mean:Max, ~ round(.x, 2)))

knitr::kable(
  summary_tbl,
  caption   = "Summary Table 1: Summary Statistics for Nitrate and Dissolved Oxygen",
  col.names = c("Variable", "N", "Mean", "SD", "Min", "Max")
)

Summary Table 1: Summary Statistics for Nitrate and Dissolved Oxygen
Variable	N	Mean	SD	Min	Max
Nitrate	538	25.51	14.34	0.06	49.92
DissolvedOxygen	538	6.42	2.06	3.00	10.00

histogram for dissolved oxygen

library(ggplot2)

ggplot(river_data, aes(x = DissolvedOxygen)) +
  geom_histogram(binwidth=0.3,
                 fill="lightblue", color="black", alpha=0.7) +
  labs(title="Distribution of Dissolved Oxygen",
       x="Dissolved Oxygen (mg/L)",
       y="Frequency") +
  theme_minimal()

histogram for Nitrate level

library(ggplot2)

ggplot(river_data, aes(x = Nitrate)) +
  geom_histogram(
    binwidth = 2,
    fill     = "lightblue",
    color    = "black",
    alpha    = 0.7
  ) +
  labs(
    title = "Distribution of Nitrate Level in Rivers",
    x     = "Nitrate (mg/L)",
    y     = "Frequency"
  ) +
  theme_minimal()

country comparison

with(subset(river_data, !is.na(Nitrate) & !is.na(DissolvedOxygen)),
     plot(Nitrate, DissolvedOxygen,
          xlab = "Nitrate (mg/L)", ylab = "Dissolved Oxygen (mg/L)",
          main = "Scatterplot: Nitrate vs Dissolved Oxygen"))

abline(lm(DissolvedOxygen ~ Nitrate, data = river_data),
       col = "blue", lwd = 2)

The scatter plot for linear function

cor.test(river_data$Nitrate, river_data$DissolvedOxygen)


    Pearson's product-moment correlation

data:  river_data$Nitrate and river_data$DissolvedOxygen
t = -2.8187, df = 536, p-value = 0.005
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.20331299 -0.03669623
sample estimates:
       cor 
-0.1208557

model <- lm(DissolvedOxygen ~ Nitrate, data = river_data)
summary(model)


Call:
lm(formula = DissolvedOxygen ~ Nitrate, data = river_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7040 -1.7445 -0.1465  1.7775  3.7577 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  6.862752   0.180264  38.071   <2e-16 ***
Nitrate     -0.017366   0.006161  -2.819    0.005 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.047 on 536 degrees of freedom
Multiple R-squared:  0.01461,   Adjusted R-squared:  0.01277 
F-statistic: 7.945 on 1 and 536 DF,  p-value: 0.005

Coding for multiple linear regression

library(dplyr)
colnames(river_data)[which(names(river_data) == "Nitrate Level (mg/L)")] <- "Nitrate"
colnames(river_data)[which(names(river_data) == "Dissolved Oxygen (mg/L)")] <- "DissolvedOxygen"
colnames(river_data)[which(names(river_data) == "pH Level")] <- "pH"
colnames(river_data)[which(names(river_data) == "Turbidity (NTU)")] <- "Turbidity"
colnames(river_data)[which(names(river_data) == "Temperature (°C)")] <- "Temperature"
colnames(river_data)[which(names(river_data) == "Rainfall (mm per year)")] <- "Rainfall"
colnames(river_data)[which(names(river_data) == "Bacteria Count (CFU/mL)")] <- "Bacteria"
colnames(river_data)[which(names(river_data) == "Lead Concentration (µg/L)")] <- "Lead"

multi_data <- river_data %>%
  select(DissolvedOxygen, Nitrate, pH, Turbidity, Temperature, Rainfall, Bacteria, Lead) %>%
  filter(if_all(everything(), ~ !is.na(.)))
model_multi <- lm(DissolvedOxygen ~ Nitrate + pH + Turbidity + Temperature + Rainfall + Bacteria + Lead,
                  data = multi_data)

summary(model_multi)


Call:
lm(formula = DissolvedOxygen ~ Nitrate + pH + Turbidity + Temperature + 
    Rainfall + Bacteria + Lead, data = multi_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.8303 -1.7388 -0.2013  1.8426  3.9137 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  6.128e+00  9.755e-01   6.282 6.97e-10 ***
Nitrate     -1.770e-02  6.212e-03  -2.849  0.00456 ** 
pH           4.962e-02  1.229e-01   0.404  0.68663    
Turbidity   -2.686e-02  6.247e-02  -0.430  0.66734    
Temperature  5.150e-03  7.508e-03   0.686  0.49306    
Rainfall     6.189e-05  1.092e-04   0.567  0.57110    
Bacteria     5.045e-05  6.062e-05   0.832  0.40562    
Lead         1.230e-02  1.583e-02   0.777  0.43750    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.054 on 530 degrees of freedom
Multiple R-squared:  0.01924,   Adjusted R-squared:  0.006291 
F-statistic: 1.486 on 7 and 530 DF,  p-value: 0.1697

fitted_vals <- fitted(model_multi)
resid_vals  <- residuals(model_multi)


plot(fitted_vals, resid_vals,
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted with Thick blue Smooth Line",
     pch = 16, col = "darkgray", cex = 0.9)

sm <- lowess(fitted_vals, resid_vals, f = 2/3)  
lines(sm, col = "blue", lwd = 3)

shapiro.test(residuals(model_multi))


    Shapiro-Wilk normality test

data:  residuals(model_multi)
W = 0.95445, p-value = 7.817e-12

model <- lm(
  DissolvedOxygen ~ Nitrate + pH + Turbidity + Temperature + Rainfall + Bacteria + Lead,
  data = multi_data
)
summary(model)


Call:
lm(formula = DissolvedOxygen ~ Nitrate + pH + Turbidity + Temperature + 
    Rainfall + Bacteria + Lead, data = multi_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.8303 -1.7388 -0.2013  1.8426  3.9137 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  6.128e+00  9.755e-01   6.282 6.97e-10 ***
Nitrate     -1.770e-02  6.212e-03  -2.849  0.00456 ** 
pH           4.962e-02  1.229e-01   0.404  0.68663    
Turbidity   -2.686e-02  6.247e-02  -0.430  0.66734    
Temperature  5.150e-03  7.508e-03   0.686  0.49306    
Rainfall     6.189e-05  1.092e-04   0.567  0.57110    
Bacteria     5.045e-05  6.062e-05   0.832  0.40562    
Lead         1.230e-02  1.583e-02   0.777  0.43750    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.054 on 530 degrees of freedom
Multiple R-squared:  0.01924,   Adjusted R-squared:  0.006291 
F-statistic: 1.486 on 7 and 530 DF,  p-value: 0.1697

shapiro.test(residuals(model))


    Shapiro-Wilk normality test

data:  residuals(model)
W = 0.95445, p-value = 7.817e-12

summary(model_multi)$coefficients["Nitrate", "Pr(>|t|)"]

[1] 0.004558633

library(car)

Loading required package: carData


Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

vif(model_multi)

    Nitrate          pH   Turbidity Temperature    Rainfall    Bacteria 
   1.009911    1.018408    1.008851    1.008609    1.006151    1.013103 
       Lead 
   1.009869