##libraries

Import data

fishermen = read.csv("https://raw.githubusercontent.com/connormckee1323/STA-321/main/fishermen_mercury.csv", header = TRUE)

Pairwise Association

pairs(fishermen, main ="Pair-wise Association: Scatter Plot") #this makes a pairwise association with each variables to see possible linear regression

Fit an ordinary least square regression (SLR) to capture the linear relationship between the two variables.

FID = which(fishermen$fisherman=="1") #creating variable for fisherman with value 1
Fyes = fishermen[FID,]                #creating object for fishermen
Fno = fishermen[-FID,]                #creating object for non-fishermen

plot(fishermen$MeHg, fishermen$TotHg, pch=16, col="white", #creating scatterplot
                        xlab = "Methlyn Mercury",          #x-axis name
                        ylab = "Total Mercury",            #y-axis name
                        main = "Methlyn Mercury vs Total Mercury",  #graph name
                        col.main = "navy",                 #main title color
                        cex.main = 0.8,                    #main title size
                        bty="n")
points(fishermen$TotHg[FID], fishermen$MeHg[FID], pch=16, col=alpha("darkred", 0.5)) #if a fisherman, the point will be darkred
points(fishermen$TotHg[-FID], fishermen$MeHg[-FID], pch=19, col=alpha("blue", 0.5)) #if not a fisherman, the point will be blue

We see a positive linear relationship between methlyn mercury and total mercury.

Residual Analysis

total = fishermen$TotHg #creating object from fishermen data for Tothg
methlyn = fishermen$MeHg #creating object from fishermen data for MeHg
m0 = lm(total~methlyn) 
par(mfrow = c(2,2)) #creates a 2x2 table of the graphs
plot(m0) #residual analysis for my linear regression

By looking at the normal Q-Q plot the residuals violate the normality assumption.

Estimating the bootstrap confidence intervals of regression coefficients

total = fishermen$TotHg #creating object from fishermen data for Tothg
methlyn = fishermen$MeHg #creating object from fishermen data for MeHg
m0 = lm(total~methlyn)   #creating object for linear regression
E = resid(m0)            # Original residuals
a.hat = coef(m0)[1]
b.hat = coef(m0)[2]
##
B = 1000                 # generating 1000 bootstrap samples
bt.alpha = rep(0, B)
bt.beta = bt.alpha
for(i in 1:B){
  bt.e = sample(E, replace = TRUE)          # bootstrap residuals
  y.hat = a.hat + b.hat*methlyn + bt.e     # bootstrap heights
  ## bootstrap SLR
  bt.m = lm(y.hat ~ methlyn)
  bt.alpha[i] = coef(bt.m)[1]
  bt.beta[i] = coef(bt.m)[2]
 }
alpha.CI = quantile(bt.alpha, c(0.025, 0.975))
beta.CI = quantile(bt.beta, c(0.025, 0.975))
##
per.025 = c(alpha.CI[1],beta.CI[1])     # lower CI for alpha and beta
per.975 = c(alpha.CI[2],beta.CI[2])     # upper CI for alpha and beta  

#Adding confidence limits from the SLR from my orginal sample


lm.inference = as.data.frame((summary(m0))$coef)
lm.inference$per.025 = per.025
lm.inference$per.975 = per.975
kable(as.matrix(lm.inference))
Estimate Std. Error t value Pr(>|t|) per.025 per.975
(Intercept) 0.0269212 0.0164865 1.632925 0.1048499 -0.0041639 0.0609286
methlyn 1.0286576 0.0035667 288.402949 0.0000000 1.0223115 1.0359729

The 95% confidence interval for the intercept is (-0.003416, 0.0615877). The 95% confidence interval for the slope is (1.022316, 1.0364294). We have a p-value of 0 for the 95% confidence interval of the slope.

Constructing bootstrap confidence intervals of regression coefficients

B <- 1000    # number of bootstrap replicates
# define empty vectors to store bootstrap regression coefficients
boot.beta0 <- NULL 
boot.beta1 <- NULL
## bootstrap regression models using for-loop
vec.id <- 1:length(total)   # vector of observation ID
for(i in 1:B){
  boot.id <- sample(vec.id, length(total), replace = TRUE)   # bootstrap obs ID.
  boot.total <- total[boot.id]           # bootstrap price
  boot.methlyn <- methlyn[boot.id]     # corresponding bootstrap distance
  ## regression
  boot.reg <-lm(total[boot.id] ~ methlyn[boot.id]) 
  boot.beta0[i] <- coef(boot.reg)[1]   # bootstrap intercept
  boot.beta1[i] <- coef(boot.reg)[2]   # bootstrap slope
}
##  95% bootstrap confidence intervals
boot.beta0.ci <- quantile(boot.beta0, c(0.025, 0.975), type = 2)
boot.beta1.ci <- quantile(boot.beta1, c(0.025, 0.975), type = 2)
boot.coef <- data.frame(rbind(boot.beta0.ci, boot.beta1.ci)) 
names(boot.coef) <- c("2.5%", "97.5%")
kable(boot.coef, caption="Bootstrap confidence intervals of regression coefficients.")
Bootstrap confidence intervals of regression coefficients.
2.5% 97.5%
boot.beta0.ci 0.0062304 0.0503766
boot.beta1.ci 1.0218428 1.0332058

The 95% confidence interval of the intercept is (0.0066099, 0.048841). The 95% bootstrap confidence interval of the slope is (1.0221222, 1.033939). They are both positive so both variables are positively associated. They are statistically correlated because 0 is not in the interval. The bootstrap confidence intervals or regression are better to use because they have a smaller confidence interval.