##libraries
fishermen = read.csv("https://raw.githubusercontent.com/connormckee1323/STA-321/main/fishermen_mercury.csv", header = TRUE)
pairs(fishermen, main ="Pair-wise Association: Scatter Plot") #this makes a pairwise association with each variables to see possible linear regression
FID = which(fishermen$fisherman=="1") #creating variable for fisherman with value 1
Fyes = fishermen[FID,] #creating object for fishermen
Fno = fishermen[-FID,] #creating object for non-fishermen
plot(fishermen$MeHg, fishermen$TotHg, pch=16, col="white", #creating scatterplot
xlab = "Methlyn Mercury", #x-axis name
ylab = "Total Mercury", #y-axis name
main = "Methlyn Mercury vs Total Mercury", #graph name
col.main = "navy", #main title color
cex.main = 0.8, #main title size
bty="n")
points(fishermen$TotHg[FID], fishermen$MeHg[FID], pch=16, col=alpha("darkred", 0.5)) #if a fisherman, the point will be darkred
points(fishermen$TotHg[-FID], fishermen$MeHg[-FID], pch=19, col=alpha("blue", 0.5)) #if not a fisherman, the point will be blue
We see a positive linear relationship between methlyn mercury and total
mercury.
total = fishermen$TotHg #creating object from fishermen data for Tothg
methlyn = fishermen$MeHg #creating object from fishermen data for MeHg
m0 = lm(total~methlyn)
par(mfrow = c(2,2)) #creates a 2x2 table of the graphs
plot(m0) #residual analysis for my linear regression
By looking at the normal Q-Q plot the residuals violate the normality
assumption.
total = fishermen$TotHg #creating object from fishermen data for Tothg
methlyn = fishermen$MeHg #creating object from fishermen data for MeHg
m0 = lm(total~methlyn) #creating object for linear regression
E = resid(m0) # Original residuals
a.hat = coef(m0)[1]
b.hat = coef(m0)[2]
##
B = 1000 # generating 1000 bootstrap samples
bt.alpha = rep(0, B)
bt.beta = bt.alpha
for(i in 1:B){
bt.e = sample(E, replace = TRUE) # bootstrap residuals
y.hat = a.hat + b.hat*methlyn + bt.e # bootstrap heights
## bootstrap SLR
bt.m = lm(y.hat ~ methlyn)
bt.alpha[i] = coef(bt.m)[1]
bt.beta[i] = coef(bt.m)[2]
}
alpha.CI = quantile(bt.alpha, c(0.025, 0.975))
beta.CI = quantile(bt.beta, c(0.025, 0.975))
##
per.025 = c(alpha.CI[1],beta.CI[1]) # lower CI for alpha and beta
per.975 = c(alpha.CI[2],beta.CI[2]) # upper CI for alpha and beta
#Adding confidence limits from the SLR from my orginal sample
lm.inference = as.data.frame((summary(m0))$coef)
lm.inference$per.025 = per.025
lm.inference$per.975 = per.975
kable(as.matrix(lm.inference))
| Estimate | Std. Error | t value | Pr(>|t|) | per.025 | per.975 | |
|---|---|---|---|---|---|---|
| (Intercept) | 0.0269212 | 0.0164865 | 1.632925 | 0.1048499 | -0.0041639 | 0.0609286 |
| methlyn | 1.0286576 | 0.0035667 | 288.402949 | 0.0000000 | 1.0223115 | 1.0359729 |
The 95% confidence interval for the intercept is (-0.003416, 0.0615877). The 95% confidence interval for the slope is (1.022316, 1.0364294). We have a p-value of 0 for the 95% confidence interval of the slope.
B <- 1000 # number of bootstrap replicates
# define empty vectors to store bootstrap regression coefficients
boot.beta0 <- NULL
boot.beta1 <- NULL
## bootstrap regression models using for-loop
vec.id <- 1:length(total) # vector of observation ID
for(i in 1:B){
boot.id <- sample(vec.id, length(total), replace = TRUE) # bootstrap obs ID.
boot.total <- total[boot.id] # bootstrap price
boot.methlyn <- methlyn[boot.id] # corresponding bootstrap distance
## regression
boot.reg <-lm(total[boot.id] ~ methlyn[boot.id])
boot.beta0[i] <- coef(boot.reg)[1] # bootstrap intercept
boot.beta1[i] <- coef(boot.reg)[2] # bootstrap slope
}
## 95% bootstrap confidence intervals
boot.beta0.ci <- quantile(boot.beta0, c(0.025, 0.975), type = 2)
boot.beta1.ci <- quantile(boot.beta1, c(0.025, 0.975), type = 2)
boot.coef <- data.frame(rbind(boot.beta0.ci, boot.beta1.ci))
names(boot.coef) <- c("2.5%", "97.5%")
kable(boot.coef, caption="Bootstrap confidence intervals of regression coefficients.")
| 2.5% | 97.5% | |
|---|---|---|
| boot.beta0.ci | 0.0062304 | 0.0503766 |
| boot.beta1.ci | 1.0218428 | 1.0332058 |
The 95% confidence interval of the intercept is (0.0066099, 0.048841). The 95% bootstrap confidence interval of the slope is (1.0221222, 1.033939). They are both positive so both variables are positively associated. They are statistically correlated because 0 is not in the interval. The bootstrap confidence intervals or regression are better to use because they have a smaller confidence interval.