cor(eelgrass, sdi)
## [1] 0.8468619
What do you conclude? The plot probably tipped you off, no?
We can test this, if we really want. To do this, we use a t-test, testing if the correlation coefficient, r, is different from 0:
cor.test(eelgrass, sdi)
##
## Pearson's product-moment correlation
##
## data: eelgrass and sdi
## t = 13.882, df = 76, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7693097 0.8998192
## sample estimates:
## cor
## 0.8468619
Given this output, a linear model is a good fit for the data.
hist(eelgrass, col="pink")
hist(sdi, col="red")
I think they look fairly normal!
boxplot(eelgrass, col="pink", xlab="Eelgrass")
boxplot(sdi, col = "red",xlab="sdi")
The assumption of equal variance looks correct.
Now we will build our linear model. Recall the model formula: Y∼ β0+β1X1+ϵ where β0 is the estimated intercept and β1, is the estimated slope associated with explanatory variable X1, and ϵ is the unaccounted for error.
Before we do this, let’s order our x variable (this makes the downstream processes flow better).
head(eelData)
## Plot Eelgrass SDI Temp
## 1 1 756 4.18 15.152
## 2 2 486 1.73 13.774
## 3 3 588 3.21 16.225
## 4 4 742 4.73 11.387
## 5 5 586 3.35 9.021
## 6 6 976 5.04 16.545
eelData <- eelData[order(eelData$Eelgrass),] #order your data by eelgrass from smallest to largest
regression1 <- lm(SDI ~ Eelgrass, data=eelData)
summary(regression1)
##
## Call:
## lm(formula = SDI ~ Eelgrass, data = eelData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.66517 -0.67506 -0.07709 0.58083 2.15192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5081482 0.2280076 2.229 0.0288 *
## Eelgrass 0.0047423 0.0003416 13.882 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8307 on 76 degrees of freedom
## Multiple R-squared: 0.7172, Adjusted R-squared: 0.7135
## F-statistic: 192.7 on 1 and 76 DF, p-value: < 2.2e-16
regression1$coefficients
## (Intercept) Eelgrass
## 0.508148180 0.004742318
Adjusted R-squared: 0.7135
Anova1 <- anova(regression1)
You can calculate the F-stat like this too:
Anova1$`Sum Sq`[1]/Anova1$`Mean Sq`[2]
## [1] 192.7175
#where
Anova1$`Sum Sq`[1] #132.9865
## [1] 132.9865
#and
Anova1$`Mean Sq`[2] #0.6900594
## [1] 0.6900594
par(mfrow=c(1,2))
plot(regression1, which = 1:2) #specifies both residual and qqplot
Sxx <- sum((eelgrass-mean(eelgrass))^2) #the x variable
Sxx
## [1] 5913249
Sxy <- sum((eelgrass-mean(eelgrass))*(sdi-mean(sdi)))
Sxy
## [1] 28042.51
B1 <- Sxy/Sxx #SLOPE rise/run, coefficient value
B1
## [1] 0.004742318
B0 <- (mean(sdi))-B1*mean(eelgrass)
B0 #The intercept
## [1] 0.5081482
#Now calculate the variance components
TotalSS <- sum((sdi-mean(sdi))^2)
TotalSS
## [1] 185.431
YHat <- (mean(sdi)+(B1*(eelgrass-mean(eelgrass))))
YHat #these are all of the predicted values based upon your coefficient value (B1 in the equation above)
## [1] 4.0933408 2.8129148 3.2966313 4.0269483 3.2871467 5.1366508 3.8799364
## [8] 0.6646447 2.5615720 2.5331181 4.0554022 4.1549909 3.3487968 0.7357795
## [15] 4.0980831 3.0405461 4.0980831 1.9023897 3.7044707 5.0844853 2.5805412
## [22] 2.4856949 1.3238269 5.0892276 1.6605315 0.5318598 2.9409574 5.3405705
## [29] 1.1246495 2.1821865 3.7044707 2.7038415 2.5378604 2.5995105 5.4970670
## [36] 2.0351746 4.3114874 3.7708631 0.6077369 4.6576766 3.8609672 5.2267548
## [43] 0.8021719 1.9166167 4.7240691 2.0494016 3.5100356 3.6238513 2.1821865
## [50] 2.5426027 5.5018093 3.9463289 2.8650803 4.1929294 3.7376669 3.8941634
## [57] 4.0696292 2.5757989 2.7275531 3.8182863 3.4246739 3.2397235 0.7689757
## [64] 4.2688065 2.5995105 5.2836626 4.9943812 5.3642820 2.7038415 3.4863240
## [71] 4.4347877 4.4442723 4.5486033 4.2118987 4.2403526 4.8046885 4.8568540
## [78] 4.8947925
RegSS <- sum((YHat-mean(sdi))^2)
RegSS
## [1] 132.9865
ErrorSS <- TotalSS-RegSS
ErrorSS
## [1] 52.44452
#Calculate the degrees of freedom:
n <- length(sdi)
n
## [1] 78
#Calcuate the variance component
MSReg <- RegSS/1
MSReg
## [1] 132.9865
MSError <- ErrorSS/(n-2) #n-2 degrees of freedom
#Calculate the Fstat:
Fstat <- MSReg/MSError
Fstat
## [1] 192.7175
#Find F Critical
Fcrit <- qf(0.05, 1, 76, lower.tail = F)
Fcrit
## [1] 3.96676
#Now R2 or the coefficient of determination
R2 <- RegSS/TotalSS
R2
## [1] 0.7171751
est <- summary(regression1)
est
##
## Call:
## lm(formula = SDI ~ Eelgrass, data = eelData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.66517 -0.67506 -0.07709 0.58083 2.15192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5081482 0.2280076 2.229 0.0288 *
## Eelgrass 0.0047423 0.0003416 13.882 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8307 on 76 degrees of freedom
## Multiple R-squared: 0.7172, Adjusted R-squared: 0.7135
## F-statistic: 192.7 on 1 and 76 DF, p-value: < 2.2e-16
est$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.508148180 0.2280076268 2.228646 2.879345e-02
## Eelgrass 0.004742318 0.0003416097 13.882271 1.540975e-22
eelslope <- est$coefficients[2,1] #extract the slope coefficient
eelslope
## [1] 0.004742318
anotherway <- resid(regression1)
regression1$fitted.values
## 26 39 8 14 63 43 29 23
## 0.5318598 0.6077369 0.6646447 0.7357795 0.7689757 0.8021719 1.1246495 1.3238269
## 25 18 44 36 46 30 49 22
## 1.6605315 1.9023897 1.9166167 2.0351746 2.0494016 2.1821865 2.1821865 2.4856949
## 10 33 50 9 58 21 34 65
## 2.5331181 2.5378604 2.5426027 2.5615720 2.5757989 2.5805412 2.5995105 2.5995105
## 32 69 59 2 53 27 16 62
## 2.7038415 2.7038415 2.7275531 2.8129148 2.8650803 2.9409574 3.0405461 3.2397235
## 5 3 13 61 70 47 48 19
## 3.2871467 3.2966313 3.3487968 3.4246739 3.4863240 3.5100356 3.6238513 3.7044707
## 31 55 38 60 41 7 56 52
## 3.7044707 3.7376669 3.7708631 3.8182863 3.8609672 3.8799364 3.8941634 3.9463289
## 4 11 57 1 15 17 12 54
## 4.0269483 4.0554022 4.0696292 4.0933408 4.0980831 4.0980831 4.1549909 4.1929294
## 74 75 64 37 71 72 73 40
## 4.2118987 4.2403526 4.2688065 4.3114874 4.4347877 4.4442723 4.5486033 4.6576766
## 45 76 77 78 67 20 24 6
## 4.7240691 4.8046885 4.8568540 4.8947925 4.9943812 5.0844853 5.0892276 5.1366508
## 42 66 28 68 35 51
## 5.2267548 5.2836626 5.3405705 5.3642820 5.4970670 5.5018093
pred <- regression1$fitted.values
par(mfrow=c(1,1)) #Sets graphing parameters
plot(eelData$Eelgrass, eelData$SDI)+ #Plot the observed data
lines(eelData$Eelgrass, pred, lwd=2, col="blueviolet")+ #Line of best fit
points(mean(eelData$Eelgrass), mean(eelData$SDI), pch=16, col="red", cex=2)|> #We can add the means as a red dot
abline(h=mean(eelData$SDI), lty=2) #And a horizontal line at the mean of Y
## integer(0)
Confidence Intervals: “A confidence interval, in statistics, refers to the probability that a population parameter will fall between a set of values for a certain proportion of times. Analysts often use confidence intervals that contain either 95% or 99% of expected observations. Thus, if a point estimate is generated from a statistical model of 10.00 with a 95% confidence interval of 9.50 - 10.50, it can be inferred that there is a 95% probability that the true value falls within that range.”
confint(regression1) #We can use the function confint to extract the CIs.
## 2.5 % 97.5 %
## (Intercept) 0.054031635 0.962264725
## Eelgrass 0.004061944 0.005422693
AnotherwayCI <- predict(regression1, se.fit=T,interval="confidence") #We can also retrieve them from the predict function
#Run CIs to see what this returns
AnotherwayCI
## $fit
## fit lwr upr
## 26 0.5318598 0.08083997 0.9828796
## 39 0.6077369 0.16659644 1.0488773
## 8 0.6646447 0.23088217 1.0984072
## 14 0.7357795 0.31119884 1.1603601
## 63 0.7689757 0.34866379 1.1892876
## 43 0.8021719 0.38611802 1.2182258
## 29 1.1246495 0.74932741 1.4999717
## 23 1.3238269 0.97297647 1.6746774
## 25 1.6605315 1.34945425 1.9716088
## 18 1.9023897 1.61824478 2.1865347
## 44 1.9166167 1.63400324 2.1992301
## 36 2.0351746 1.76506145 2.3052879
## 46 2.0494016 1.78075515 2.3180481
## 30 2.1821865 1.92684886 2.4375242
## 49 2.1821865 1.92684886 2.4375242
## 22 2.4856949 2.25769650 2.7136933
## 10 2.5331181 2.30892814 2.7573080
## 33 2.5378604 2.31404350 2.7616773
## 50 2.5426027 2.31915740 2.7660480
## 9 2.5615720 2.33959838 2.7835456
## 58 2.5757989 2.35491355 2.7966843
## 21 2.5805412 2.36001560 2.8010669
## 34 2.5995105 2.38040858 2.8186125
## 65 2.5995105 2.38040858 2.8186125
## 32 2.7038415 2.49211531 2.9155677
## 69 2.7038415 2.49211531 2.9155677
## 59 2.7275531 2.51739060 2.9377156
## 2 2.8129148 2.60801246 3.0178172
## 53 2.8650803 2.66309416 3.0670665
## 27 2.9409574 2.74278459 3.1391303
## 16 3.0405461 2.84656323 3.2345290
## 62 3.2397235 3.05112871 3.4283183
## 5 3.2871467 3.09921609 3.4750772
## 3 3.2966313 3.10880427 3.4844583
## 13 3.3487968 3.16136363 3.5362300
## 61 3.4246739 3.23728075 3.6120670
## 70 3.4863240 3.29849827 3.6741498
## 47 3.5100356 3.32193296 3.6981383
## 48 3.6238513 3.43357658 3.8141259
## 19 3.7044707 3.51183308 3.8971083
## 31 3.7044707 3.51183308 3.8971083
## 55 3.7376669 3.54386398 3.9314698
## 38 3.7708631 3.57578554 3.9659407
## 60 3.8182863 3.62120234 4.0153703
## 41 3.8609672 3.66189584 4.0600385
## 7 3.8799364 3.67992800 4.0799449
## 56 3.8941634 3.69343078 4.0948960
## 52 3.9463289 3.74278782 4.1498700
## 4 4.0269483 3.81861203 4.2352846
## 11 4.0554022 3.84524761 4.2655568
## 57 4.0696292 3.85854167 4.2807167
## 1 4.0933408 3.88066400 4.3060175
## 15 4.0980831 3.88508337 4.3110828
## 17 4.0980831 3.88508337 4.3110828
## 12 4.1549909 3.93798684 4.3719950
## 54 4.1929294 3.97312784 4.4127310
## 74 4.2118987 3.99066138 4.4331361
## 75 4.2403526 4.01691683 4.4637884
## 64 4.2688065 4.04311986 4.4944932
## 37 4.3114874 4.08232947 4.5406453
## 71 4.4347877 4.19500484 4.6745705
## 72 4.4442723 4.20363775 4.6849069
## 73 4.5486033 4.29830253 4.7989041
## 40 4.6576766 4.39673452 4.9186187
## 45 4.7240691 4.45640874 4.9917294
## 76 4.8046885 4.52864813 5.0807289
## 77 4.8568540 4.57527097 5.1384370
## 78 4.8947925 4.60912288 5.1804622
## 67 4.9943812 4.69777673 5.2909857
## 20 5.0844853 4.77774968 5.3912209
## 24 5.0892276 4.78195297 5.3965022
## 6 5.1366508 4.82395556 5.4493460
## 42 5.2267548 4.90361629 5.5498933
## 66 5.2836626 4.95383763 5.6134876
## 28 5.3405705 5.00399376 5.6771471
## 68 5.3642820 5.02487382 5.7036903
## 35 5.4970670 5.14161728 5.8525166
## 51 5.5018093 5.14578120 5.8578373
##
## $se.fit
## [1] 0.22645278 0.22149244 0.21778806 0.21317792 0.21103463 0.20889673
## [7] 0.18844570 0.17615869 0.15618895 0.14266650 0.14189755 0.13562129
## [13] 0.13488485 0.12820262 0.12820262 0.11447583 0.11256364 0.11237634
## [19] 0.11218977 0.11145084 0.11090446 0.11072384 0.11000901 0.11000901
## [25] 0.10630573 0.10630573 0.10552061 0.10287955 0.10141535 0.09950071
## [31] 0.09739698 0.09469165 0.09435816 0.09430617 0.09410842 0.09408833
## [37] 0.09430554 0.09444457 0.09553512 0.09672151 0.09672151 0.09730661
## [43] 0.09794661 0.09895400 0.09995183 0.10042235 0.10078595 0.10219605
## [49] 0.10460368 0.10551664 0.10598504 0.10678299 0.10694514 0.10694514
## [55] 0.10895569 0.11036031 0.11108118 0.11218500 0.11331515 0.11505803
## [61] 0.12039269 0.12082034 0.12567366 0.13101657 0.13438973 0.13859726
## [67] 0.14138017 0.14343204 0.14892232 0.15400904 0.15427968 0.15700131
## [73] 0.16224480 0.16560202 0.16899198 0.17041366 0.17846793 0.17875833
##
## $df
## [1] 76
##
## $residual.scale
## [1] 0.8306982
AnotherwayCI[2]
## $se.fit
## [1] 0.22645278 0.22149244 0.21778806 0.21317792 0.21103463 0.20889673
## [7] 0.18844570 0.17615869 0.15618895 0.14266650 0.14189755 0.13562129
## [13] 0.13488485 0.12820262 0.12820262 0.11447583 0.11256364 0.11237634
## [19] 0.11218977 0.11145084 0.11090446 0.11072384 0.11000901 0.11000901
## [25] 0.10630573 0.10630573 0.10552061 0.10287955 0.10141535 0.09950071
## [31] 0.09739698 0.09469165 0.09435816 0.09430617 0.09410842 0.09408833
## [37] 0.09430554 0.09444457 0.09553512 0.09672151 0.09672151 0.09730661
## [43] 0.09794661 0.09895400 0.09995183 0.10042235 0.10078595 0.10219605
## [49] 0.10460368 0.10551664 0.10598504 0.10678299 0.10694514 0.10694514
## [55] 0.10895569 0.11036031 0.11108118 0.11218500 0.11331515 0.11505803
## [61] 0.12039269 0.12082034 0.12567366 0.13101657 0.13438973 0.13859726
## [67] 0.14138017 0.14343204 0.14892232 0.15400904 0.15427968 0.15700131
## [73] 0.16224480 0.16560202 0.16899198 0.17041366 0.17846793 0.17875833
library(ggplot2)
ggplot(eelData, aes(Eelgrass, SDI))+
geom_point()+
geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'
plot(eelData$Eelgrass, eelData$SDI)+#Plot the observed data
lines(eelData$Eelgrass, pred, lwd=2, col="rosybrown")+
lines(eelData$Eelgrass, AnotherwayCI$fit[,2], lty=2, col="red")+ #Run the CIs$fit[,2] part on its own so you know what this code is doing.
lines(eelData$Eelgrass, AnotherwayCI$fit[,3], lty=2, col="red") #Do you know what this is doing?
## integer(0)
Now you will build a model that includes eelgrass shoot density and water temperature. First define your new variable. Alternately, you can include the “data” argument in your model call. Be sure to watch capitalization.
temp <- eelData$Temp
hist(temp, col="orange")
ggplot(eelData, aes(Temp, SDI, col=Temp))+
geom_point()
We will use the * to include the interaction between the temperature and eelgrass. If we did not want the interaction, we would just specify the model as SDI ~ Eelgrass + Temp for the main effects only.
reg2<-lm(SDI~Eelgrass * Temp, data=eelData)
summary(reg2)
##
## Call:
## lm(formula = SDI ~ Eelgrass * Temp, data = eelData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6811 -0.5891 -0.1116 0.5848 2.1845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7046046 1.1004413 1.549 0.126
## Eelgrass 0.0024158 0.0017009 1.420 0.160
## Temp -0.0910854 0.0827317 -1.101 0.274
## Eelgrass:Temp 0.0001789 0.0001279 1.399 0.166
##
## Residual standard error: 0.8292 on 74 degrees of freedom
## Multiple R-squared: 0.7256, Adjusted R-squared: 0.7145
## F-statistic: 65.22 on 3 and 74 DF, p-value: < 2.2e-16
anova(reg2)
## Analysis of Variance Table
##
## Response: SDI
## Df Sum Sq Mean Sq F value Pr(>F)
## Eelgrass 1 132.986 132.986 193.3990 <2e-16 ***
## Temp 1 0.215 0.215 0.3125 0.5778
## Eelgrass:Temp 1 1.345 1.345 1.9563 0.1661
## Residuals 74 50.884 0.688
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Identify the important parts of the model output. What do you conclude?
#Function comprised of a function that calcs. sums of squares, sends to optimizer to minimize 2 parameters in the regression model (slope and intercept)
#Guess starting points p=c(x,x); for slope, some small value but intercept appears to pass thru origin
set.seed(78)
p<-c(0,0.01)
minSS<-function(p){
beta0<-p[1]
beta1<-p[2]
Y<-beta0+beta1*eelgrass
SS<-sum((Y-sdi)^2)
return(SS)
}
#Our function is called MinSS
#We now pass our estimate to an optimizer (optim()) to reduce the SS
par.est<-optim(p,minSS) # With numerical recipes for optmizing
par.est
## $par
## [1] 0.508695919 0.004741658
##
## $value
## [1] 52.44452
##
## $counts
## function gradient
## 89 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
#Convergence=very good, it converged to a single value
#No. of iterations = 81
#To extract the values
params<-par.est$par
params
## [1] 0.508695919 0.004741658
Now we can plot our predictions from this function:
pred$SDI2<-params[1]+params[2]*eelgrass
## Warning in pred$SDI2 <- params[1] + params[2] * eelgrass: Coercing LHS to a
## list
plot(eelgrass, sdi)+
lines(eelgrass,pred$SDI2,lwd=2,col="red")
## integer(0)
#if you write lm into your consol it will tell you the function makeup
#view(lm) also works