library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#library("tidyverse")
library("broom")
RIKZ <- read.csv("RIKZ.csv")
RIKZ$Richness <- rowSums(RIKZ[,2:76] > 0)
RIKZ<- RIKZ %>% select('Sample','week','angle1','angle2',
'exposure','salinity','temperature','NAP','penetrability','grainsize',
'humus','chalk','sorting1','Beach','Richness')
head(RIKZ)
## Sample week angle1 angle2 exposure salinity temperature NAP penetrability
## 1 1 1 32 96 10 29.4 17.5 0.045 253.9
## 2 2 1 62 96 10 29.4 17.5 -1.036 226.9
## 3 3 1 65 96 10 29.4 17.5 -1.336 237.1
## 4 4 1 55 96 10 29.4 17.5 0.616 248.6
## 5 5 1 23 96 10 29.4 17.5 -0.684 251.9
## 6 6 1 129 89 8 29.6 20.8 1.190 250.1
## grainsize humus chalk sorting1 Beach Richness
## 1 222.5 0.05 2.05 69.830 1 11
## 2 200.0 0.30 2.50 59.000 1 11
## 3 194.5 0.10 3.45 59.220 1 14
## 4 221.0 0.15 1.60 67.750 1 12
## 5 202.0 0.05 2.45 57.760 1 10
## 6 192.5 0.10 2.50 53.075 2 9
par(mar = c(4.5,4.5,0.5,0.5), cex.lab = 1.5, cex.axis = 1.5) #Making the graph margin and font
plot(RIKZ$NAP,RIKZ$Richness, ylab = "Richness", xlab ="NAP")
ggplot(data=RIKZ, aes(x=NAP, y=Richness))+
geom_point()+
theme_classic(base_size = 18)
# make "Beach" into a factor otherwise treated as continuous
RIKZ$fBeach <- factor(RIKZ$Beach)
Richness_NAP <- ggplot(RIKZ, aes(x = NAP, y = Richness)) +
geom_point(aes(color=fBeach))+geom_smooth(method='lm')
Richness_NAP
## `geom_smooth()` using formula = 'y ~ x'
readiness assessment
Richness = 7.6306 - 2.8362*NAP
Substituting NAP = -0.5 into this equation, we get:
Richness = 7.6306 - 2.8362*(-0.5) = 9.2235
#1
m1<- lm(Richness ~ NAP, data = RIKZ)
summary(m1)
##
## Call:
## lm(formula = Richness ~ NAP, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9976 -2.6941 -0.8107 1.1087 13.9428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6306 0.6534 11.677 6.36e-15 ***
## NAP -2.8362 0.6266 -4.526 4.69e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.133 on 43 degrees of freedom
## Multiple R-squared: 0.3227, Adjusted R-squared: 0.307
## F-statistic: 20.49 on 1 and 43 DF, p-value: 4.694e-05
pred_richness <- predict(m1, newdata = data.frame(NAP = 0))
pred_richness
## 1
## 7.630552
mean_pred_richness <- mean(pred_richness)
mean_pred_richness
## [1] 7.630552
se_pred <- sqrt(var(pred_richness))
se_pred
## [1] NA
#Example linear regression model
RIKZ_model.1 <- lm(Richness ~ NAP, data = RIKZ)
summary(RIKZ_model.1)
##
## Call:
## lm(formula = Richness ~ NAP, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9976 -2.6941 -0.8107 1.1087 13.9428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6306 0.6534 11.677 6.36e-15 ***
## NAP -2.8362 0.6266 -4.526 4.69e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.133 on 43 degrees of freedom
## Multiple R-squared: 0.3227, Adjusted R-squared: 0.307
## F-statistic: 20.49 on 1 and 43 DF, p-value: 4.694e-05
Indicating strong evidence of significant negative association between NAP and Richness.
Now checking assumptions
test_RIKZ <- augment(RIKZ_model.1, data=RIKZ)
ggplot(test_RIKZ, aes(x = .fitted, y = .resid)) +
geom_point(aes(color=fBeach)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# this is lifted out of base R qqline()
y <- quantile(test_RIKZ$.resid, c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
ggplot(test_RIKZ, aes(sample = .resid)) +
stat_qq() +
geom_abline(slope = slope, intercept = int)
ggplot(test_RIKZ, aes(x = .fitted, y = sqrt(abs(.std.resid)))) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0.823)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(test_RIKZ, aes(.hat, .std.resid)) +
geom_vline(size = 2, colour = "white", xintercept = 0) +
geom_hline(size = 2, colour = "white", yintercept = 0) +
geom_point(aes(size = .cooksd)) + geom_smooth(se = FALSE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
old.par = par(mfrow=c(2,2))
plot(RIKZ_model.1)
par(old.par)
The residuals are scattering above and below the fitted line. Also the smooth line is not flat and close to 0. Therefore the varience is not constant.
The Q-Q plot: If the residuals are following a normal distribution then the points fall along the straight line, But here in the plot, it does not seems tosupport that the residuals are normally distributed.
The Scale-Location plot is similar to the Residuals vs. Fitted plot, but can make it easier to spot changes in variance. The smooth line here is not close to 1, and not a flat line.
This plot shows contours of another statistic, the “Cook’s distance”, which is a combination of the size of the residual and the leverage. The rule of thumb is to worry about points with Cook’s distance > 2.
richness_week<-ggplot(data=RIKZ, aes(x = week, y = Richness))+
geom_point()+
theme_classic(base_size = 18)
richness_week
Do you think the timing of the sample collection matters? Test your intuition with a linear model
The Plot shows most samples abundance measures for single species found on 1st week.So time fo rsample collection matters. Then the linear model:
RIKZ_model.2 = lm(Richness ~ week, data = RIKZ)
summary(RIKZ_model.2)
##
## Call:
## lm(formula = Richness ~ week, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1583 -3.1583 -0.6167 2.3000 18.9250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.2417 1.9085 5.366 3.03e-06 ***
## week -1.5417 0.7584 -2.033 0.0483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.796 on 43 degrees of freedom
## Multiple R-squared: 0.08768, Adjusted R-squared: 0.06646
## F-statistic: 4.132 on 1 and 43 DF, p-value: 0.04827
Therefore the linear model for richness with week shows significant negative relationship estimate of 10.24(1.9) and slope: (-1.54(0.76)) (P=0.045)
Since the residual plot shows that the point are not randomly assigned to the zero. In the Q-Q plot, points fall roughly along the straight line. Scale location plot met the assumptions. Roughly have normal distribution.
old.par = par(mfrow=c(2,2))
plot(RIKZ_model.2)
par(old.par)
Notice that there are only 4 weeks, with multiple observations per week. Another way to analyse this type of variable is to treat it as a categorical variable, and estimate a different parameter for each week. We can do this quickly like this:
RIKZ_model.3 <- lm(Richness~factor(week), data=RIKZ)
summary(RIKZ_model.3)
##
## Call:
## lm(formula = Richness ~ factor(week), data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.400 -1.667 -0.400 1.333 14.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.400 1.183 10.486 3.58e-13 ***
## factor(week)2 -8.733 1.527 -5.721 1.08e-06 ***
## factor(week)3 -7.200 1.527 -4.716 2.78e-05 ***
## factor(week)4 -4.000 2.048 -1.953 0.0577 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.739 on 41 degrees of freedom
## Multiple R-squared: 0.4712, Adjusted R-squared: 0.4326
## F-statistic: 12.18 on 3 and 41 DF, p-value: 7.77e-06
old.par = par(mfrow=c(2,2))
plot(RIKZ_model.3)
par(old.par)
However, this ends up labeling the parameters as factor(week) which is a bit odd, and makes it more difficult to do predictions from the model. A better approach is to change the variable in the dataframe:
RIKZ$fweek <- factor(RIKZ$week)
RIKZ_model.3 <- lm(Richness ~ fweek, data = RIKZ)
old.par = par(mfrow=c(2,2))
plot(RIKZ_model.3)
par(old.par)
Similar to the previous plot
summary(RIKZ_model.1)
##
## Call:
## lm(formula = Richness ~ NAP, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9976 -2.6941 -0.8107 1.1087 13.9428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6306 0.6534 11.677 6.36e-15 ***
## NAP -2.8362 0.6266 -4.526 4.69e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.133 on 43 degrees of freedom
## Multiple R-squared: 0.3227, Adjusted R-squared: 0.307
## F-statistic: 20.49 on 1 and 43 DF, p-value: 4.694e-05
summary(RIKZ_model.2)
##
## Call:
## lm(formula = Richness ~ week, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1583 -3.1583 -0.6167 2.3000 18.9250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.2417 1.9085 5.366 3.03e-06 ***
## week -1.5417 0.7584 -2.033 0.0483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.796 on 43 degrees of freedom
## Multiple R-squared: 0.08768, Adjusted R-squared: 0.06646
## F-statistic: 4.132 on 1 and 43 DF, p-value: 0.04827
summary(RIKZ_model.3)
##
## Call:
## lm(formula = Richness ~ fweek, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.400 -1.667 -0.400 1.333 14.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.400 1.183 10.486 3.58e-13 ***
## fweek2 -8.733 1.527 -5.721 1.08e-06 ***
## fweek3 -7.200 1.527 -4.716 2.78e-05 ***
## fweek4 -4.000 2.048 -1.953 0.0577 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.739 on 41 degrees of freedom
## Multiple R-squared: 0.4712, Adjusted R-squared: 0.4326
## F-statistic: 12.18 on 3 and 41 DF, p-value: 7.77e-06
model.list = list(
Richness ~ NAP,
+ Richness~week,
+ Richness ~ factor(week))
fitted.list = lapply(model.list,lm,data=RIKZ)
my.AIC = sapply(fitted.list,AIC)
k = sapply(fitted.list,function(mm)length(coef(mm)))
deltas = my.AIC-min(my.AIC)
weights = exp(-deltas/2)
weights = weights / sum(weights)
write.table(cbind(as.character(model.list),my.AIC,k,deltas,weights),"clipboard")
my.AICC = sapply(1:3,function(i)AIC(fitted.list[[i]],k=2*(45/(45-k[i]-1))))
cbind(my.AIC,k,deltas,weights)
## my.AIC k deltas weights
## [1,] 259.3616 2 7.140903 2.737187e-02
## [2,] 272.7669 2 20.546212 3.360304e-05
## [3,] 252.2207 4 0.000000 9.725945e-01
By looking at adjusted R squared value, Model 3 has 0.43 which is highest. The model with the lower AIC value is generally considered to be a better predictor of the response variable. In this case, model 3 has a lower AIC value (252.2207) compared to model 2 (272.7669). Therefore, model 3 is a better predictor of Species Richness.
Now we would like to find out what the estimated species richness is in each week. We could do the calculations manually, but there is a better way that uses the fitted model object. First we need a data.frame that contains one row for each estimate, and one column for each covariate in the model that we want to predict from. We can call this data.frame anything, but I prefer to call it nd, for “new data”. In our case, we are going to have 4 rows, one for each week, as a factor called fweek:
nd <- data.frame(fweek=factor(levels(RIKZ$fweek)))
# There is a lot going on in that one line of code. The best way to break this down is to run each piece from the “inside” out, so:
RIKZ$fweek # shows us the original data
## [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 3 3 3 3 3 2 2 2 2 2 3 3 3
## [39] 3 3 3 3 3 3 3
## Levels: 1 2 3 4
levels(RIKZ$fweek) # gives a vector of the unique levels
## [1] "1" "2" "3" "4"
factor(levels(RIKZ$fweek)) # creates a new “factor”
## [1] 1 2 3 4
## Levels: 1 2 3 4
# create a data.frame with one column called fweek
data.frame(fweek=factor(levels(RIKZ$fweek)))
## fweek
## 1 1
## 2 2
## 3 3
## 4 4
# and save it in an object named ‘nd’
nd <- data.frame(fweek=factor(levels(RIKZ$fweek)))
# nd <- data.frame(fweek = as.vector(RIKZ$fweek))
# predicted.Richness <- augment(RIKZ_model.3, newdata = nd, se_fit = TRUE)
#
#
# # have to set color explicitly otherwise looks for fbeach in predicted.Richness
# richness_week + geom_point(aes(x = fweek, y = .fitted), color = "red", data = predicted.Richness)
That will fail with an error about a discrete value being supplied to a continuous scale. The problem is that we made the object richness_week with week not fweek. So let’s redo that
predicted.Richness <- augment(RIKZ_model.3, newdata=nd, se_fit = TRUE)
# redo the plot object
richness_week <- ggplot(RIKZ, aes(x = fweek, y = Richness, color = fBeach)) +
geom_point()
# have to set color explicitly otherwise looks for fbeach in predicted.Richness
richness_week + geom_point(aes(x = fweek, y = .fitted), color = "red", size = 2, data = predicted.Richness) +
geom_errorbar(aes(x = fweek,
ymin = .fitted - 1.96*.se.fit,
ymax = .fitted + 1.96*.se.fit), color = "red", data = predicted.Richness, inherit.aes = FALSE)
RIKZ_model.4 = lm(Richness ~ NAP + fweek, data = RIKZ)
summary(RIKZ_model.4)
##
## Call:
## lm(formula = Richness ~ NAP + fweek, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0897 -1.2301 -0.3601 0.6391 12.0963
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.1688 0.9564 12.723 1.21e-15 ***
## NAP -2.2601 0.4730 -4.779 2.39e-05 ***
## fweek2 -7.4313 1.2629 -5.884 6.88e-07 ***
## fweek3 -5.9837 1.2592 -4.752 2.60e-05 ***
## fweek4 -2.4019 1.6879 -1.423 0.162
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.021 on 40 degrees of freedom
## Multiple R-squared: 0.6634, Adjusted R-squared: 0.6297
## F-statistic: 19.71 on 4 and 40 DF, p-value: 4.971e-09
The model 4 includes the NAP and Fweek, Estimate for NAP slightly increased.In model 3, 4th week was marginally significant for response, however, model 4 reveals, week 4 is not nignificant, Overall, the adjusted R squared increased in this 4th model (0.63).
Making a plot with a more complex model is a bit involved. The problem is that we now have to make predictions with both NAP and fweek. So we are going to get 4 lines on a plot of Richness vs. NAP; one for each week.
nd = expand.grid(NAP=seq(-1.5,2.5,0.1), fweek=factor(levels(RIKZ$fweek)))
predicted.Richness.4 <- augment(RIKZ_model.4, newdata = nd)
# reuse ggplot object from above. fix color to black or geom_line looks for fBeach. x aesthetic inherited from Richness_NAP
Richness_NAP + geom_line(aes(y = .fitted, group = fweek, linetype = fweek), color = "black", data = predicted.Richness.4)
## `geom_smooth()` using formula = 'y ~ x'
It is important that the variable fweek in nd has the same levels as the variable that was used to fit the model; specifying the levels is the best way to ensure this. Notice that the lines are all parallel – that is because we fitted a model with no interaction between the continuous variable NAP and the factor variable fweek. Effectively we have 4 intercepts and one slope.
Now fit a model that includes both NAP, fweek, and their interaction:
RIKZ_model.5 = lm(Richness~NAP*fweek,data=RIKZ)
summary(RIKZ_model.5)
##
## Call:
## lm(formula = Richness ~ NAP * fweek, data = RIKZ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3022 -1.0157 -0.2946 0.3383 7.9013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.21126 0.79062 15.445 < 2e-16 ***
## NAP -1.84501 0.88491 -2.085 0.04403 *
## fweek2 -7.84593 1.07328 -7.310 1.10e-08 ***
## fweek3 -6.17718 1.04937 -5.887 8.94e-07 ***
## fweek4 1.57157 1.62779 0.965 0.34058
## NAP:fweek2 0.37044 1.13928 0.325 0.74690
## NAP:fweek3 -0.06858 1.06033 -0.065 0.94878
## NAP:fweek4 -7.05516 1.71613 -4.111 0.00021 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.484 on 37 degrees of freedom
## Multiple R-squared: 0.7895, Adjusted R-squared: 0.7497
## F-statistic: 19.82 on 7 and 37 DF, p-value: 9.602e-11
Look carefully to see the difference in the code – it is a single character! However that single change adds 3 parameters to our model. Now we have 8 parameters, essentially an intercept and slope for each week.
Along with the conclusion from model 4, the linear model (model 5) has preditor variable as NAP, week as factors and their interaction. The model summary shows, there as only significance found for interaction os NAp and week4.
The three interaction coefficients can be thought of as a change in the slope of NAP for weeks 2, 3 and 4. So we expect the line for week 2 to be shallower (less negative), while weeks 3 and 4 should be steeper (more negative). Confirm this interpretation by repeating the figure above using model 5 to get the predictions.
nd = expand.grid(NAP=seq(-1.5,2.5,0.1), fweek=factor(levels(RIKZ$fweek)))
predicted.Richness.5 <- augment(RIKZ_model.5, newdata = nd)
# reuse ggplot object from above. fix color to black or geom_line looks for fBeach. x aesthetic inherited from Richness_NAP
Richness_NAP + geom_line(aes(y = .fitted, group = fweek, linetype = fweek), color = "black", data = predicted.Richness.5)
## `geom_smooth()` using formula = 'y ~ x'