hot <- read_csv("HOT_cyto_counts_edit.csv",
col_types = cols(...1 = col_skip()))
## New names:
## • `` -> `...1`
ggpairs(hot[, c(3, 5, 6, 8)])
#Convert date to date
hot <- hot %>%
mutate(
# Convert date to string, pad with zeroes if needed
date_char = sprintf("%06d", date),
date_fmt = mdy(date_char),
doy = yday(date_fmt) # extract day of year
)
ggplot(hot, aes(x = hbact)) +
geom_histogram(bins = 50) +
theme_clean()
gam_hbact <- gam(hbact ~ s(doy, press), data = hot)
summary(gam_hbact)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## hbact ~ s(doy, press)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.88022 0.01811 214.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doy,press) 22.85 26.88 123.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.732 Deviance explained = 73.7%
## GCV = 0.40709 Scale est. = 0.39911 n = 1217
#how should the response variable be modeled?
gam.check(gam_hbact)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 6 iterations.
## The RMS GCV score gradient at convergence was 2.095162e-06 .
## The Hessian was positive definite.
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 22.8 0.96 0.04 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# k' edf k-index p-value
#s(doy,press) 29.0 22.8 0.96 0.07 .
#Change basis
gam_hbact_k <- gam(hbact ~ s(doy, press, k= 60), data = hot)
gam.check(gam_hbact_k)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 4 iterations.
## The RMS GCV score gradient at convergence was 9.586221e-09 .
## The Hessian was positive definite.
## Model rank = 60 / 60
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 59.0 27.8 0.97 0.12
#Log transformation
gam_hbact_log <- gam(log(hbact + 1e-4) ~ s(doy, press), data = hot)
gam.check(gam_hbact_log)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 6 iterations.
## The RMS GCV score gradient at convergence was 2.195063e-07 .
## The Hessian was positive definite.
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 21.7 0.98 0.2
#Tweedie
gam_hbact_tweedie <- gam(hbact ~ s(doy, press), family = Tweedie(), data = hot)
## Error in Tweedie(): Only 1<p<=2 supported
#Gamma
gam_hbact_gamma <- gam(hbact ~ s(doy, press), family = Gamma(), data = hot)
gam.check(gam_hbact_gamma)
##
## Method: GCV Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [7.283391e-08,7.283391e-08]
## (score 0.02959145 & scale 0.02902258).
## Hessian positive definite, eigenvalue range [0.0001475623,0.0001475623].
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 23.7 0.98 0.25
#Separate
gam_hbact1 <- gam(hbact ~ s(doy) + s(press), data = hot)
gam.check(gam_hbact1)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 8 iterations.
## The RMS GCV score gradient at convergence was 2.191183e-07 .
## The Hessian was positive definite.
## Model rank = 19 / 19
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy) 9.00 7.79 0.41 <2e-16 ***
## s(press) 9.00 4.51 0.99 0.35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#k' edf k-index p-value
#s(doy) 9.00 7.79 0.41 <2e-16 ***
#s(press) 9.00 4.51 0.99 0.31
#by function
gam_hbact_by = gam(hbact ~ doy + s(press, by = doy), data= hot)
gam.check(gam_hbact_by)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 6 iterations.
## The RMS GCV score gradient at convergence was 2.783807e-06 .
## The Hessian was positive definite.
## Model rank = 11 / 12
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(press):doy 10.0 4.8 0.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Model comparison
AICc(gam_hbact) #2362.565
## [1] 2362.565
AICc(gam_hbact_k) #2366.868
## [1] 2366.868
AICc(gam_hbact_log) #-819.6515
## [1] -819.6515
AICc(gam_hbact_gamma) #2340.832
## [1] 2340.832
AICc(gam_hbact1) #2458.142
## [1] 2458.142
AICc(gam_hbact_by) #2827.85
## [1] 2827.85
#Creating an image:
plot(gam_hbact_log, scheme = 1, col = 'skyblue')
#Prediction grid image:
doy_seq <- seq(min(hot$doy, na.rm = TRUE), max(hot$doy, na.rm = TRUE), length.out = 100)
press_seq <- seq(min(hot$press, na.rm = TRUE), max(hot$press, na.rm = TRUE), length.out = 100)
grid <- expand.grid(doy = doy_seq, press = press_seq)
grid$pred_hbact <- predict(gam_hbact_log, newdata = grid, type = "response")
zz <- matrix(grid$pred_hbact, nrow = length(press_seq), ncol = length(doy_seq))
library(fields)
## Warning: package 'fields' was built under R version 4.4.3
## Loading required package: spam
## Warning: package 'spam' was built under R version 4.4.3
## Spam version 2.11-1 (2025-01-20) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following object is masked from 'package:arm':
##
## display
## The following object is masked from 'package:Matrix':
##
## det
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: viridisLite
##
## Try help(fields) to get started.
image.plot(doy_seq, press_seq, zz[nrow(zz):1, ],
col = heat.colors(100),
xlab = "Day of Year", ylab = "Depth (dbar)",
main = "Predicted Heterotrophic Bacteria Abundance")
contour(doy_seq, press_seq, zz[nrow(zz):1, ], add = TRUE, col = "black")
ggplot(hot, aes(x = pro)) +
geom_histogram(bins = 50) +
theme_clean()
gam_pro <- gam(pro ~ s(doy, press), data = hot)
summary(gam_pro)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## pro ~ s(doy, press)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.33514 0.01069 124.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doy,press) 27.5 28.86 221.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.84 Deviance explained = 84.4%
## GCV = 0.14245 Scale est. = 0.13911 n = 1217
#how should the response variable be modeled?
gam.check(gam_pro)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 8 iterations.
## The RMS GCV score gradient at convergence was 2.946778e-07 .
## The Hessian was positive definite.
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 27.5 1.06 0.98
#k' edf k-index p-value
#s(doy,press) 29.0 27.5 1.06 0.98
#Change basis
gam_pro_k <- gam(pro ~ s(doy, press, k= 60), data = hot)
gam.check(gam_pro_k)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 5 iterations.
## The RMS GCV score gradient at convergence was 1.052689e-05 .
## The Hessian was positive definite.
## Model rank = 60 / 60
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 59.0 40.3 1.07 1
#Log transformation
gam_pro_log <- gam(log(pro + 1e-4) ~ s(doy, press), data = hot)
summary(gam_pro_log)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(pro + 1e-04) ~ s(doy, press)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.36597 0.01545 -23.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doy,press) 24.84 27.99 334.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.885 Deviance explained = 88.8%
## GCV = 0.29688 Scale est. = 0.29057 n = 1217
#how should the response variable be modeled?
gam.check(gam_pro_log)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 7 iterations.
## The RMS GCV score gradient at convergence was 1.953061e-05 .
## The Hessian was positive definite.
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 24.8 1.04 0.90
#Tweedie
gam_pro_tweedie <- gam(pro ~ s(doy, press), family = Tweedie(), data = hot)
## Error in Tweedie(): Only 1<p<=2 supported
#Gamma
gam_pro_tweedie <- gam(pro ~ s(doy, press), family = Gamma(), data = hot)
## Error in eval(family$initialize): non-positive values not allowed for the 'Gamma' family
#Separate
gam_pro1 <- gam(pro ~ s(doy) + s(press), data = hot)
summary(gam_pro1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## pro ~ s(doy) + s(press)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.33514 0.01133 117.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doy) 5.228 6.339 8.295 <2e-16 ***
## s(press) 6.112 6.872 803.636 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.821 Deviance explained = 82.3%
## GCV = 0.1577 Scale est. = 0.1561 n = 1217
#how should the response variable be modeled?
gam.check(gam_pro1)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 8 iterations.
## The RMS GCV score gradient at convergence was 3.95273e-06 .
## The Hessian was positive definite.
## Model rank = 19 / 19
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy) 9.00 5.23 0.46 <2e-16 ***
## s(press) 9.00 6.11 1.04 0.93
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#k' edf k-index p-value
#s(doy) 9.00 5.23 0.46 <2e-16 ***
#s(press) 9.00 6.11 1.04 0.92
#by function
gam_pro_by = gam(pro ~ doy + s(press, by = doy), data= hot)
gam.check(gam_pro_by)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 5 iterations.
## The RMS GCV score gradient at convergence was 1.005065e-05 .
## The Hessian was positive definite.
## Model rank = 11 / 12
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(press):doy 10.00 6.15 0.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Model comparison
AICc(gam_pro) #1084.864
## [1] 1084.864
AICc(gam_pro1) #1208.013
## [1] 1208.013
AICc(gam_pro_log) #1978.419
## [1] 1978.419
AICc(gam_pro_by) #1990.762
## [1] 1990.762
AICc(gam_pro_k) #1092.568
## [1] 1092.568
#Creating an image:
plot(gam_pro, scheme = 1, col = 'skyblue')
#Prediction grid image:
doy_seq <- seq(min(hot$doy, na.rm = TRUE), max(hot$doy, na.rm = TRUE), length.out = 100)
press_seq <- seq(min(hot$press, na.rm = TRUE), max(hot$press, na.rm = TRUE), length.out = 100)
grid <- expand.grid(doy = doy_seq, press = press_seq)
grid$pred_pro <- predict(gam_pro, newdata = grid, type = "response")
zz <- matrix(grid$pred_pro, nrow = length(press_seq), ncol = length(doy_seq))
image.plot(doy_seq, press_seq, zz[nrow(zz):1, ],
col = heat.colors(100),
xlab = "Day of Year", ylab = "Depth (dbar)",
main = "Predicted Heterotrophic Bacteria Abundance")
contour(doy_seq, press_seq, zz[nrow(zz):1, ], add = TRUE, col = "black")
ggplot(hot, aes(x = picoeuk)) +
geom_histogram(bins = 50) +
theme_clean()
#skewed right
gam_picoeuk <- gam(picoeuk ~ s(doy, press), data = hot)
gam.check(gam_picoeuk)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 4 iterations.
## The RMS GCV score gradient at convergence was 7.945724e-08 .
## The Hessian was positive definite.
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 23.5 1.04 0.91
#k' edf k-index p-value
#s(doy,press) 29.0 23.5 1.04 0.9
###Skewed residuals
#Log transformation
gam_picoeuk_log <- gam(log(picoeuk + 1e-4) ~ s(doy, press), data = hot)
gam.check(gam_picoeuk_log)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 7 iterations.
## The RMS GCV score gradient at convergence was 6.684561e-06 .
## The Hessian was positive definite.
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 25.5 1.04 0.92
#squared
gam_picoeuk_sq <- gam((picoeuk^2) ~ s(doy, press), data = hot)
gam.check(gam_picoeuk_sq)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 4 iterations.
## The RMS GCV score gradient at convergence was 2.114108e-11 .
## The Hessian was positive definite.
## Model rank = 30 / 30
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press) 29.0 21.6 1.1 0.92
#Separate
gam_picoeuk1 <- gam(picoeuk ~ s(doy) + s(press), data = hot)
gam.check(gam_picoeuk1)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 4 iterations.
## The RMS GCV score gradient at convergence was 3.882605e-08 .
## The Hessian was positive definite.
## Model rank = 19 / 19
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy) 9.00 8.43 0.68 <2e-16 ***
## s(press) 9.00 6.07 1.01 0.58
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#by function
gam_picoeuk_by = gam(picoeuk ~ doy + s(press, by = doy), data= hot)
gam.check(gam_picoeuk_by)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 4 iterations.
## The RMS GCV score gradient at convergence was 1.019706e-07 .
## The Hessian was positive definite.
## Model rank = 11 / 12
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(press):doy 10.0 6.3 0.97 0.14
#Model comparison
AICc(gam_picoeuk) #-9132.535
## [1] -9132.535
AICc(gam_picoeuk_sq) #-16847.58
## [1] -16847.58
AICc(gam_picoeuk_log) #2244.591
## [1] 2244.591
AICc(gam_picoeuk_by) #-8942.759
## [1] -8942.759
AICc(gam_picoeuk1) #-9114.204
## [1] -9114.204
#Creating an image:
plot(gam_picoeuk_sq, scheme = 1, col = 'skyblue')
#Prediction grid image:
doy_seq <- seq(min(hot$doy, na.rm = TRUE), max(hot$doy, na.rm = TRUE), length.out = 100)
press_seq <- seq(min(hot$press, na.rm = TRUE), max(hot$press, na.rm = TRUE), length.out = 100)
grid <- expand.grid(doy = doy_seq, press = press_seq)
grid$pred_picoeuk <- predict(gam_picoeuk_sq, newdata = grid, type = "response")
zz <- matrix(grid$pred_picoeuk, nrow = length(press_seq), ncol = length(doy_seq))
image.plot(doy_seq, press_seq, zz[nrow(zz):1, ],
col = heat.colors(100),
xlab = "Day of Year", ylab = "Depth (dbar)",
main = "Predicted Heterotrophic Bacteria Abundance")
contour(doy_seq, press_seq, zz[nrow(zz):1, ], add = TRUE, col = "black")
#####What are your interpretations of the results so far?
### For "hbact" (heterotrophic bacteria concentration), the highest abundance was at lower depths, and at time approx. 100 days.
### For "pro" (Prochlorococcus concentration), the highest abundance was also at lower depths, and at time approx. 100 days. However, from 50 -200 days there was high abundance overall.
### For "picoeuk" (photosynthetic picoeukaryote concentration), Highest abundavce was at around 100m at times 0-50 days and around 300 days.
#####(a) Do the different kinds of microbes have different mean
abundances?
#####(b) Do the different kinds of microbes have different average depth
distributions (i.e., averaging over time)? #####(c) Do the different
kinds of microbes have different average seasonal dynamics (i.e.,
averaging over depths)?
#####To fit GAM(s) including all three groups simultaneously you will need to convert the data to ‘long’ format, where there is a column that contains all the concentrations of all three types of microbes, and a second column that codes which microbe was counted in that row, as well as additional columns for the other model predictors. You can convert to long format by hand using a spreadsheet, or you can use a helpful function called pivot_longer() in the package ‘tidyr’. Now that you have fit models to test questions (a)-(c), make appropriate plots and perform appropriate hypothesis tests. How do you intepret the results?
#Put data into the long format:
hot_long <- hot %>%
pivot_longer(cols = c(pro, hbact, picoeuk),
names_to = "group",
values_to = "abundance")
hot_long$group <- as.factor(hot_long$group)
#Use a GAM including all groups simultaneously to simultaneously
gam_all <- gam(abundance ~ group + s(doy, press, by = group), data = hot_long, family = gaussian())
summary(gam_all)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## abundance ~ group + s(doy, press, by = group)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.88022 0.01210 320.7 <2e-16 ***
## grouppicoeuk -3.87028 0.01711 -226.2 <2e-16 ***
## grouppro -2.54508 0.01711 -148.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doy,press):grouphbact 25.8 28.39 262.887 <2e-16 ***
## s(doy,press):grouppicoeuk 2.0 2.00 0.013 0.987
## s(doy,press):grouppro 27.1 28.78 173.502 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.947 Deviance explained = 94.8%
## GCV = 0.181 Scale est. = 0.17813 n = 3651
gam.check(gam_all) #edf = 27.8
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 9 iterations.
## The RMS GCV score gradient at convergence was 5.999858e-08 .
## The Hessian was positive definite.
## Model rank = 90 / 90
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press):grouphbact 29.0 25.8 0.9 <2e-16 ***
## s(doy,press):grouppicoeuk 29.0 2.0 0.9 <2e-16 ***
## s(doy,press):grouppro 29.0 27.1 0.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AICc(gam_all) #4123.715
## [1] 4123.715
#Log
gam_all_log <- gam(log(abundance + 1e-4) ~ group + s(doy, press, by = group), data = hot_long, family = gaussian())
gam.check(gam_all_log)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 8 iterations.
## The RMS GCV score gradient at convergence was 4.363269e-07 .
## The Hessian was positive definite.
## Model rank = 90 / 90
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press):grouphbact 29.0 12.4 0.78 <2e-16 ***
## s(doy,press):grouppicoeuk 29.0 26.6 0.78 <2e-16 ***
## s(doy,press):grouppro 29.0 25.6 0.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AICc(gam_all_log) #5012.474
## [1] 5012.474
#Adjust k
gam_all_k100 <- gam(abundance ~ group + s(doy, press, by = group, k = 100), data = hot_long,family = gaussian())
gam.check(gam_all_k100) #edf = 57.9
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 10 iterations.
## The RMS GCV score gradient at convergence was 9.640121e-08 .
## The Hessian was positive definite.
## Model rank = 300 / 300
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doy,press):grouphbact 99.0 57.9 0.9 <2e-16 ***
## s(doy,press):grouppicoeuk 99.0 2.0 0.9 <2e-16 ***
## s(doy,press):grouppro 99.0 42.2 0.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AICc(gam_all_k100) #4116.54
## [1] 4116.54
##USE adjust K model
#ANSWERING QUESTIONS:
## (a) Do the different kinds of microbes have different mean abundances?
#Based on the coefficient results, the microbes have different mean abundances.
#summary(gam_all_k100)
#Parametric coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 3.88022 0.01200 323.3 <2e-16 ***
#grouppicoeuk -3.87028 0.01697 -228.0 <2e-16 ***
#grouppro -2.54508 0.01697 -149.9 <2e-16 ***
## (b) Do the different kinds of microbes have different average depth distributions (i.e., averaging over time)?
press_seq <- seq(min(hot$press), max(hot$press), length.out = 100)
doy_mean <- mean(hot$doy, na.rm = TRUE)
grid_depth <- expand.grid(
doy = doy_mean,
press = press_seq,
group = levels(hot_long$group)
)
grid_depth$pred <- predict(gam_all_k100, newdata = grid_depth)
ggplot(grid_depth, aes(x = press, y = pred, color = group)) +
geom_line(size = 1.2) +
theme_minimal() +
labs(x = "Depth (dbar)", y = "Predicted Abundance")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#hbact has the highest predicted abundance, followed by picoeuk, with pro having the lowest. hbact and picoeuk have predicted abundance decline with depth. I am wondering if pro is at 0 becuase depth is a poor predictor of abundance for this variable.
## (c) Do the different kinds of microbes have different average seasonal dynamics (i.e., averaging over depths)?
doy_seq <- seq(min(hot$doy), max(hot$doy), length.out = 100)
press_mean <- mean(hot$press, na.rm = TRUE)
grid_season <- expand.grid(
doy = doy_seq,
press = press_mean,
group = levels(hot_long$group)
)
grid_season$pred <- predict(gam_all_k100, newdata = grid_season)
ggplot(grid_season, aes(x = doy, y = pred, color = group)) +
geom_line(size = 1.2) +
theme_minimal() +
labs(x = "Day of Year", y = "Predicted Abundance")
### I think based on this graph we can see the groups do have different predicted abundance based on time of year.