Load data, make season a factor and change Lake Name to Lake
suppressMessages(library(readxl))
suppressMessages(library(dplyr))
suppressMessages(library(lubridate))
us=read_excel('MU combined Statewide and Volunteer Data.xlsx')
us=us%>%mutate(Season=factor(Season),Date=as.Date(paste(Year,Month,Day,sep='/')))%>%select(-`MU#`)

Data exploration

Check missing data

There is a considerable amount of missing values,particularly for NTU and CHL. Factor Season is very unbalanced with most obs. recoirded in season 2 qand only 4 obs. in season 4. Sampling effort increased with time.

barplot(with(us,table(Season)),col=rainbow(7),xlab='Season',ylab='Number of obs.')

us=us%>%filter(Month<13,!is.na(Season),Season!=4)%>%select(-NTU)
with(us,barplot(table(factor(Year)),col=rainbow(30),xlab='Year',ylab='no. of obs.',main='Sampling effort through time'))

par(las=2);par(mar=c(5,8,4,2))
with(us,barplot(table(Lake),col=rainbow(30),xlab='Year',ylab='no. of obs.',main='Sampling effort across lakes'),cex.names=0.5,horiz=TRUE)

with(us,barplot(table(Month),col=rainbow(30),xlab='Year',ylab='no. of obs.',main='Sampling effort per month'))

Variables distribution and correlation and co-linearity

Pair plots of untranformed data

load('lakes.RData')
pairs(na.omit(us[,6:11]),upper.panel = panel.cor,lower.panel = panel.smooth,cex.labels = 1.5,gap = 0.5,cex=0.7,diag.panel = panel.hist)

Pair plots of transformed data

pairs(na.omit(log(us[,6:11])),upper.panel = panel.cor,lower.panel = panel.smooth,cex.labels = 1.5,gap = 0.5,cex=0.7,diag.panel = panel.hist)

Variance inflation factor.

us1=us%>%mutate(logTN=log(TN),logTP=log(TP),logCHL=log(CHL),logNVSS=log(NVSS),logVSS=log(VSS))
corvif(us1[,c(13,14,16,17)])
## 
## 
## Variance inflation factors
## 
##             GVIF
## logTN   3.134206
## logTP   5.056302
## logNVSS 1.863074
## logVSS  2.602770
corvif(us1[,c(13,14,16)])
## 
## 
## Variance inflation factors
## 
##             GVIF
## logTN   3.044174
## logTP   3.995180
## logNVSS 1.857591

Relationship between predictors and CHL

suppressMessages(library(ggplot2))
suppressMessages(library(tidyr))
us2=us%>%gather('key','value',-c(Season,Year,CHL,Lake,Month,Day,Date))
ggplot(us2,aes(x=value,y=CHL))+geom_point(alpha=.3)+geom_smooth(method = lm)+theme_bw()+scale_y_log10()+scale_x_log10()+facet_grid(Season~key,scales = 'free')

Fit a linear model with the log transformed data and Season and reduce model using a stepwise selection based on AIC interaction.

## Start:  AIC=-6086.35
## logCHL ~ (scale(logTN) + scale(logTP) + scale(logNVSS)) * Season
## 
##                         Df Sum of Sq    RSS     AIC
## <none>                               5455.0 -6086.4
## - scale(logTN):Season    2    12.200 5467.2 -6067.9
## - scale(logTP):Season    2    14.927 5469.9 -6062.9
## - scale(logNVSS):Season  2    24.247 5479.3 -6045.9
## 
## Call:
## lm(formula = logCHL ~ (scale(logTN) + scale(logTP) + scale(logNVSS)) * 
##     Season, data = us1)
## 
## Coefficients:
##            (Intercept)            scale(logTN)            scale(logTP)  
##                2.01046                 0.01543                 0.50133  
##         scale(logNVSS)                 Season2                 Season3  
##                0.17134                 0.50794                 0.73688  
##   scale(logTN):Season2    scale(logTN):Season3    scale(logTP):Season2  
##                0.22692                 0.32829                 0.21720  
##   scale(logTP):Season3  scale(logNVSS):Season2  scale(logNVSS):Season3  
##               -0.04849                -0.24955                -0.16007
## 
## Call:
## lm(formula = logCHL ~ (logTN + logTP + logNVSS) * Season, data = us1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21554 -0.16117  0.04346  0.21340  0.93204 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.14954    0.20191  -0.741 0.458946    
## logTN            0.02800    0.09117   0.307 0.758785    
## logTP            0.56718    0.06542   8.670  < 2e-16 ***
## logNVSS          0.15416    0.03346   4.607 4.13e-06 ***
## Season2         -1.23369    0.20851  -5.917 3.39e-09 ***
## Season3         -1.22086    0.31572  -3.867 0.000111 ***
## logTN:Season2    0.41159    0.09449   4.356 1.34e-05 ***
## logTN:Season3    0.59546    0.14445   4.122 3.78e-05 ***
## logTP:Season2    0.24573    0.06788   3.620 0.000296 ***
## logTP:Season3   -0.05486    0.09787  -0.561 0.575096    
## logNVSS:Season2 -0.22453    0.03482  -6.448 1.19e-10 ***
## logNVSS:Season3 -0.14402    0.04886  -2.948 0.003208 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3205 on 10019 degrees of freedom
##   (3752 observations deleted due to missingness)
## Multiple R-squared:  0.5857, Adjusted R-squared:  0.5853 
## F-statistic:  1288 on 11 and 10019 DF,  p-value: < 2.2e-16
## Analysis of Variance Table
## 
## Response: logCHL
##                   Df  Sum Sq Mean Sq   F value    Pr(>F)    
## logTN              1 1150.68 1150.68 11205.057 < 2.2e-16 ***
## logTP              1  256.17  256.17  2494.494 < 2.2e-16 ***
## logNVSS            1    5.50    5.50    53.549 2.715e-13 ***
## Season             2   30.41   15.21   148.066 < 2.2e-16 ***
## logTN:Season       2    6.28    3.14    30.588 5.703e-14 ***
## logTP:Season       2    1.20    0.60     5.820  0.002978 ** 
## logNVSS:Season     2    4.57    2.29    22.267 2.245e-10 ***
## Residuals      10019 1028.88    0.10                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Partial main effects

par(mfrow=c(2,2));termplot(m2,partial.resid = TRUE,se = TRUE,lty.se = 2,col.se=1,ask = F,col.res = 'gray50')
## Warning in termplot(m2, partial.resid = TRUE, se = TRUE, lty.se = 2, col.se
## = 1, : 'model' appears to involve interactions: see the help page

Check patterns in the residuals

par(mfrow=c(2,2));plot(m2)