suppressMessages(library(readxl))
suppressMessages(library(dplyr))
suppressMessages(library(lubridate))
us=read_excel('MU combined Statewide and Volunteer Data.xlsx')
us=us%>%mutate(Season=factor(Season),Date=as.Date(paste(Year,Month,Day,sep='/')))%>%select(-`MU#`)
There is a considerable amount of missing values,particularly for NTU and CHL. Factor Season is very unbalanced with most obs. recoirded in season 2 qand only 4 obs. in season 4. Sampling effort increased with time.
barplot(with(us,table(Season)),col=rainbow(7),xlab='Season',ylab='Number of obs.')
us=us%>%filter(Month<13,!is.na(Season),Season!=4)%>%select(-NTU)
with(us,barplot(table(factor(Year)),col=rainbow(30),xlab='Year',ylab='no. of obs.',main='Sampling effort through time'))
par(las=2);par(mar=c(5,8,4,2))
with(us,barplot(table(Lake),col=rainbow(30),xlab='Year',ylab='no. of obs.',main='Sampling effort across lakes'),cex.names=0.5,horiz=TRUE)
with(us,barplot(table(Month),col=rainbow(30),xlab='Year',ylab='no. of obs.',main='Sampling effort per month'))
load('lakes.RData')
pairs(na.omit(us[,6:11]),upper.panel = panel.cor,lower.panel = panel.smooth,cex.labels = 1.5,gap = 0.5,cex=0.7,diag.panel = panel.hist)
pairs(na.omit(log(us[,6:11])),upper.panel = panel.cor,lower.panel = panel.smooth,cex.labels = 1.5,gap = 0.5,cex=0.7,diag.panel = panel.hist)
us1=us%>%mutate(logTN=log(TN),logTP=log(TP),logCHL=log(CHL),logNVSS=log(NVSS),logVSS=log(VSS))
corvif(us1[,c(13,14,16,17)])
##
##
## Variance inflation factors
##
## GVIF
## logTN 3.134206
## logTP 5.056302
## logNVSS 1.863074
## logVSS 2.602770
corvif(us1[,c(13,14,16)])
##
##
## Variance inflation factors
##
## GVIF
## logTN 3.044174
## logTP 3.995180
## logNVSS 1.857591
suppressMessages(library(ggplot2))
suppressMessages(library(tidyr))
us2=us%>%gather('key','value',-c(Season,Year,CHL,Lake,Month,Day,Date))
ggplot(us2,aes(x=value,y=CHL))+geom_point(alpha=.3)+geom_smooth(method = lm)+theme_bw()+scale_y_log10()+scale_x_log10()+facet_grid(Season~key,scales = 'free')
us3=us%>%group_by(Year,Month)%>%summarise_each(funs = 'mean',-Lake)%>%gather('key','value',-c(NTU,Season,Year,Month,Day,Date))
ggplot(us3,aes(x=Date,y=value))+geom_point(alpha=.3)+geom_smooth()+theme_bw()+scale_y_log10()+facet_grid(key~.,scales = 'free')+xlab('')
## Start: AIC=-6086.35
## logCHL ~ (scale(logTN) + scale(logTP) + scale(logNVSS)) * Season
##
## Df Sum of Sq RSS AIC
## <none> 5455.0 -6086.4
## - scale(logTN):Season 2 12.200 5467.2 -6067.9
## - scale(logTP):Season 2 14.927 5469.9 -6062.9
## - scale(logNVSS):Season 2 24.247 5479.3 -6045.9
##
## Call:
## lm(formula = logCHL ~ (scale(logTN) + scale(logTP) + scale(logNVSS)) *
## Season, data = us1)
##
## Coefficients:
## (Intercept) scale(logTN) scale(logTP)
## 2.01046 0.01543 0.50133
## scale(logNVSS) Season2 Season3
## 0.17134 0.50794 0.73688
## scale(logTN):Season2 scale(logTN):Season3 scale(logTP):Season2
## 0.22692 0.32829 0.21720
## scale(logTP):Season3 scale(logNVSS):Season2 scale(logNVSS):Season3
## -0.04849 -0.24955 -0.16007
##
## Call:
## lm(formula = logCHL ~ (logTN + logTP + logNVSS) * Season, data = us1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21554 -0.16117 0.04346 0.21340 0.93204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.14954 0.20191 -0.741 0.458946
## logTN 0.02800 0.09117 0.307 0.758785
## logTP 0.56718 0.06542 8.670 < 2e-16 ***
## logNVSS 0.15416 0.03346 4.607 4.13e-06 ***
## Season2 -1.23369 0.20851 -5.917 3.39e-09 ***
## Season3 -1.22086 0.31572 -3.867 0.000111 ***
## logTN:Season2 0.41159 0.09449 4.356 1.34e-05 ***
## logTN:Season3 0.59546 0.14445 4.122 3.78e-05 ***
## logTP:Season2 0.24573 0.06788 3.620 0.000296 ***
## logTP:Season3 -0.05486 0.09787 -0.561 0.575096
## logNVSS:Season2 -0.22453 0.03482 -6.448 1.19e-10 ***
## logNVSS:Season3 -0.14402 0.04886 -2.948 0.003208 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3205 on 10019 degrees of freedom
## (3752 observations deleted due to missingness)
## Multiple R-squared: 0.5857, Adjusted R-squared: 0.5853
## F-statistic: 1288 on 11 and 10019 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: logCHL
## Df Sum Sq Mean Sq F value Pr(>F)
## logTN 1 1150.68 1150.68 11205.057 < 2.2e-16 ***
## logTP 1 256.17 256.17 2494.494 < 2.2e-16 ***
## logNVSS 1 5.50 5.50 53.549 2.715e-13 ***
## Season 2 30.41 15.21 148.066 < 2.2e-16 ***
## logTN:Season 2 6.28 3.14 30.588 5.703e-14 ***
## logTP:Season 2 1.20 0.60 5.820 0.002978 **
## logNVSS:Season 2 4.57 2.29 22.267 2.245e-10 ***
## Residuals 10019 1028.88 0.10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,2));termplot(m2,partial.resid = TRUE,se = TRUE,lty.se = 2,col.se=1,ask = F,col.res = 'gray50')
## Warning in termplot(m2, partial.resid = TRUE, se = TRUE, lty.se = 2, col.se
## = 1, : 'model' appears to involve interactions: see the help page
par(mfrow=c(2,2));plot(m2)