# load the data for ENA indices
#setwd("~/Dropbox/Xu ly so lieu")
#load("/Users/LDA/Dropbox/Xu ly so lieu/ENA.rda")
setwd("E:/Dropbox/Xu ly so lieu")
load("ENA.rda")#system.time(data <- read.xlsx("data for analysis.xlsx",1))
load("data for analysis.rda")
df <- data
#str(df)Prepare the training data set and validation data set. Validation data set will consitute of some economically important species which are presented in the food web models as a single species compartment. There are 7 species which are presented in not less than 15 models.
Now create a training data set that does not include the above species
df$Group <- as.character(df$Group)
df_top$Group <- as.character(df_top$Group)
# get rows of validation data set in df
idx <- NULL
system.time(for (i in 1:nrow(df)){
for (j in 1:nrow(df_top)){
if (df$ID[i]==df_top$ID[j] & df$Group[i]==df_top$Group[j]){
x <- which(df$ID== df$ID[j] & df$Group==df$Group[j])
idx <- c(idx, x)
}
}
})## user system elapsed
## 19.50 0.01 19.92
# remove validation data from df
df_cal <- df[-idx,]
df.backup <- df
df <- df_calTable 1: Data summary of all ecosystem type
Make boxplot for distribution of log10(SPPR) in all ecosystem types
Figure 1. Boxplot of log10(SPPR) in different ecosystem types
Table 2. Summary of ecological parameters
Make boxplot for distribution of PB and PQ in all ecosystem types
***Figure 2. Boxplot for PQ of different species
Figure 3. Boxplot of QB for different species group. Black dots indicate the mean values
df_cor <- df[,which(colnames(df) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_cor, lower.panel=panel.smooth,upper.panel = panel.cor)#scatterplotMatrix(df_cor, spread=FALSE, smoother.args=list(lty=2))log10(SPPR) is possitvely correlated with Trophic level(TL) and negatively correlated with growth efficiency (PQ), but at lower level. Therefore we can test the regression models with predictors are TL, PQ and Ecosystem.type
ggplot(data=df, aes(x=TL, y=SPPR, colour=Ecosystem.type))+
geom_point()+
geom_smooth(method=lm)+
scale_y_continuous("log10(SPPR)")In this part, I used the all subset model technique to find which model is the best in terms of adjusted R square.
leaps <-regsubsets(SPPR ~ TL+PQ+QB+TE+PB, data=df, nbest=4)
plot(leaps, scale="adjr2") # model with population and illiteracy give the best R2Figure 4. Result for all subset model regression. From Figure 4, it is clear that model with TL and PQ as predictor will give higher adjusted R square than only TL. Also, model with 3 predictors: TL, PQ and TE is the best.
fit1 <- lm(SPPR~TL + PQ +TE, data = df)
summary(fit1)##
## Call:
## lm(formula = SPPR ~ TL + PQ + TE, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8169 -0.3620 -0.0757 0.2649 4.0425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.435611 0.058234 -7.48 9.59e-14 ***
## TL 1.184088 0.015264 77.58 < 2e-16 ***
## PQ -1.640970 0.098373 -16.68 < 2e-16 ***
## TE -0.052318 0.002357 -22.20 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.595 on 3103 degrees of freedom
## Multiple R-squared: 0.6904, Adjusted R-squared: 0.6901
## F-statistic: 2306 on 3 and 3103 DF, p-value: < 2.2e-16
Cross-validation
shrinkage(fit1, n.predictors=3, k=5)## Original R-square = 0.6903887
## 5 Fold Cross-Validated R-square = 0.689241
## Change = 0.00114772
Model diagnostic
par(mfrow=c(2,2))
plot(fit1)Figure 5. Plot for model diagnostic with TL, PQ, and TE as predictors.
As can be seen from Figure 5, the normality of residuals might be violated. But I do not know how to handle this problem.
shapiro.test(fit1$residuals)##
## Shapiro-Wilk normality test
##
## data: fit1$residuals
## W = 0.9219, p-value < 2.2e-16
Figure 6. Observed values vs Predicted values
fit2 <- lm(SPPR~TL + PQ, data=df)
summary(fit2)##
## Call:
## lm(formula = SPPR ~ TL + PQ, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8262 -0.3925 -0.0880 0.3031 4.0294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.87195 0.05900 -14.78 <2e-16 ***
## TL 1.11952 0.01613 69.42 <2e-16 ***
## PQ -1.80885 0.10556 -17.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6404 on 3104 degrees of freedom
## Multiple R-squared: 0.6412, Adjusted R-squared: 0.641
## F-statistic: 2774 on 2 and 3104 DF, p-value: < 2.2e-16
Cross-validation
shrinkage(fit2, n.predictors=2, k=5)## Original R-square = 0.6412369
## 5 Fold Cross-Validated R-square = 0.6400997
## Change = 0.001137167
***Model diagnostic with TL and PQ as predictors.
par(mfrow=c(2,2))
plot(fit2) Figure 7. Plot for model diagnostic with TL and PQ as predictors.
It is clear that residuals might not follow the normal distribution.
shapiro.test(fit2$residuals)##
## Shapiro-Wilk normality test
##
## data: fit2$residuals
## W = 0.93263, p-value < 2.2e-16
Figure 8. Observed vs Fitted.values when model consists of 2 predictor (i.e. TL and PQ) ##2.5. Model with only TL as predictor
fit3 <- lm(SPPR~TL, data=df)
summary(fit3)##
## Call:
## lm(formula = SPPR ~ TL, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6993 -0.4294 -0.0706 0.3436 4.2288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.34097 0.05467 -24.53 <2e-16 ***
## TL 1.15776 0.01671 69.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6699 on 3105 degrees of freedom
## Multiple R-squared: 0.6073, Adjusted R-squared: 0.6072
## F-statistic: 4802 on 1 and 3105 DF, p-value: < 2.2e-16
Cross-validation
shrinkage(fit3, n.predictors=1, k=5)## Original R-square = 0.6073007
## 5 Fold Cross-Validated R-square = 0.6067353
## Change = 0.0005654474
***Model diagnostic with TL as only predictor
par(mfrow=c(2,2))
plot(fit3) Figure 9. Model diagnostic with only TL as an predictor
In this part, I tested if it is necessary to consider Ecosystem.type as a predictor.
fit4 <- lm(SPPR ~ TL+ PQ + Ecosystem.type, data=df)
summary(fit4)##
## Call:
## lm(formula = SPPR ~ TL + PQ + Ecosystem.type, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7890 -0.3940 -0.0879 0.2974 3.9574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.985640 0.096248 -10.241 < 2e-16 ***
## TL 1.128750 0.016400 68.826 < 2e-16 ***
## PQ -1.816081 0.105304 -17.246 < 2e-16 ***
## Ecosystem.typeCoral reef 0.255307 0.112395 2.272 0.0232 *
## Ecosystem.typeBay/Fjord 0.038646 0.087803 0.440 0.6599
## Ecosystem.typeCoastal lagoon 0.487037 0.099216 4.909 9.63e-07 ***
## Ecosystem.typeChanel/Strait 0.002842 0.086791 0.033 0.9739
## Ecosystem.typeUpwelling -0.042889 0.084952 -0.505 0.6137
## Ecosystem.typeContinental shelf 0.080539 0.079889 1.008 0.3135
## Ecosystem.typeOpen ocean 0.157325 0.082688 1.903 0.0572 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6333 on 3097 degrees of freedom
## Multiple R-squared: 0.6499, Adjusted R-squared: 0.6489
## F-statistic: 638.9 on 9 and 3097 DF, p-value: < 2.2e-16
Only 2/7 ecosystem types show the significant effect on estimated intercept. Also, the adjusted R squared just increases by 0.008. So I think it is not necessary to build the regression for each ecosystem types.
In this section I used independent datasets of some economically important species.
library(hydroGOF)## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
val <- function(fit=list(fit1, fit2, fit3), dfnew){
RMSE <- NULL
for (i in 1:3){
pred<- predict(fit[[i]], newdata=dfnew)
RMSE[i] <- rmse(dfnew$SPPR, pred)
}
return(RMSE)
}
val_cod <- round(val(dfnew=df_cod), digits = 2)
val_herring <- round(val(dfnew=df_herring), digits = 2)
val_capelin <- round(val(dfnew=df_capelin), digits = 2)
val_plaice <- round(val(dfnew=df_plaice), digits = 2)
val_haddock <- round(val(dfnew = df_haddock), digits = 2)
val_halibut <- round(val(dfnew = df_halibut), digits = 2)
val_mackerel <- round(val(dfnew = df_mackerel), digits = 2)
df_val <- cbind(val_cod, val_herring, val_capelin, val_plaice, val_haddock, val_halibut, val_mackerel)
colnames(df_val) <- c("Atlantic cod", "Herring", "Capelin", "American plaice", "Haddock", "Greenland halibut", "Atlantic mackerel")
rownames(df_val) <- c("log10(SPPR = 1.18*TL - 1.64*PQ -0.05*TE- 0.44",
"log10(SPPR = 1.12*TL - 1.809*PQ - 0.872",
"log10(SPPR)= 1.15*TL = 1.34")
datatable (df_val)