# load the data for ENA indices
#setwd("~/Dropbox/Xu ly so lieu")
#load("/Users/LDA/Dropbox/Xu ly so lieu/ENA.rda")
setwd("E:/Dropbox/Xu ly so lieu")
load("ENA.rda")
#system.time(data <- read.xlsx("data for analysis.xlsx",1))
load("data for analysis.rda")
df <- data
#str(df)

Prepare the training data set and validation data set. Validation data set will consitute of some economically important species which are presented in the food web models as a single species compartment. There are 7 species which are presented in not less than 15 models.

Now create a training data set that does not include the above species

df$Group <- as.character(df$Group)
df_top$Group <- as.character(df_top$Group)
# get rows of validation data set in df
idx <- NULL
system.time(for (i in 1:nrow(df)){
    for (j in 1:nrow(df_top)){
        if (df$ID[i]==df_top$ID[j] & df$Group[i]==df_top$Group[j]){
            x <- which(df$ID== df$ID[j] & df$Group==df$Group[j])
            idx <- c(idx, x)
        }
    }
})
##    user  system elapsed 
##   19.50    0.01   19.92
# remove validation data from df
df_cal <- df[-idx,]
df.backup <- df
df <- df_cal

1. Exploratory of data

Table 1: Data summary of all ecosystem type

Make boxplot for distribution of log10(SPPR) in all ecosystem types

Figure 1. Boxplot of log10(SPPR) in different ecosystem types

1.2. Summary of ecological parameters

Table 2. Summary of ecological parameters

Make boxplot for distribution of PB and PQ in all ecosystem types

***Figure 2. Boxplot for PQ of different species

Figure 3. Boxplot of QB for different species group. Black dots indicate the mean values

2. Building a model for all species

2.1 Correlation of log10(SPPR) vs predictors

df_cor <- df[,which(colnames(df) %in% c("SPPR", "TL", "TE", "PB", "QB", "PQ","NPP"))]
pairs(df_cor, lower.panel=panel.smooth,upper.panel = panel.cor)

#scatterplotMatrix(df_cor, spread=FALSE, smoother.args=list(lty=2))

log10(SPPR) is possitvely correlated with Trophic level(TL) and negatively correlated with growth efficiency (PQ), but at lower level. Therefore we can test the regression models with predictors are TL, PQ and Ecosystem.type

ggplot(data=df, aes(x=TL, y=SPPR, colour=Ecosystem.type))+
  geom_point()+
  geom_smooth(method=lm)+
  scale_y_continuous("log10(SPPR)")

2.2. Variable selection

In this part, I used the all subset model technique to find which model is the best in terms of adjusted R square.

leaps <-regsubsets(SPPR ~ TL+PQ+QB+TE+PB, data=df, nbest=4)
plot(leaps, scale="adjr2") # model with population and illiteracy give the best R2

Figure 4. Result for all subset model regression. From Figure 4, it is clear that model with TL and PQ as predictor will give higher adjusted R square than only TL. Also, model with 3 predictors: TL, PQ and TE is the best.

2.3. Model construction with TL, PQ, and TE as predictors

fit1 <- lm(SPPR~TL + PQ +TE, data = df)
summary(fit1)
## 
## Call:
## lm(formula = SPPR ~ TL + PQ + TE, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8169 -0.3620 -0.0757  0.2649  4.0425 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.435611   0.058234   -7.48 9.59e-14 ***
## TL           1.184088   0.015264   77.58  < 2e-16 ***
## PQ          -1.640970   0.098373  -16.68  < 2e-16 ***
## TE          -0.052318   0.002357  -22.20  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.595 on 3103 degrees of freedom
## Multiple R-squared:  0.6904, Adjusted R-squared:  0.6901 
## F-statistic:  2306 on 3 and 3103 DF,  p-value: < 2.2e-16

Cross-validation

shrinkage(fit1, n.predictors=3, k=5)
## Original R-square = 0.6903887 
## 5 Fold Cross-Validated R-square = 0.689241 
## Change = 0.00114772

Model diagnostic

par(mfrow=c(2,2))
plot(fit1)

Figure 5. Plot for model diagnostic with TL, PQ, and TE as predictors.

As can be seen from Figure 5, the normality of residuals might be violated. But I do not know how to handle this problem.

shapiro.test(fit1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  fit1$residuals
## W = 0.9219, p-value < 2.2e-16

Figure 6. Observed values vs Predicted values

2.4. Model with only TL and PQ as predictors.

fit2 <- lm(SPPR~TL + PQ, data=df)
summary(fit2)
## 
## Call:
## lm(formula = SPPR ~ TL + PQ, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8262 -0.3925 -0.0880  0.3031  4.0294 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.87195    0.05900  -14.78   <2e-16 ***
## TL           1.11952    0.01613   69.42   <2e-16 ***
## PQ          -1.80885    0.10556  -17.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6404 on 3104 degrees of freedom
## Multiple R-squared:  0.6412, Adjusted R-squared:  0.641 
## F-statistic:  2774 on 2 and 3104 DF,  p-value: < 2.2e-16

Cross-validation

shrinkage(fit2, n.predictors=2, k=5)
## Original R-square = 0.6412369 
## 5 Fold Cross-Validated R-square = 0.6400997 
## Change = 0.001137167

***Model diagnostic with TL and PQ as predictors.

par(mfrow=c(2,2))
plot(fit2)

Figure 7. Plot for model diagnostic with TL and PQ as predictors.

It is clear that residuals might not follow the normal distribution.

shapiro.test(fit2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  fit2$residuals
## W = 0.93263, p-value < 2.2e-16

Figure 8. Observed vs Fitted.values when model consists of 2 predictor (i.e. TL and PQ) ##2.5. Model with only TL as predictor

fit3 <- lm(SPPR~TL, data=df)
summary(fit3)
## 
## Call:
## lm(formula = SPPR ~ TL, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6993 -0.4294 -0.0706  0.3436  4.2288 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.34097    0.05467  -24.53   <2e-16 ***
## TL           1.15776    0.01671   69.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6699 on 3105 degrees of freedom
## Multiple R-squared:  0.6073, Adjusted R-squared:  0.6072 
## F-statistic:  4802 on 1 and 3105 DF,  p-value: < 2.2e-16

Cross-validation

shrinkage(fit3, n.predictors=1, k=5)
## Original R-square = 0.6073007 
## 5 Fold Cross-Validated R-square = 0.6067353 
## Change = 0.0005654474

***Model diagnostic with TL as only predictor

par(mfrow=c(2,2))
plot(fit3)

Figure 9. Model diagnostic with only TL as an predictor

2.6. System specific model construction

In this part, I tested if it is necessary to consider Ecosystem.type as a predictor.

fit4 <- lm(SPPR ~ TL+ PQ + Ecosystem.type, data=df)
summary(fit4)
## 
## Call:
## lm(formula = SPPR ~ TL + PQ + Ecosystem.type, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7890 -0.3940 -0.0879  0.2974  3.9574 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -0.985640   0.096248 -10.241  < 2e-16 ***
## TL                               1.128750   0.016400  68.826  < 2e-16 ***
## PQ                              -1.816081   0.105304 -17.246  < 2e-16 ***
## Ecosystem.typeCoral reef         0.255307   0.112395   2.272   0.0232 *  
## Ecosystem.typeBay/Fjord          0.038646   0.087803   0.440   0.6599    
## Ecosystem.typeCoastal lagoon     0.487037   0.099216   4.909 9.63e-07 ***
## Ecosystem.typeChanel/Strait      0.002842   0.086791   0.033   0.9739    
## Ecosystem.typeUpwelling         -0.042889   0.084952  -0.505   0.6137    
## Ecosystem.typeContinental shelf  0.080539   0.079889   1.008   0.3135    
## Ecosystem.typeOpen ocean         0.157325   0.082688   1.903   0.0572 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6333 on 3097 degrees of freedom
## Multiple R-squared:  0.6499, Adjusted R-squared:  0.6489 
## F-statistic: 638.9 on 9 and 3097 DF,  p-value: < 2.2e-16

Only 2/7 ecosystem types show the significant effect on estimated intercept. Also, the adjusted R squared just increases by 0.008. So I think it is not necessary to build the regression for each ecosystem types.

3. Model validation

In this section I used independent datasets of some economically important species.

library(hydroGOF)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
val <- function(fit=list(fit1, fit2, fit3), dfnew){
    RMSE <- NULL
    for (i in 1:3){
        pred<- predict(fit[[i]], newdata=dfnew)
        RMSE[i] <- rmse(dfnew$SPPR, pred)
    }
    return(RMSE)
}

val_cod <- round(val(dfnew=df_cod), digits = 2)
val_herring <- round(val(dfnew=df_herring), digits = 2)
val_capelin <- round(val(dfnew=df_capelin), digits = 2)
val_plaice <- round(val(dfnew=df_plaice), digits = 2)
val_haddock <- round(val(dfnew = df_haddock), digits = 2)
val_halibut <- round(val(dfnew = df_halibut), digits = 2)
val_mackerel <- round(val(dfnew = df_mackerel), digits = 2)

df_val <- cbind(val_cod, val_herring, val_capelin, val_plaice, val_haddock, val_halibut, val_mackerel)
colnames(df_val) <- c("Atlantic cod", "Herring", "Capelin", "American plaice", "Haddock", "Greenland halibut", "Atlantic mackerel")
rownames(df_val) <- c("log10(SPPR = 1.18*TL - 1.64*PQ -0.05*TE- 0.44",
                      "log10(SPPR = 1.12*TL - 1.809*PQ - 0.872",
                      "log10(SPPR)= 1.15*TL = 1.34")

datatable (df_val)