#The causal diagram explains the interactive effects of different soil paramters on the microbial biomass carbon.
#Increase in soil organic carbon and available nitrogen contents enhance microbial growth which is in turn positively affect dehydrogenase activity- a indicator of soil microbial profile
#Additionally, higher percentage sand content reflects less good environment for microbial growth in terms of nutrients supplies. Also, increase in electrical conductvity make soil unsuitable for plant and microbial growth due to high salinity which will in turn affect microbial dehydrogenase activity.
#I therefore model the signficance of different predictors of microbial biomass carbon. 
#The model is written in the "dot" language
library(DiagrammeR)
Causalmodel <-
  "digraph{

rankdir = LR;
Microbial_biomass_carbon -> Soil_Organic_carbon [label='+'];
Microbial_biomass_carbon -> Available_Nitrogen [label='+'];
Microbial_biomass_carbon ->Dehydrogenase_activity[label='+'];
Microbial_biomass_carbon -> Electrical_conductivity [label='-'];
Soil_Organic_carbon-> Electrical_conductivity [label='-'];
Dehydrogenase_activity-> Electrical_conductivity [label='-'];
Microbial_biomass_carbon-> Percent_Sand_content[label='-'];
Microbial_biomass_carbon-> Percent_Clay[label='+']
}"
grViz(Causalmodel)
help("grViz")
## Warning in stats::runif(10): '.Random.seed' is not an integer vector but of
## type 'NULL', so ignored
## starting httpd help server ...
##  done
#Lavaan modeling
library(lavaan)
## This is lavaan 0.5-23.1097
## lavaan is BETA software! Please report any bugs.
library(semPlot)
source("lavaan_modavg.R")
getwd()
## [1] "C:/Users/Paul/Documents/Rwork"
t.dat<- read.csv("Microbial biomass data.csv")

# specify "mod.1"
mod.1 <- 'Microbial_biomass_carbon  ~ Available_Nitrogen + Soil_organic_carbon + Dehydrogenase_activity + Percent_Sand_content
Dehydrogenase_activity  ~ Electrical_conductivity + Microbial_biomass_carbon + Percent_Sand_content'

# Fit "mod.1"
mod.1.fit <- sem(mod.1, data=t.dat)
## Warning in lav_data_full(data = data, group = group, cluster = cluster, :
## lavaan WARNING: some observed variances are (at least) a factor 1000 times
## larger than others; use varTable(fit) to investigate
summary(mod.1.fit,standardized=TRUE)
## lavaan (0.5-23.1097) converged normally after  95 iterations
## 
##   Number of observations                           200
## 
##   Estimator                                         ML
##   Minimum Function Test Statistic                2.395
##   Degrees of freedom                                 2
##   P-value (Chi-square)                           0.302
## 
## Parameter Estimates:
## 
##   Information                                 Expected
##   Standard Errors                             Standard
## 
## Regressions:
##                              Estimate   Std.Err   z-value  P(>|z|)
##   Microbial_biomass_carbon ~                                      
##     Availabl_Ntrgn               3.872     2.122    1.825    0.068
##     Soil_rgnc_crbn             643.390   254.494    2.528    0.011
##     Dhydrgns_ctvty               4.824    38.580    0.125    0.900
##     Prcnt_Snd_cntn              -3.208     2.185   -1.468    0.142
##   Dehydrogenase_activity ~                                        
##     Elctrcl_cndctv               0.384     0.957    0.402    0.688
##     Mcrbl_bmss_crb               0.002     0.002    0.773    0.440
##     Prcnt_Snd_cntn              -0.010     0.016   -0.637    0.524
##    Std.lv   Std.all
##                    
##      3.872    0.137
##    643.390    0.177
##      4.824    0.039
##     -3.208   -0.115
##                    
##      0.384    0.028
##      0.002    0.233
##     -0.010   -0.046
## 
## Variances:
##                    Estimate   Std.Err   z-value  P(>|z|)   Std.lv 
##    .Mcrbl_bmss_crb 59702.135 10569.038    5.649    0.000 59702.135
##    .Dhydrgns_ctvty     3.855     0.396    9.732    0.000     3.855
##   Std.all
##     0.924
##     0.923
#Results show that the minimum test statistics=11.73
#I then added another path to my model in model 2
#Specify "model.2"
mod.2 <- 'Microbial_biomass_carbon  ~ Available_Nitrogen + Soil_organic_carbon + Dehydrogenase_activity + Percent_Sand_content
Dehydrogenase_activity  ~ Electrical_conductivity + Microbial_biomass_carbon + Percent_Sand_content + Percent_Clay'
#fit mod.2
mod.2.fit <- sem(mod.2, data=t.dat)
## Warning in lav_data_full(data = data, group = group, cluster = cluster, :
## lavaan WARNING: some observed variances are (at least) a factor 1000 times
## larger than others; use varTable(fit) to investigate
summary(mod.2.fit,standardized=TRUE)
## lavaan (0.5-23.1097) converged normally after  94 iterations
## 
##   Number of observations                           200
## 
##   Estimator                                         ML
##   Minimum Function Test Statistic                2.544
##   Degrees of freedom                                 3
##   P-value (Chi-square)                           0.467
## 
## Parameter Estimates:
## 
##   Information                                 Expected
##   Standard Errors                             Standard
## 
## Regressions:
##                              Estimate   Std.Err   z-value  P(>|z|)
##   Microbial_biomass_carbon ~                                      
##     Availabl_Ntrgn               3.883     2.115    1.836    0.066
##     Soil_rgnc_crbn             644.503   254.554    2.532    0.011
##     Dhydrgns_ctvty               3.987    36.542    0.109    0.913
##     Prcnt_Snd_cntn              -3.224     2.170   -1.486    0.137
##   Dehydrogenase_activity ~                                        
##     Elctrcl_cndctv               0.226     0.961    0.235    0.814
##     Mcrbl_bmss_crb               0.002     0.002    0.845    0.398
##     Prcnt_Snd_cntn              -0.006     0.016   -0.378    0.705
##     Percent_Clay                 0.030     0.025    1.179    0.238
##    Std.lv   Std.all
##                    
##      3.883    0.138
##    644.503    0.177
##      3.987    0.032
##     -3.224   -0.116
##                    
##      0.226    0.016
##      0.002    0.240
##     -0.006   -0.028
##      0.030    0.083
## 
## Variances:
##                    Estimate   Std.Err   z-value  P(>|z|)   Std.lv 
##    .Mcrbl_bmss_crb 59894.049 10409.485    5.754    0.000 59894.049
##    .Dhydrgns_ctvty     3.826     0.389    9.836    0.000     3.826
##   Std.all
##     0.927
##     0.917
#After fitting mode.2, the test statistics increases which was expected to decrease, i now had to revisit my first model to create model 3 by removing two paths
#Specify "model.3"
mod.3 <- 'Microbial_biomass_carbon  ~ Available_Nitrogen + Soil_organic_carbon + Dehydrogenase_activity + Percent_Sand_content
Dehydrogenase_activity  ~ Electrical_conductivity + Microbial_biomass_carbon'
#fit mod.3
mod.3.fit <- sem(mod.3,data=t.dat)
## Warning in lav_data_full(data = data, group = group, cluster = cluster, :
## lavaan WARNING: some observed variances are (at least) a factor 1000 times
## larger than others; use varTable(fit) to investigate
summary(mod.3.fit)
## lavaan (0.5-23.1097) converged normally after 109 iterations
## 
##   Number of observations                           200
## 
##   Estimator                                         ML
##   Minimum Function Test Statistic                2.792
##   Degrees of freedom                                 3
##   P-value (Chi-square)                           0.425
## 
## Parameter Estimates:
## 
##   Information                                 Expected
##   Standard Errors                             Standard
## 
## Regressions:
##                              Estimate   Std.Err   z-value  P(>|z|)
##   Microbial_biomass_carbon ~                                      
##     Availabl_Ntrgn               4.009     2.188    1.832    0.067
##     Soil_rgnc_crbn             657.812   263.806    2.494    0.013
##     Dhydrgns_ctvty              -6.007    38.285   -0.157    0.875
##     Prcnt_Snd_cntn              -3.414     2.164   -1.578    0.115
##   Dehydrogenase_activity ~                                        
##     Elctrcl_cndctv               0.274     0.956    0.287    0.774
##     Mcrbl_bmss_crb               0.003     0.002    1.115    0.265
## 
## Variances:
##                    Estimate   Std.Err   z-value  P(>|z|)
##    .Mcrbl_bmss_crb 62636.348 13614.993    4.601    0.000
##    .Dhydrgns_ctvty     3.865     0.401    9.650    0.000
#Gettin the same ML value
#Next, i tried the model3 again with reduced sample size (from 310 to 200)
#Specify model 4
library(lavaan)
library(semPlot)
getwd()
## [1] "C:/Users/Paul/Documents/Rwork"
t.dat<- read.csv("Microbial biomass data.csv")
mod.4 <- 'Microbial_biomass_carbon  ~ Available_Nitrogen + Soil_organic_carbon + Dehydrogenase_activity + Percent_Sand_content
Dehydrogenase_activity  ~ Electrical_conductivity + Microbial_biomass_carbon'
#fit mod.4
mod.4.fit <- sem(mod.4,data=t.dat)
## Warning in lav_data_full(data = data, group = group, cluster = cluster, :
## lavaan WARNING: some observed variances are (at least) a factor 1000 times
## larger than others; use varTable(fit) to investigate
summary(mod.4.fit,standardized=T, fit.measures=TRUE)
## lavaan (0.5-23.1097) converged normally after 109 iterations
## 
##   Number of observations                           200
## 
##   Estimator                                         ML
##   Minimum Function Test Statistic                2.792
##   Degrees of freedom                                 3
##   P-value (Chi-square)                           0.425
## 
## Model test baseline model:
## 
##   Minimum Function Test Statistic               30.789
##   Degrees of freedom                                 9
##   P-value                                        0.000
## 
## User model versus baseline model:
## 
##   Comparative Fit Index (CFI)                    1.000
##   Tucker-Lewis Index (TLI)                       1.029
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -2883.500
##   Loglikelihood unrestricted model (H1)      -2882.104
## 
##   Number of free parameters                          8
##   Akaike (AIC)                                5783.000
##   Bayesian (BIC)                              5809.386
##   Sample-size adjusted Bayesian (BIC)         5784.041
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000
##   90 Percent Confidence Interval          0.000  0.116
##   P-value RMSEA <= 0.05                          0.620
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.025
## 
## Parameter Estimates:
## 
##   Information                                 Expected
##   Standard Errors                             Standard
## 
## Regressions:
##                              Estimate   Std.Err   z-value  P(>|z|)
##   Microbial_biomass_carbon ~                                      
##     Availabl_Ntrgn               4.009     2.188    1.832    0.067
##     Soil_rgnc_crbn             657.812   263.806    2.494    0.013
##     Dhydrgns_ctvty              -6.007    38.285   -0.157    0.875
##     Prcnt_Snd_cntn              -3.414     2.164   -1.578    0.115
##   Dehydrogenase_activity ~                                        
##     Elctrcl_cndctv               0.274     0.956    0.287    0.774
##     Mcrbl_bmss_crb               0.003     0.002    1.115    0.265
##    Std.lv   Std.all
##                    
##      4.009    0.142
##    657.812    0.181
##     -6.007   -0.048
##     -3.414   -0.123
##                    
##      0.274    0.020
##      0.003    0.318
## 
## Variances:
##                    Estimate   Std.Err   z-value  P(>|z|)   Std.lv 
##    .Mcrbl_bmss_crb 62636.348 13614.993    4.601    0.000 62636.348
##    .Dhydrgns_ctvty     3.865     0.401    9.650    0.000     3.865
##   Std.all
##     0.969
##     0.926
#Model 4 gives a reasonable ML value
#Comparing the modles using their aic values
source("lavaan_modavg.R")
anova(mod.1.fit, mod.2.fit, mod.3.fit,mod.4.fit)
## Warning in lavTestLRT(object = <S4 object of class structure("lavaan",
## package = "lavaan")>, : lavaan WARNING: some models are based on a
## different set of observed variables
## Chi Square Difference Test
## 
##           Df    AIC    BIC  Chisq Chisq diff Df diff Pr(>Chisq)    
## mod.1.fit  2 5784.6 5814.3 2.3950                                  
## mod.2.fit  3 7031.9 7064.9 2.5442    0.14919       1     0.6993    
## mod.3.fit  3 5783.0 5809.4 2.7920    0.24778       0     <2e-16 ***
## mod.4.fit  3 5783.0 5809.4 2.7920    0.00000       0     1.0000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aictab.lavaan(list(mod.1.fit, mod.2.fit,mod.3.fit,mod.4.fit), c("mod.1", "mod.2","mod.3", "mod.4"))
## Warning in aictab.lavaan(list(mod.1.fit, mod.2.fit, mod.3.fit, mod.4.fit), : 
## Check model structure carefully as some models may be redundant
## 
## Model selection based on AICc:
## 
##        K    AICc Delta_AICc AICcWt Cum.Wt       LL
## mod.4  8 5783.43       0.00   0.41   0.41 -2883.50
## mod.3  8 5783.43       0.00   0.41   0.83 -2883.50
## mod.1  9 5785.19       1.75   0.17   1.00 -2883.30
## mod.2 10 7032.70    1249.26   0.00   1.00 -3505.97
# Results from comparing the models showed that mod.4 best described the relationship
fitMeasures(mod.4.fit)
##                npar                fmin               chisq 
##               8.000               0.007               2.792 
##                  df              pvalue      baseline.chisq 
##               3.000               0.425              30.789 
##         baseline.df     baseline.pvalue                 cfi 
##               9.000               0.000               1.000 
##                 tli                nnfi                 rfi 
##               1.029               1.029               0.728 
##                 nfi                pnfi                 ifi 
##               0.909               0.303               1.007 
##                 rni                logl   unrestricted.logl 
##               1.010           -2883.500           -2882.104 
##                 aic                 bic              ntotal 
##            5783.000            5809.386             200.000 
##                bic2               rmsea      rmsea.ci.lower 
##            5784.041               0.000               0.000 
##      rmsea.ci.upper        rmsea.pvalue                 rmr 
##               0.116               0.620               3.051 
##          rmr_nomean                srmr        srmr_bentler 
##               3.051               0.025               0.025 
## srmr_bentler_nomean         srmr_bollen  srmr_bollen_nomean 
##               0.025               0.025               0.025 
##          srmr_mplus   srmr_mplus_nomean               cn_05 
##               0.025               0.025             560.800 
##               cn_01                 gfi                agfi 
##             813.678               0.986               0.904 
##                pgfi                 mfi                ecvi 
##               0.141               1.001               0.094
semPaths(mod.4.fit,whatLabels="par",layout="spring")
## Warning in qgraph(Edgelist, labels = nLab, bidirectional = Bidir, directed
## = Directed, : The following arguments are not documented and likely not
## arguments of qgraph and thus ignored: loopRotation; residuals; residScale;
## residEdge; CircleEdgeEnd

#The fitted model shows that microbial biomass carbon is primarily determined by the amount of organic carbon which is a source of energy for microbes and for enzyme production, available nitrogen is alos so to very important to microbes as its plays key role in protein sysnthesis. In addition, percentage sand content shows the fertility of the soil which in turn determines primary production with a feedback loop to microbial biomass carbon via the amount of nutrients available in the soil. Furthermore, dehydrogenase activity is largely detetmined by microbial biomass and soil electrical conductivity (relating to soil salinity)