This report show the code and analyis for assesment the risk for selection of pesticides in cleaning water.

Dataset PPDB database

database<- read.csv(file = 'risk_def.csv',header=T,sep=",",dec=".", stringsAsFactors=FALSE)
database_filter<-database[(database$solubility<=600 & database$solubility>0 & database$persistence<=100 & database$persistence>0 & database$ld50<=16000),]
database_filter<-na.omit(database_filter)
#Rescaled de dependent variable (0-1)
normalizedR<-((database_filter$risk-min(database_filter$risk))/(max(database_filter$risk)-min(database_filter$risk)))
database_filter<-cbind(database_filter,RiskN = normalizedR)
#Add scaled data for independent variables
database_filter$solubility_scaled<-scale(database_filter$solubility)
database_filter$persistence_scaled<-scale(database_filter$persistence)
database_filter$ld50_scaled<-scale(database_filter$ld50)
#Add transformed data for independent variables
database_filter$solubility_sqrt<-sqrt(database_filter$solubility)
database_filter$persistence_sqrt<-sqrt(database_filter$persistence)
database_filter$ld50_sqrt<-sqrt(database_filter$ld50)

#Delete data once the model has been studied and outlier detected:
database_filter=database_filter[-c(23,26,70,132,204,378,467,502,534), ]

Summary and distribution: Solubility

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.32    4.30   56.41   38.00  596.00

## [1] 1.704678

Summary and distribution: Persistence

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.02    2.45    9.90   22.98   30.50  100.00

## [1] 0.7220815

Summary and distribution: Oral LD50

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     532    2000    2695    5000   15000

## [1] -0.03073357

Summary Risk

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00    9.00   12.00   13.43   16.00   32.00

Type of pesticides

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
values<-database %>% group_by(type) %>%tally() 
print(values,n=15)# change at 150 for see all values
## # A tibble: 151 × 2
##     type     n
##    <int> <int>
##  1     1     4
##  2     2    14
##  3     3     1
##  4     5    17
##  5     6     1
##  6     7     3
##  7     8     3
##  8     9     3
##  9    10     5
## 10    11    23
## 11    12     4
## 12    13     4
## 13    14     1
## 14    15     3
## 15    16     2
## # ℹ 136 more rows

Barplot

Bar plot for type of pesticides:

## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Model

Logistic Regression:

require(ggiraph)
## Loading required package: ggiraph
require(ggiraphExtra)
## Loading required package: ggiraphExtra
require(plyr)
require(ggplot2)
fit<-glm(formula = RiskN ~ solubility_sqrt +  persistence_sqrt + ld50_sqrt, data = database_filter, family="binomial")
summary(fit)
## 
## Call:
## glm(formula = RiskN ~ solubility_sqrt + persistence_sqrt + ld50_sqrt, 
##     family = "binomial", data = database_filter)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.671320   0.417713  -4.001 6.30e-05 ***
## solubility_sqrt   0.072154   0.025275   2.855  0.00431 ** 
## persistence_sqrt  0.228718   0.057202   3.998 6.38e-05 ***
## ld50_sqrt        -0.019297   0.006662  -2.897  0.00377 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 49.352  on 238  degrees of freedom
## Residual deviance: 12.613  on 235  degrees of freedom
## AIC: 151.34
## 
## Number of Fisher Scoring iterations: 5
#Deviance analysis
dev <- fit$deviance
nullDev <- fit$null.deviance
modelChi <- nullDev - dev
modelChi
## [1] 36.73841
#p-value chi-squared test
chigl <- fit$df.null - fit$df.residual
chisq.prob <- 1 - pchisq(modelChi, chigl)
message ("P-value for chi-squared test")
## P-value for chi-squared test
chisq.prob
## [1] 5.226712e-08
#psuedo R2 equation of Hosmer & Lemeshow 
R2.hl <- modelChi/fit$null.deviance
message ("R2 of Hosmer & Lemeshow")
## R2 of Hosmer & Lemeshow
R2.hl
## [1] 0.7444225
#odd analysis
message ("ODDs: Ratio of their probabilities of occurrence to their probabilities of non-occurrence")
## ODDs: Ratio of their probabilities of occurrence to their probabilities of non-occurrence
odd_calc<-exp(cbind(OR = coef(fit), confint(fit)))
## Waiting for profiling to be done...
odd_calc
##                         OR      2.5 %    97.5 %
## (Intercept)      0.1879988 0.08039896 0.4160225
## solubility_sqrt  1.0748205 1.02316882 1.1304988
## persistence_sqrt 1.2569876 1.12545289 1.4097634
## ld50_sqrt        0.9808878 0.96782085 0.9935297

Dataset EURL database

eurl<- read.csv(file = 'EURL.csv',header=T,sep=";",dec=".", stringsAsFactors=FALSE)
eurl_filter<-na.omit(eurl)
supptable1=eurl_filter[, colnames(eurl_filter)[c(1,2,3,7)]]
#Add transformed data for independent variables
eurl_filter$solubility_sqrt<-sqrt(eurl_filter$solubility)
eurl_filter$persistence_sqrt<-sqrt(eurl_filter$persistence)
eurl_filter$ld50_sqrt<-sqrt(eurl_filter$ld)
message ("Summary Solubility:")
## Summary Solubility:
summary(eurl_filter$solubility)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       1      22   51980     584 6000000
message ("Summary Persistence:")
## Summary Persistence:
summary(eurl_filter$persistence)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.02    2.55    7.65   34.31   26.00 1000.00
message ("Summary LD50:")
## Summary LD50:
summary(eurl_filter$ld50)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.612  28.266  44.721  47.737  70.711 126.491

Predict risk pesticides and data analysis

Supplementary table 1

DT::datatable(supptable1)

Supplementary table 2

DT::datatable(supptable2)