This report show the code and analyis for assesment the risk for selection of pesticides in cleaning water.
database<- read.csv(file = 'risk_def.csv',header=T,sep=",",dec=".", stringsAsFactors=FALSE)
database_filter<-database[(database$solubility<=600 & database$solubility>0 & database$persistence<=100 & database$persistence>0 & database$ld50<=16000),]
database_filter<-na.omit(database_filter)
#Rescaled de dependent variable (0-1)
normalizedR<-((database_filter$risk-min(database_filter$risk))/(max(database_filter$risk)-min(database_filter$risk)))
database_filter<-cbind(database_filter,RiskN = normalizedR)
#Add scaled data for independent variables
database_filter$solubility_scaled<-scale(database_filter$solubility)
database_filter$persistence_scaled<-scale(database_filter$persistence)
database_filter$ld50_scaled<-scale(database_filter$ld50)
#Add transformed data for independent variables
database_filter$solubility_sqrt<-sqrt(database_filter$solubility)
database_filter$persistence_sqrt<-sqrt(database_filter$persistence)
database_filter$ld50_sqrt<-sqrt(database_filter$ld50)
#Delete data once the model has been studied and outlier detected:
database_filter=database_filter[-c(23,26,70,132,204,378,467,502,534), ]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.32 4.30 56.41 38.00 596.00
## [1] 1.704678
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02 2.45 9.90 22.98 30.50 100.00
## [1] 0.7220815
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 532 2000 2695 5000 15000
## [1] -0.03073357
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 9.00 12.00 13.43 16.00 32.00
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
values<-database %>% group_by(type) %>%tally()
print(values,n=15)# change at 150 for see all values
## # A tibble: 151 × 2
## type n
## <int> <int>
## 1 1 4
## 2 2 14
## 3 3 1
## 4 5 17
## 5 6 1
## 6 7 3
## 7 8 3
## 8 9 3
## 9 10 5
## 10 11 23
## 11 12 4
## 12 13 4
## 13 14 1
## 14 15 3
## 15 16 2
## # ℹ 136 more rows
Bar plot for type of pesticides:
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
Logistic Regression:
require(ggiraph)
## Loading required package: ggiraph
require(ggiraphExtra)
## Loading required package: ggiraphExtra
require(plyr)
require(ggplot2)
fit<-glm(formula = RiskN ~ solubility_sqrt + persistence_sqrt + ld50_sqrt, data = database_filter, family="binomial")
summary(fit)
##
## Call:
## glm(formula = RiskN ~ solubility_sqrt + persistence_sqrt + ld50_sqrt,
## family = "binomial", data = database_filter)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.671320 0.417713 -4.001 6.30e-05 ***
## solubility_sqrt 0.072154 0.025275 2.855 0.00431 **
## persistence_sqrt 0.228718 0.057202 3.998 6.38e-05 ***
## ld50_sqrt -0.019297 0.006662 -2.897 0.00377 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 49.352 on 238 degrees of freedom
## Residual deviance: 12.613 on 235 degrees of freedom
## AIC: 151.34
##
## Number of Fisher Scoring iterations: 5
#Deviance analysis
dev <- fit$deviance
nullDev <- fit$null.deviance
modelChi <- nullDev - dev
modelChi
## [1] 36.73841
#p-value chi-squared test
chigl <- fit$df.null - fit$df.residual
chisq.prob <- 1 - pchisq(modelChi, chigl)
message ("P-value for chi-squared test")
## P-value for chi-squared test
chisq.prob
## [1] 5.226712e-08
#psuedo R2 equation of Hosmer & Lemeshow
R2.hl <- modelChi/fit$null.deviance
message ("R2 of Hosmer & Lemeshow")
## R2 of Hosmer & Lemeshow
R2.hl
## [1] 0.7444225
#odd analysis
message ("ODDs: Ratio of their probabilities of occurrence to their probabilities of non-occurrence")
## ODDs: Ratio of their probabilities of occurrence to their probabilities of non-occurrence
odd_calc<-exp(cbind(OR = coef(fit), confint(fit)))
## Waiting for profiling to be done...
odd_calc
## OR 2.5 % 97.5 %
## (Intercept) 0.1879988 0.08039896 0.4160225
## solubility_sqrt 1.0748205 1.02316882 1.1304988
## persistence_sqrt 1.2569876 1.12545289 1.4097634
## ld50_sqrt 0.9808878 0.96782085 0.9935297
eurl<- read.csv(file = 'EURL.csv',header=T,sep=";",dec=".", stringsAsFactors=FALSE)
eurl_filter<-na.omit(eurl)
supptable1=eurl_filter[, colnames(eurl_filter)[c(1,2,3,7)]]
#Add transformed data for independent variables
eurl_filter$solubility_sqrt<-sqrt(eurl_filter$solubility)
eurl_filter$persistence_sqrt<-sqrt(eurl_filter$persistence)
eurl_filter$ld50_sqrt<-sqrt(eurl_filter$ld)
message ("Summary Solubility:")
## Summary Solubility:
summary(eurl_filter$solubility)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1 22 51980 584 6000000
message ("Summary Persistence:")
## Summary Persistence:
summary(eurl_filter$persistence)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02 2.55 7.65 34.31 26.00 1000.00
message ("Summary LD50:")
## Summary LD50:
summary(eurl_filter$ld50)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.612 28.266 44.721 47.737 70.711 126.491
DT::datatable(supptable1)
DT::datatable(supptable2)