summary of descriptive statistics
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DMwR2)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
data("algae")
algae <- as.tibble(algae)
str(algae)
summary(algae)
Based on qqplot, some observations can’t satisfy normality assumption
library(ggplot2)
theme_set(
theme_minimal() +
theme(legend.position = "top")
)
p1 <- algae %>% ggplot() + aes(x=mxPH) + geom_histogram(aes(y=..density..)) + geom_density(color="red") + geom_rug() +
ggtitle("The Histogram of mxPH (maximum pH)") + xlab("") + ylab("")
p1
library(car)
qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")
## [1] 56 57
algae %>% ggplot(aes(x = factor(0),y = oPO4)) + geom_boxplot() + geom_rug() + geom_hline(aes(yintercept=mean(algae$oPO4, na.rm = TRUE)), linetype=2,colour="red") + ylab("Orthophosphate (oPO4)") + xlab("") + scale_x_discrete(breaks=NULL)
library(forcats)
algae <- mutate(algae, size=fct_relevel(size,c("small","medium","large")),
speed=fct_relevel(speed,c("low","medium","high")),
season=fct_relevel(season,c("spring","summer","autumn","winter")))
ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() + xlab("River Size") + ylab("Algal A1")
filter(algae, !complete.cases(algae) )
apply(algae, 1, function(x) sum(is.na(x)))
manyNAs(algae, 0.2) # row locations with more than 20 percent unknown values
algae <- algae[-c(62,199),] # remove those rows
apply(algae, 2, function(x) sum(is.na(x))) # cl has 8 NA, and chla has 10 NA
# correlation matrix
library(corrplot)
## corrplot 0.84 loaded
cm <- cor(algae[, 4:18], use = "complete.obs")
corrplot(cm, type="upper",tl.pos="d")
corrplot(cm, add=TRUE, type="lower", method="number",
diag=FALSE, tl.pos="n", cl.pos="n")
lm(PO4~oPO4,data = algae) # PO4 = 1.293(oPO4) + 42.897
algae$PO4[28] <- 1.293*algae$oPO4[28] + 42.897
algae <- knnImputation(algae,k=10)
library(performanceEstimation)
DSs <- sapply(names(algae)[12:18], function(x,names.attrs) {
f <- as.formula(paste(x, "~ ."))
PredTask(f, algae[,c(names.attrs,x)], x, copy=TRUE)
},
names(algae)[1:11])
res.all <- performanceEstimation(
DSs,
c(Workflow(learner="lm", pre="knnImp", post="onlyPos"),
workflowVariants(learner="rpartXse", learner.pars=list(se=c(0,0.5,1)))),
EstimationTask(metrics="nmse" ,method=CV(nReps=5, nFolds=10)))
plot(res.all)
topPerformers(res.all)
## $a1
## Workflow Estimate
## nmse lm 0.69
##
## $a2
## Workflow Estimate
## nmse lm 0.987
##
## $a3
## Workflow Estimate
## nmse rpartXse.v3 1
##
## $a4
## Workflow Estimate
## nmse rpartXse.v2 1
##
## $a5
## Workflow Estimate
## nmse lm 0.959
##
## $a6
## Workflow Estimate
## nmse lm 0.872
##
## $a7
## Workflow Estimate
## nmse rpartXse.v2 1
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
res.all <- performanceEstimation(DSs,c(Workflow(learner="lm", pre="knnImp",post="onlyPos"),
workflowVariants(learner="rpartXse",
learner.pars=list(se=c(0,0.5,1))),
workflowVariants(learner="randomForest", pre="knnImp",
learner.pars=list(ntree=c(200,500,700)))),
EstimationTask(metrics="nmse",method=CV(nReps=5,nFolds=10)))
rankWorkflows(res.all, top=3)
## $a1
## $a1$nmse
## Workflow Estimate
## 1 randomForest.v1 0.5294489
## 2 randomForest.v2 0.5303601
## 3 randomForest.v3 0.5309474
##
##
## $a2
## $a2$nmse
## Workflow Estimate
## 1 randomForest.v2 0.7821793
## 2 randomForest.v3 0.7829396
## 3 randomForest.v1 0.7874964
##
##
## $a3
## $a3$nmse
## Workflow Estimate
## 1 randomForest.v2 0.9819369
## 2 randomForest.v3 0.9842245
## 3 randomForest.v1 0.9906091
##
##
## $a4
## $a4$nmse
## Workflow Estimate
## 1 rpartXse.v2 1.000000
## 2 rpartXse.v3 1.000000
## 3 randomForest.v2 1.023628
##
##
## $a5
## $a5$nmse
## Workflow Estimate
## 1 randomForest.v3 0.8175258
## 2 randomForest.v2 0.8190284
## 3 randomForest.v1 0.8217082
##
##
## $a6
## $a6$nmse
## Workflow Estimate
## 1 lm 0.8719455
## 2 randomForest.v2 0.9152018
## 3 randomForest.v3 0.9166955
##
##
## $a7
## $a7$nmse
## Workflow Estimate
## 1 rpartXse.v2 1.000000
## 2 rpartXse.v3 1.000000
## 3 randomForest.v2 1.039846
p <- pairedComparisons(res.all,baseline="randomForest.v3")
p$nmse$F.test
## $chi
## [1] 18.62755
##
## $FF
## [1] 4.781925
##
## $critVal
## [1] 0.6524015
##
## $rejNull
## [1] TRUE
p$nmse$BonferroniDunn.test
## $critDif
## [1] 3.046397
##
## $baseline
## [1] "randomForest.v3"
##
## $rkDifs
## lm rpartXse.v1 rpartXse.v2 rpartXse.v3 randomForest.v1
## 1.7142857 3.5714286 1.6428571 1.5000000 0.4285714
## randomForest.v2
## 0.8571429
##
## $signifDifs
## lm rpartXse.v1 rpartXse.v2 rpartXse.v3 randomForest.v1
## FALSE TRUE FALSE FALSE FALSE
## randomForest.v2
## FALSE
CDdiagram.BD(p)
# predicting on test sample
wfs <- sapply(taskNames(res.all),
function(t) topPerformer(res.all,metric="nmse",task=t))
wfs[["a1"]]
## Workflow Object:
## Workflow ID :: randomForest.v1
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> ntree=200
## learner -> randomForest
## pre -> knnImp
full.test.algae <- cbind(test.algae, algae.sols)
pts <- array(dim = c(140,7,2),
dimnames = list(1:140, paste0("a",1:7), c("trues","preds")))
for(i in 1:7) {
res <- runWorkflow(wfs[[i]],
as.formula(paste(names(wfs)[i],"~.")),
algae[,c(1:11,11+i)],
full.test.algae[,c(1:11,11+i)])
pts[,i,"trues"] <- res$trues
pts[,i,"preds"] <- res$preds
}
avg.preds <- apply(algae[,12:18], 2, mean)
apply((pts[,,"trues"] - pts[,,"preds"])^2, 2 ,sum) /
apply( (scale(pts[,,"trues"], avg.preds, FALSE))^2, 2, sum)
## a1 a2 a3 a4 a5 a6 a7
## 0.4860505 0.8837355 0.8328648 1.0000000 0.7531270 0.8337472 1.0000000