POPTOT (total population), HApoly (municipality area),
Surfacesbois (wood area), Airbat (area with buildings)
library(SamplingStrata)
# Load the Swiss municipalities data and keep a subset of columns, chosen by
# position.
# NOTE(review): the positional selection presumably keeps the id, REG, HApoly,
# Surfacesbois, Surfacescult and POPTOT columns used below -- confirm against
# the installed version of the dataset.
data("swissmunicipalities")
df <- swissmunicipalities[, c(1, 3, 6:8, 23)]
# df$DOM <- 1

# Discretise HApoly and POPTOT with var.bin(..., 15); the resulting columns are
# the stratification (X) variables of the frame.
df$HApoly.cat <- var.bin(df$HApoly, 15)
df$POPTOT.cat <- var.bin(df$POPTOT, 15)

# Sampling frame: two categorical X variables, two target (Y) variables, with
# REG as the domain.
frame <- buildFrameDF(
  df = df,
  id = "id",
  X = c("HApoly.cat", "POPTOT.cat"),
  Y = c("Surfacesbois", "Surfacescult"),
  domainvalue = "REG"
)
strata <- buildStrataDF(frame, progress = FALSE, verbose = FALSE)

# Precision constraints: one row per domain, CV of 0.05 on both Y variables.
ndom <- length(unique(frame$domainvalue))
cv <- data.frame(
  DOM = rep("DOM1", ndom),
  CV1 = rep(0.05, ndom),
  CV2 = rep(0.05, ndom),
  domainvalue = 1:ndom
)
cv
## DOM CV1 CV2 domainvalue
## 1 DOM1 0.05 0.05 1
## 2 DOM1 0.05 0.05 2
## 3 DOM1 0.05 0.05 3
## 4 DOM1 0.05 0.05 4
## 5 DOM1 0.05 0.05 5
## 6 DOM1 0.05 0.05 6
## 7 DOM1 0.05 0.05 7
library(hEDA)

# Domain labels found in the strata table, and how many of them there are.
dom <- unique(strata$DOM1)
ndom <- length(dom)

# K-means initial solution: at least 2 units per stratum, at most 20 clusters
# per domain, plotting suppressed.
Kmean <- SamplingStrata::KmeansSolution(
  strata,
  errors = cv,
  nstrata = NA,
  minnumstrat = 2,
  maxclusters = 20,
  showPlot = FALSE
)
##
## -------------------
## Kmeans solution
## -------------------
## *** Domain: 1 ***
## Number of strata: 18
## Sample size : 80
## *** Domain: 2 ***
## Number of strata: 12
## Sample size : 57
## *** Domain: 3 ***
## Number of strata: 12
## Sample size : 38
## *** Domain: 4 ***
## Number of strata: 11
## Sample size : 28
## *** Domain: 5 ***
## Number of strata: 14
## Sample size : 66
## *** Domain: 6 ***
## Number of strata: 13
## Sample size : 34
## *** Domain: 7 ***
## Number of strata: 17
## Sample size : 91
## -------------------
## Total size: 394
## -------------------
# Number of distinct strata suggested by the k-means solution in each domain.
# vapply() builds a typed integer vector in one step instead of growing
# `nstrata` from NULL, and seq_along(dom) does not iterate over c(1, 0) when
# there are no domains (as 1:ndom would).
nstrata <- vapply(
  seq_along(dom),
  function(i) {
    in_domain <- which(Kmean$domainvalue == dom[i])
    length(table(Kmean$suggestions[in_domain]))
  },
  integer(1)
)
nstrata
## [1] 18 12 12 11 14 13 17
Source: https://link.springer.com/referenceworkentry/10.1007/978-0-387-30164-8_426 - The K-means clustering algorithm is sensitive to outliers, because a mean is easily influenced by extreme values. - K-medoids clustering is a variant of K-means that is more robust to noise and outliers. - Instead of using the mean point as the center of a cluster, K-medoids uses an actual point in the cluster to represent it. - The medoid is the most centrally located object of the cluster, i.e. the one with the minimum sum of distances to the other points. - As an alternative to pam, consider using the median. - The mean is greatly influenced by outliers and thus cannot represent the correct cluster center, while the medoid is robust to outliers and correctly represents the cluster center.
library(hEDA)
library(cluster)

# K-medoids initial solution, called with the same settings as the k-means
# one: at least 2 units per stratum, at most 20 clusters, plotting suppressed.
Kmedoids <- kmedoidsSolution(
  strata,
  errors = cv,
  nstrata = NA,
  minnumstrat = 2,
  maxclusters = 20,
  showPlot = FALSE
)

# Number of distinct strata suggested by k-medoids in each domain.
# vapply() replaces growing the vector from NULL, and seq_along(dom) is safe
# when there are no domains (1:ndom would iterate over c(1, 0)).
Kmedoidsnstrata <- vapply(
  seq_along(dom),
  function(i) {
    in_domain <- which(Kmedoids$domainvalue == dom[i])
    length(table(Kmedoids$suggestions[in_domain]))
  },
  integer(1)
)
Kmedoidsnstrata
## DOM CV1 CV2 domainvalue
## 1 DOM1 0.05 0.05 1
## 2 DOM1 0.05 0.05 2
## 3 DOM1 0.05 0.05 3
## 4 DOM1 0.05 0.05 4
## 5 DOM1 0.05 0.05 5
## 6 DOM1 0.05 0.05 6
## 7 DOM1 0.05 0.05 7
##
## -----------------
##
## Kmedoids solution
## -----------------
## *** Domain: 1 ***
## Number of strata: 20
## Sample size : 81
## *** Domain: 2 ***
## Number of strata: 16
## Sample size : 62
## *** Domain: 3 ***
## Number of strata: 12
## Sample size : 39
## *** Domain: 4 ***
## Number of strata: 14
## Sample size : 29
## *** Domain: 5 ***
## Number of strata: 20
## Sample size : 68
## *** Domain: 6 ***
## Number of strata: 15
## Sample size : 33
## *** Domain: 7 ***
## Number of strata: 20
## Sample size : 94
## Total Sample size : 406
## [1] 20 16 12 14 20 15 20
library(hEDA)

# Fuzzy-clustering initial solution, between 2 and 20 clusters.
# The script treats fuzzy[[1]] as the per-domain stratum labels and fuzzy[[2]]
# as the per-domain sample sizes.
fuzzy <- fuzzySolution(
  strata,
  cv,
  minClusters = 2,
  maxclusters = 20
)

# sample size
sum(fuzzy[[2]])

# adapt so it can be used in hEDA: reuse the layout of the k-means solution
# and overwrite its suggestions, domain by domain, with the fuzzy labels.
# NOTE(review): assumes the i-th element of fuzzy[[1]] has one label per row
# of domain dom[i], in the same row order as Kmean -- confirm.
fuzzySol <- Kmean
for (i in seq_along(dom)) {
  fuzzySol$suggestions[which(fuzzySol$domainvalue == dom[i])] <-
    unlist(fuzzy[[1]][i])
}

# Number of distinct strata in the fuzzy solution of each domain.
# vapply() replaces growing the vector from NULL; seq_along(dom) avoids the
# c(1, 0) iteration that 1:ndom produces when there are no domains.
fuzzynstrata <- vapply(
  seq_along(dom),
  function(i) length(table(fuzzy[[1]][[i]])),
  integer(1)
)
fuzzynstrata
## [1] 349.6996
## [1] 14 13 7 10 18 11 13
Temp=0.0001; rate of accepting inferior solutions
decrement_constant=0.95; decay of Temperature
jsize=5; Number of Sequences
length_of_markov_chain =50; length of sequence
SAAiters=5; Number of iterations at which the SAA is run
popSize = 20; Population of solutions size
iters = 5; Number of hEDA solutions
mutationChance = 0.01; Mutation chance
elitism = 0.1; Elitism rate
EDAfreq=1; frequency of EDA
kmax_percent=0.025; rate of accepting large perturbations in first sequence
ProbNewStratum=0.0001; Probability of creating new stratum
# Run the hybrid EDA in parallel, seeded with the fuzzy suggestions, and time
# it.
# NOTE(review): initialStrata is the k-means strata count (`nstrata`) while the
# suggestions come from the fuzzy solution; check whether `fuzzynstrata` was
# intended here.
ptm <- proc.time()
outpar <- hEDA::parallelhEDA(
  strata, cv, fuzzySol,
  Temp = 0.0001,                 # rate of accepting inferior solutions
  initialStrata = nstrata,
  decrement_constant = 0.95,     # decay of temperature
  end_time = Inf,
  jsize = 5,                     # number of sequences
  length_of_markov_chain = 50,   # length of sequence
  SAArun = TRUE,
  SAAiters = 5,                  # iterations at which the SAA is run
  popSize = 20,                  # population of solutions size
  iters = 5,                     # number of hEDA solutions
  mutationChance = 0.01,
  elitism = 0.1,
  addStrataFactor = 0.000001,
  EDAfreq = 1,                   # frequency of EDA
  verbose = FALSE,
  dominio = dom,
  minnumstrat = 2,
  kmax_percent = 0.025,          # large perturbations in first sequence
  ProbNewStratum = 0.0001,       # probability of creating a new stratum
  strcens = FALSE,
  writeFiles = FALSE,
  showPlot = TRUE,
  minTemp = 0.000005,
  realAllocation = TRUE
)
Time <- proc.time() - ptm

# Total sample size over all domains, followed by the elapsed time.
sum(unlist(outpar$SampleSize))
Time
# Run the SamplingStrata genetic algorithm from the same fuzzy suggestions, for
# comparison, and time it.
# NOTE(review): initialStrata is the k-means strata count (`nstrata`) while the
# suggestions come from the fuzzy solution, and addStrataFactor is 0; check
# whether `fuzzynstrata` was intended here.
# writeFiles = TRUE is passed, so expect output files to be written --
# presumably to the working directory; confirm.
ptm <- proc.time()
solutionGGA <- optimizeStrata(
  errors = cv,
  strata,
  cens = NULL,
  strcens = FALSE,
  alldomains = TRUE,
  dom = NULL,
  initialStrata = nstrata,
  addStrataFactor = 0,
  minnumstr = 2,
  iter = 100,
  pops = 20,
  mut_chance = NA,
  elitism_rate = 0.2,
  highvalue = 1e+08,
  suggestions = fuzzySol,
  realAllocation = TRUE,
  writeFiles = TRUE,
  showPlot = TRUE,
  parallel = TRUE,
  cores = NA
)
Time <- proc.time() - ptm
Time
Bezdek, J. C. (1981). Objective function clustering. In Pattern recognition with fuzzy objective function algorithms, pp. 43–93. Springer.
Klawonn, F. (2004). Fuzzy clustering: Insights and a new approach. Mathware & soft computing. 2004 Vol. 11 Núm. 3.
Morissette, L. and S. Chartier (2013). The k-means clustering technique: General considerations and implementation in Mathematica. Tutorials in Quantitative Methods for Psychology 9(1), 15–24.