library("simPop")## 載入需要的套件:lattice
## 載入需要的套件:vcd
## 載入需要的套件:grid
## Package simPop 1.2.1 has been loaded!
## Since simPop does explicit parallelization,
## the number of data.table threads is set to 1.
set.seed(1234)data("eusilcS", package = "simPop")
origData <- eusilcS
dim(origData)## [1] 11725 18
head(origData)## db030 hsize db040 age rb090 pl030 pb220a netIncome py010n
## 9292 1 2 Salzburg 72 male 5 AT 22675.48 0.00
## 9293 1 2 Salzburg 66 female 5 AT 16999.29 0.00
## 7227 2 1 Upper Austria 56 female 2 AT 19274.21 19274.21
## 5275 3 1 Styria 67 female 5 AT 13319.13 0.00
## 7866 4 3 Upper Austria 70 female 5 AT 14365.57 0.00
## 7867 4 3 Upper Austria 46 male 3 AT 0.00 0.00
## py050n py090n py100n py110n py120n py130n py140n db090 rb050
## 9292 0 0 22675.48 0 0 0.00 0 7.822929 7.822929
## 9293 0 0 0.00 0 0 16999.29 0 7.822929 7.822929
## 7227 0 0 0.00 0 0 0.00 0 8.788089 8.788089
## 5275 0 0 13319.13 0 0 0.00 0 8.108452 8.108452
## 7866 0 0 14365.57 0 0 0.00 0 7.509383 7.509383
## 7867 0 0 0.00 0 0 0.00 0 7.509383 7.509383
View(origData)str(origData)## 'data.frame': 11725 obs. of 18 variables:
## $ db030 : int 1 1 2 3 4 4 4 5 5 5 ...
## $ hsize : int 2 2 1 1 3 3 3 5 5 5 ...
## $ db040 : Factor w/ 9 levels "Burgenland","Carinthia",..: 4 4 7 5 7 7 7 4 4 4 ...
## $ age : int 72 66 56 67 70 46 37 41 35 9 ...
## $ rb090 : Factor w/ 2 levels "male","female": 1 2 2 2 2 1 1 1 2 2 ...
## $ pl030 : Factor w/ 7 levels "1","2","3","4",..: 5 5 2 5 5 3 1 1 3 NA ...
## $ pb220a : Factor w/ 3 levels "AT","EU","Other": 1 1 1 1 1 1 3 1 1 NA ...
## $ netIncome: num 22675 16999 19274 13319 14366 ...
## $ py010n : num 0 0 19274 0 0 ...
## $ py050n : num 0 0 0 0 0 ...
## $ py090n : num 0 0 0 0 0 ...
## $ py100n : num 22675 0 0 13319 14366 ...
## $ py110n : num 0 0 0 0 0 0 0 0 0 NA ...
## $ py120n : num 0 0 0 0 0 0 0 0 0 NA ...
## $ py130n : num 0 16999 0 0 0 ...
## $ py140n : num 0 0 0 0 0 0 0 0 0 NA ...
## $ db090 : num 7.82 7.82 8.79 8.11 7.51 ...
## $ rb050 : num 7.82 7.82 8.79 8.11 7.51 ...
length(unique(origData$db030))## [1] 4641
View(origData)#創建一個類“dataObj”的對象,使用specifyInput()存入
inp <- specifyInput(origData, hhid ="db030", hhsize = "hsize",strata="db040",
weight = "rb050")#顯示此“dataObj”對象的內容摘要
print(inp)##
## --------------
## survey sample of size 11725 x 19
##
## Selected important variables:
##
## household ID: db030
## personal ID: pid
## variable household size: hsize
## sampling weight: rb050
## strata: db040
## --------------
#two-dimensional table
data("totalsRG", package = "simPop")
print(totalsRG)## rb090 db040 Freq
## 1 male Burgenland 140436
## 2 female Burgenland 146980
## 3 male Carinthia 270084
## 4 female Carinthia 285797
## 5 male Lower Austria 797398
## 6 female Lower Austria 828087
## 7 male Salzburg 702539
## 8 female Salzburg 722883
## 9 male Styria 259595
## 10 female Styria 274675
## 11 male Tyrol 595842
## 12 female Tyrol 619404
## 13 male Upper Austria 353910
## 14 female Upper Austria 368128
## 15 male Vienna 850596
## 16 female Vienna 916150
## 17 male Vorarlberg 184939
## 18 female Vorarlberg 190343
#n-dimensional table
data("totalsRGtab", package = "simPop")
print(totalsRGtab)## db040
## rb090 Burgenland Carinthia Lower Austria Salzburg Styria Tyrol
## female 146980 285797 828087 722883 274675 619404
## male 140436 270084 797398 702539 259595 595842
## db040
## rb090 Upper Austria Vienna Vorarlberg
## female 368128 916150 190343
## male 353910 850596 184939
totalsRG$Freq <- totalsRG$Freq / 100
totalsRGtab <- totalsRGtab / 100
weights.df <- calibSample(inp, totalsRG)
weights.tab <- calibSample(inp, totalsRGtab)
identical(weights.df, weights.tab)## [1] TRUE
#函數將生成的校準權重添加到“simPopObj”inp添加權重()使用calibSample()
addWeights(inp) <- calibSample(inp, totalsRGtab)synthP <- simStructure(data = inp, method = "direct",basicHHvars = c("age", "rb090", "db040"))synthP##
## --------------
## synthetic population of size
## 85057 x 7
##
## build from a sample of size
## 11725 x 19
## --------------
##
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight
synthP <- simCategorical(synthP, additional = c("pl030", "pb220a"), method = "multinom", nr_cpus = 1)
print(synthP)##
## --------------
## synthetic population of size
## 85057 x 9
##
## build from a sample of size
## 11725 x 19
## --------------
##
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight,pl030,pb220a
#人口普查
censusInfo <- eusilcS[, c("age", "rb090", "pl030")]head(censusInfo)## age rb090 pl030
## 9292 72 male 5
## 9293 66 female 5
## 7227 56 female 2
## 5275 67 female 5
## 7866 70 female 5
## 7867 46 male 3
#pl030為工作狀態,有7種總類
stat <- as.numeric(levels(censusInfo$pl030))
stat## [1] 1 2 3 4 5 6 7
#年齡和性別為條件生成這些類別的條件概率
tab1 <- prop.table(table(censusInfo))probs <- reshape2::dcast(as.data.frame(tab1),
formula = rb090 + age ~ pl030, value.var = "Freq")pop <- subset(eusilcS, eusilcS$db030 == 1, select = c("age", "rb090"))
pop## age rb090
## 9292 72 male
## 9293 66 female
pacman::p_load(data.table, magrittr)#給定年齡和 rb090 以及已知概率的經濟狀況進行抽樣,合併變量 age 和 rb090 上的對象 pop 和 probs。
pop <- merge(pop, probs, all.x = TRUE)pop$status <- sapply(1:nrow(pop), function(x) {
pp <- pop[x, -c(1:2)]
ifelse(all(pp == 0), NA, sample(stat, 1, prob = pp))
})pop <- pop[, c("age", "rb090", "status")]
pop## age rb090 status
## 1 66 female 7
## 2 72 male 5
data("eusilcS", package = "simPop")
inp <- specifyInput(data = eusilcS, hhid = "db030", hhsize = "hsize",
strata = "db040", weight = "db090")synthRec <- simStructure(data = inp, method = "direct",
basicHHvars = c("age", "rb090"))
synthRec <- simCategorical(synthRec, additional = "pl030",
method = "distribution", nr_cpus = 1)## [1] "age" "rb090"
synthRec <- simStructure(data = inp, method = "direct",
basicHHvars = c("age", "rb090"))synthRec <- simCategorical(synthRec, additional = "pl030",
method = "distribution", nr_cpus = 1)## [1] "age" "rb090"
使用 simContinuous() 生成連續變量。在應用該功能之前重,基本的家庭結構和可能的其他分類預測因素一定是分別使用 simStructure() 和 simCategorical() 進行模擬。類的對象’simPopObj’ 作為輸入提供。
synthP <- simContinuous(synthP, additional = "netIncome", upper = 200000,
equidist = FALSE, imputeMissings = FALSE, nr_cpus = 1)ageinc <- pop(synthP, var = c("age", "netIncome"))
ageinc$age <- as.numeric(as.character(ageinc$age))#在回歸模型估計中不考慮 16 歲以下的人口16歲及以上人口的淨收入
ageinc[age < 16, netIncome := NA]
pop(synthP, var = "netIncome") <- ageinc$netIncome#也可以使用以下,使用殘差的隨機抽取 synthP <- simContinuous(synthP, additional = “netIncome”, method = “lm”, nr_cpus = 1)
在住戶調查中,一般不收集個人淨收入的信息,而是從其他變量(例如,從按來源收集的收入信息)得出的信息。 合成生成的連續變量可以使用函數 simComponents() 按其組件分解。
sIncome <- manageSimPopObj(synthP, var = "netIncome", sample = TRUE)
sWeight <- manageSimPopObj(synthP, var = "rb050", sample = TRUE)
pIncome <- manageSimPopObj(synthP, var = "netIncome")
breaks <- getBreaks(x = sIncome, w = sWeight, upper = Inf,equidist = FALSE)
synthP <- manageSimPopObj(synthP, var = "netIncomeCat",sample = TRUE, set = TRUE,
values = getCat(x = sIncome, breaks))
synthP <- manageSimPopObj(synthP, var = "netIncomeCat",sample = FALSE, set = TRUE,
values = getCat(x = pIncome, breaks))# simulate net income components
synthP <- simComponents(simPopObj = synthP, total = "netIncome",
components = c("py010n", "py050n", "py090n","py100n","py110n",
"py120n", "py130n", "py140n"),
conditional =c("netIncomeCat", "pl030"),
replaceEmpty = "sequential", seed = 1)synthP ##
## --------------
## synthetic population of size
## 85057 x 19
##
## build from a sample of size
## 11725 x 20
## --------------
##
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight,pl030,pb220a,netIncomeCat,netIncome,py010n,py050n,py090n,py100n,py110n,py120n,py130n,py140n
#使用函數 simInitSpatial() 模擬較小的區域
simulate_districts <- function(inp) {
hhid <- "db030"
region <- "db040"
a <- inp[!duplicated(inp[, hhid]), c(hhid, region)]
spl <- split(a, a[, region])
regions <- unique(inp[, region])
tmpres <- lapply(1:length(spl), function(x) {
codes <- paste(x, 1:sample(10:90, 1), sep = "")
spl[[x]]$district <- sample(codes, nrow(spl[[x]]), replace = TRUE)
spl[[x]]})
tmpres <- do.call("rbind", tmpres)
tmpres <- tmpres[, -2]
out <- merge(inp, tmpres, by.x = hhid, by.y = hhid, all.x = TRUE)
invisible(out)
}data("eusilcS", package = "simPop")
census <- simulate_districts(eusilcS)
head(table(census$district))##
## 11 110 111 112 113 114
## 11 1 5 1 13 2
#生成了兩個表,一個包含按地區劃分的人數和其他與按地區分的戶籍人口
tabHH <- as.data.frame(xtabs(rb050 ~ db040 + district,
data = census[!duplicated(census$db030), ]))
tabP <- as.data.frame(xtabs(rb050 ~ db040 + district, data = census))
colnames(tabP) <- colnames(tabHH) <- c("db040", "district", "Freq")函數 simInitSpatial() 需要一個“simPopObj”類的對像作為輸入。
synthP <- simInitSpatial(synthP, additional = "district",
region = "db040",
tspatialHH = tabHH, tspatialP = tabP)
head(popData(synthP), 2)## db030 hsize age rb090 db040 pid weight pl030 pb220a
## 1: 1 1 46 male Burgenland 1.1 1 3 AT
## 2: 2 1 78 female Burgenland 2.1 1 5 AT
## netIncomeCat netIncome py010n py050n py090n py100n py110n py120n
## 1: (2.89e+04,3.52e+04] 33579.265 33579.27 0 0 0.000 0 0
## 2: (5.05e+03,8.4e+03] 7692.078 0.00 0 0 7692.078 0 0
## py130n py140n district
## 1: 0 0 134
## 2: 0 0 150
census <- simStructure(data = inp, method = "direct",
basicHHvars = c("age", "rb090", "db040"))
census <- simCategorical(census, additional = c("pl030", "pb220a"),
method = "multinom", nr_cpus = 1)#these margins to our object synthP
census <- data.frame(popData(census))
margins <- as.data.frame(xtabs(~ db040 + rb090 + pl030, data = census))
margins$Freq <- as.numeric(margins$Freq)
synthP <- addKnownMargins(synthP, margins)使用 SA 和 calibPop() 校準到這些邊距 calibPop()對(接近)最優的跌代搜索住戶的組合以填充地理區域
synthPadj <- calibPop(synthP, split = "db040", temp = 1,
eps.factor = 0.00005, maxiter = 200, temp.cooldown = 0.975,
factor.cooldown = 0.85, min.temp = 0.001, verbose = TRUE, nr_cpus = 1)## Starting simulated Annealing for Burgenland
## Cooldown number 10
## Cooldown number 20
## Convergence NOT successfull for Burgenland
## Starting simulated Annealing for Carinthia
## Cooldown number 10
## Convergence NOT successfull for Carinthia
## Starting simulated Annealing for Lower Austria
## Cooldown number 10
## Cooldown number 20
## Convergence NOT successfull for Lower Austria
## Starting simulated Annealing for Salzburg
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Cooldown number 40
## Convergence NOT successfull for Salzburg
## Starting simulated Annealing for Styria
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Styria
## Starting simulated Annealing for Tyrol
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Tyrol
## Starting simulated Annealing for Upper Austria
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Upper Austria
## Starting simulated Annealing for Vienna
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Vienna
## Starting simulated Annealing for Vorarlberg
## Cooldown number 10
## Convergence NOT successfull for Vorarlberg
pop <- data.frame(popData(synthP))
popadj <- data.frame(popData(synthPadj))tab.census <- ftable(census[, c("rb090", "db040", "pl030")])
tab_afterSA <- ftable(popadj[, c("rb090", "db040", "pl030")])
tab.census - tab_afterSA## pl030 1 2 3 4 5 6 7
## rb090 db040
## male Burgenland 0 0 0 0 -2 0 0
## Carinthia -1 0 0 0 0 0 0
## Lower Austria 0 0 0 1 0 0 1
## Salzburg 0 0 1 0 0 0 0
## Styria 0 0 0 0 2 0 0
## Tyrol 0 -1 0 0 0 0 0
## Upper Austria 0 0 0 -1 1 0 0
## Vienna 0 0 0 0 0 0 0
## Vorarlberg 0 0 0 0 1 0 0
## female Burgenland 2 0 1 0 -1 0 0
## Carinthia 0 -1 0 0 0 1 0
## Lower Austria 0 2 0 0 0 0 0
## Salzburg 1 -1 0 0 0 0 0
## Styria 0 0 0 0 0 0 0
## Tyrol 0 0 0 0 -1 1 0
## Upper Austria 0 0 0 0 0 0 0
## Vienna -1 0 0 0 -1 0 0
## Vorarlberg 0 0 1 0 0 0 -1
模擬分類變量的多元結構是通過以下方式評估的圖形比較
tab <- spTable(synthP, select = c("rb090", "db040", "hsize"))
spMosaic(tab, labeling = labeling_border(abbreviate = c(db040 = TRUE)))tab <- spTable(synthP, select = c("rb090", "pl030"))
spMosaic(tab, method = "color")spCdfplot(synthP, "netIncome", cond = "rb090", layout = c(1, 2)) spBwplot(synthP, x = "netIncome", cond = "rb090", layout = c(1, 2))區域的樣本和總體淨收入的 CDF (db040)。幾乎完美的配合。
spCdfplot(synthP, "netIncome", cond = "db040", layout = c(3, 3))## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation