library("simPop")
## 載入需要的套件:lattice
## 載入需要的套件:vcd
## 載入需要的套件:grid
## Package simPop 1.2.1 has been loaded!
## Since simPop does explicit parallelization,
## the number of data.table threads is set to 1.
set.seed(1234)
data("eusilcS", package = "simPop")
<- eusilcS
origData dim(origData)
## [1] 11725 18
head(origData)
## db030 hsize db040 age rb090 pl030 pb220a netIncome py010n
## 9292 1 2 Salzburg 72 male 5 AT 22675.48 0.00
## 9293 1 2 Salzburg 66 female 5 AT 16999.29 0.00
## 7227 2 1 Upper Austria 56 female 2 AT 19274.21 19274.21
## 5275 3 1 Styria 67 female 5 AT 13319.13 0.00
## 7866 4 3 Upper Austria 70 female 5 AT 14365.57 0.00
## 7867 4 3 Upper Austria 46 male 3 AT 0.00 0.00
## py050n py090n py100n py110n py120n py130n py140n db090 rb050
## 9292 0 0 22675.48 0 0 0.00 0 7.822929 7.822929
## 9293 0 0 0.00 0 0 16999.29 0 7.822929 7.822929
## 7227 0 0 0.00 0 0 0.00 0 8.788089 8.788089
## 5275 0 0 13319.13 0 0 0.00 0 8.108452 8.108452
## 7866 0 0 14365.57 0 0 0.00 0 7.509383 7.509383
## 7867 0 0 0.00 0 0 0.00 0 7.509383 7.509383
View(origData)
str(origData)
## 'data.frame': 11725 obs. of 18 variables:
## $ db030 : int 1 1 2 3 4 4 4 5 5 5 ...
## $ hsize : int 2 2 1 1 3 3 3 5 5 5 ...
## $ db040 : Factor w/ 9 levels "Burgenland","Carinthia",..: 4 4 7 5 7 7 7 4 4 4 ...
## $ age : int 72 66 56 67 70 46 37 41 35 9 ...
## $ rb090 : Factor w/ 2 levels "male","female": 1 2 2 2 2 1 1 1 2 2 ...
## $ pl030 : Factor w/ 7 levels "1","2","3","4",..: 5 5 2 5 5 3 1 1 3 NA ...
## $ pb220a : Factor w/ 3 levels "AT","EU","Other": 1 1 1 1 1 1 3 1 1 NA ...
## $ netIncome: num 22675 16999 19274 13319 14366 ...
## $ py010n : num 0 0 19274 0 0 ...
## $ py050n : num 0 0 0 0 0 ...
## $ py090n : num 0 0 0 0 0 ...
## $ py100n : num 22675 0 0 13319 14366 ...
## $ py110n : num 0 0 0 0 0 0 0 0 0 NA ...
## $ py120n : num 0 0 0 0 0 0 0 0 0 NA ...
## $ py130n : num 0 16999 0 0 0 ...
## $ py140n : num 0 0 0 0 0 0 0 0 0 NA ...
## $ db090 : num 7.82 7.82 8.79 8.11 7.51 ...
## $ rb050 : num 7.82 7.82 8.79 8.11 7.51 ...
length(unique(origData$db030))
## [1] 4641
View(origData)
#創建一個類“dataObj”的對象,使用specifyInput()存入
<- specifyInput(origData, hhid ="db030", hhsize = "hsize",strata="db040",
inp weight = "rb050")
#顯示此“dataObj”對象的內容摘要
print(inp)
##
## --------------
## survey sample of size 11725 x 19
##
## Selected important variables:
##
## household ID: db030
## personal ID: pid
## variable household size: hsize
## sampling weight: rb050
## strata: db040
## --------------
#two-dimensional table
data("totalsRG", package = "simPop")
print(totalsRG)
## rb090 db040 Freq
## 1 male Burgenland 140436
## 2 female Burgenland 146980
## 3 male Carinthia 270084
## 4 female Carinthia 285797
## 5 male Lower Austria 797398
## 6 female Lower Austria 828087
## 7 male Salzburg 702539
## 8 female Salzburg 722883
## 9 male Styria 259595
## 10 female Styria 274675
## 11 male Tyrol 595842
## 12 female Tyrol 619404
## 13 male Upper Austria 353910
## 14 female Upper Austria 368128
## 15 male Vienna 850596
## 16 female Vienna 916150
## 17 male Vorarlberg 184939
## 18 female Vorarlberg 190343
#n-dimensional table
data("totalsRGtab", package = "simPop")
print(totalsRGtab)
## db040
## rb090 Burgenland Carinthia Lower Austria Salzburg Styria Tyrol
## female 146980 285797 828087 722883 274675 619404
## male 140436 270084 797398 702539 259595 595842
## db040
## rb090 Upper Austria Vienna Vorarlberg
## female 368128 916150 190343
## male 353910 850596 184939
$Freq <- totalsRG$Freq / 100
totalsRG<- totalsRGtab / 100
totalsRGtab <- calibSample(inp, totalsRG)
weights.df <- calibSample(inp, totalsRGtab)
weights.tab identical(weights.df, weights.tab)
## [1] TRUE
#函數將生成的校準權重添加到“simPopObj”inp添加權重()使用calibSample()
addWeights(inp) <- calibSample(inp, totalsRGtab)
<- simStructure(data = inp, method = "direct",basicHHvars = c("age", "rb090", "db040")) synthP
synthP
##
## --------------
## synthetic population of size
## 85057 x 7
##
## build from a sample of size
## 11725 x 19
## --------------
##
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight
<- simCategorical(synthP, additional = c("pl030", "pb220a"), method = "multinom", nr_cpus = 1)
synthP
print(synthP)
##
## --------------
## synthetic population of size
## 85057 x 9
##
## build from a sample of size
## 11725 x 19
## --------------
##
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight,pl030,pb220a
#人口普查
<- eusilcS[, c("age", "rb090", "pl030")] censusInfo
head(censusInfo)
## age rb090 pl030
## 9292 72 male 5
## 9293 66 female 5
## 7227 56 female 2
## 5275 67 female 5
## 7866 70 female 5
## 7867 46 male 3
#pl030為工作狀態,有7種總類
<- as.numeric(levels(censusInfo$pl030))
stat stat
## [1] 1 2 3 4 5 6 7
#年齡和性別為條件生成這些類別的條件概率
<- prop.table(table(censusInfo)) tab1
<- reshape2::dcast(as.data.frame(tab1),
probs formula = rb090 + age ~ pl030, value.var = "Freq")
<- subset(eusilcS, eusilcS$db030 == 1, select = c("age", "rb090"))
pop pop
## age rb090
## 9292 72 male
## 9293 66 female
::p_load(data.table, magrittr) pacman
#給定年齡和 rb090 以及已知概率的經濟狀況進行抽樣,合併變量 age 和 rb090 上的對象 pop 和 probs。
<- merge(pop, probs, all.x = TRUE) pop
$status <- sapply(1:nrow(pop), function(x) {
pop<- pop[x, -c(1:2)]
pp ifelse(all(pp == 0), NA, sample(stat, 1, prob = pp))
})
<- pop[, c("age", "rb090", "status")]
pop pop
## age rb090 status
## 1 66 female 7
## 2 72 male 5
data("eusilcS", package = "simPop")
<- specifyInput(data = eusilcS, hhid = "db030", hhsize = "hsize",
inp strata = "db040", weight = "db090")
<- simStructure(data = inp, method = "direct",
synthRec basicHHvars = c("age", "rb090"))
<- simCategorical(synthRec, additional = "pl030",
synthRec method = "distribution", nr_cpus = 1)
## [1] "age" "rb090"
<- simStructure(data = inp, method = "direct",
synthRec basicHHvars = c("age", "rb090"))
<- simCategorical(synthRec, additional = "pl030",
synthRec method = "distribution", nr_cpus = 1)
## [1] "age" "rb090"
使用 simContinuous() 生成連續變量。在應用該功能之前重,基本的家庭結構和可能的其他分類預測因素一定是分別使用 simStructure() 和 simCategorical() 進行模擬。類的對象’simPopObj’ 作為輸入提供。
<- simContinuous(synthP, additional = "netIncome", upper = 200000,
synthP equidist = FALSE, imputeMissings = FALSE, nr_cpus = 1)
<- pop(synthP, var = c("age", "netIncome"))
ageinc $age <- as.numeric(as.character(ageinc$age)) ageinc
#在回歸模型估計中不考慮 16 歲以下的人口16歲及以上人口的淨收入
< 16, netIncome := NA]
ageinc[age pop(synthP, var = "netIncome") <- ageinc$netIncome
#也可以使用以下,使用殘差的隨機抽取 synthP <- simContinuous(synthP, additional = “netIncome”, method = “lm”, nr_cpus = 1)
在住戶調查中,一般不收集個人淨收入的信息,而是從其他變量(例如,從按來源收集的收入信息)得出的信息。 合成生成的連續變量可以使用函數 simComponents() 按其組件分解。
<- manageSimPopObj(synthP, var = "netIncome", sample = TRUE)
sIncome <- manageSimPopObj(synthP, var = "rb050", sample = TRUE)
sWeight <- manageSimPopObj(synthP, var = "netIncome")
pIncome <- getBreaks(x = sIncome, w = sWeight, upper = Inf,equidist = FALSE)
breaks <- manageSimPopObj(synthP, var = "netIncomeCat",sample = TRUE, set = TRUE,
synthP values = getCat(x = sIncome, breaks))
<- manageSimPopObj(synthP, var = "netIncomeCat",sample = FALSE, set = TRUE,
synthP values = getCat(x = pIncome, breaks))
# simulate net income components
<- simComponents(simPopObj = synthP, total = "netIncome",
synthP components = c("py010n", "py050n", "py090n","py100n","py110n",
"py120n", "py130n", "py140n"),
conditional =c("netIncomeCat", "pl030"),
replaceEmpty = "sequential", seed = 1)
synthP
##
## --------------
## synthetic population of size
## 85057 x 19
##
## build from a sample of size
## 11725 x 20
## --------------
##
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight,pl030,pb220a,netIncomeCat,netIncome,py010n,py050n,py090n,py100n,py110n,py120n,py130n,py140n
#使用函數 simInitSpatial() 模擬較小的區域
<- function(inp) {
simulate_districts <- "db030"
hhid <- "db040"
region <- inp[!duplicated(inp[, hhid]), c(hhid, region)]
a <- split(a, a[, region])
spl <- unique(inp[, region])
regions <- lapply(1:length(spl), function(x) {
tmpres <- paste(x, 1:sample(10:90, 1), sep = "")
codes $district <- sample(codes, nrow(spl[[x]]), replace = TRUE)
spl[[x]]
spl[[x]]})<- do.call("rbind", tmpres)
tmpres <- tmpres[, -2]
tmpres <- merge(inp, tmpres, by.x = hhid, by.y = hhid, all.x = TRUE)
out invisible(out)
}
data("eusilcS", package = "simPop")
<- simulate_districts(eusilcS)
census head(table(census$district))
##
## 11 110 111 112 113 114
## 11 1 5 1 13 2
#生成了兩個表,一個包含按地區劃分的人數和其他與按地區分的戶籍人口
<- as.data.frame(xtabs(rb050 ~ db040 + district,
tabHH data = census[!duplicated(census$db030), ]))
<- as.data.frame(xtabs(rb050 ~ db040 + district, data = census))
tabP colnames(tabP) <- colnames(tabHH) <- c("db040", "district", "Freq")
函數 simInitSpatial() 需要一個“simPopObj”類的對像作為輸入。
<- simInitSpatial(synthP, additional = "district",
synthP region = "db040",
tspatialHH = tabHH, tspatialP = tabP)
head(popData(synthP), 2)
## db030 hsize age rb090 db040 pid weight pl030 pb220a
## 1: 1 1 46 male Burgenland 1.1 1 3 AT
## 2: 2 1 78 female Burgenland 2.1 1 5 AT
## netIncomeCat netIncome py010n py050n py090n py100n py110n py120n
## 1: (2.89e+04,3.52e+04] 33579.265 33579.27 0 0 0.000 0 0
## 2: (5.05e+03,8.4e+03] 7692.078 0.00 0 0 7692.078 0 0
## py130n py140n district
## 1: 0 0 134
## 2: 0 0 150
<- simStructure(data = inp, method = "direct",
census basicHHvars = c("age", "rb090", "db040"))
<- simCategorical(census, additional = c("pl030", "pb220a"),
census method = "multinom", nr_cpus = 1)
#these margins to our object synthP
<- data.frame(popData(census))
census <- as.data.frame(xtabs(~ db040 + rb090 + pl030, data = census))
margins $Freq <- as.numeric(margins$Freq)
margins<- addKnownMargins(synthP, margins) synthP
使用 SA 和 calibPop() 校準到這些邊距 calibPop()對(接近)最優的跌代搜索住戶的組合以填充地理區域
<- calibPop(synthP, split = "db040", temp = 1,
synthPadj eps.factor = 0.00005, maxiter = 200, temp.cooldown = 0.975,
factor.cooldown = 0.85, min.temp = 0.001, verbose = TRUE, nr_cpus = 1)
## Starting simulated Annealing for Burgenland
## Cooldown number 10
## Cooldown number 20
## Convergence NOT successfull for Burgenland
## Starting simulated Annealing for Carinthia
## Cooldown number 10
## Convergence NOT successfull for Carinthia
## Starting simulated Annealing for Lower Austria
## Cooldown number 10
## Cooldown number 20
## Convergence NOT successfull for Lower Austria
## Starting simulated Annealing for Salzburg
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Cooldown number 40
## Convergence NOT successfull for Salzburg
## Starting simulated Annealing for Styria
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Styria
## Starting simulated Annealing for Tyrol
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Tyrol
## Starting simulated Annealing for Upper Austria
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Upper Austria
## Starting simulated Annealing for Vienna
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Vienna
## Starting simulated Annealing for Vorarlberg
## Cooldown number 10
## Convergence NOT successfull for Vorarlberg
<- data.frame(popData(synthP))
pop <- data.frame(popData(synthPadj)) popadj
<- ftable(census[, c("rb090", "db040", "pl030")])
tab.census <- ftable(popadj[, c("rb090", "db040", "pl030")])
tab_afterSA - tab_afterSA tab.census
## pl030 1 2 3 4 5 6 7
## rb090 db040
## male Burgenland 0 0 0 0 -2 0 0
## Carinthia -1 0 0 0 0 0 0
## Lower Austria 0 0 0 1 0 0 1
## Salzburg 0 0 1 0 0 0 0
## Styria 0 0 0 0 2 0 0
## Tyrol 0 -1 0 0 0 0 0
## Upper Austria 0 0 0 -1 1 0 0
## Vienna 0 0 0 0 0 0 0
## Vorarlberg 0 0 0 0 1 0 0
## female Burgenland 2 0 1 0 -1 0 0
## Carinthia 0 -1 0 0 0 1 0
## Lower Austria 0 2 0 0 0 0 0
## Salzburg 1 -1 0 0 0 0 0
## Styria 0 0 0 0 0 0 0
## Tyrol 0 0 0 0 -1 1 0
## Upper Austria 0 0 0 0 0 0 0
## Vienna -1 0 0 0 -1 0 0
## Vorarlberg 0 0 1 0 0 0 -1
模擬分類變量的多元結構是通過以下方式評估的圖形比較
<- spTable(synthP, select = c("rb090", "db040", "hsize"))
tab spMosaic(tab, labeling = labeling_border(abbreviate = c(db040 = TRUE)))
<- spTable(synthP, select = c("rb090", "pl030"))
tab spMosaic(tab, method = "color")
spCdfplot(synthP, "netIncome", cond = "rb090", layout = c(1, 2))
spBwplot(synthP, x = "netIncome", cond = "rb090", layout = c(1, 2))
區域的樣本和總體淨收入的 CDF (db040)。幾乎完美的配合。
spCdfplot(synthP, "netIncome", cond = "db040", layout = c(3, 3))
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation