R Markdown

 library("simPop")
## 載入需要的套件:lattice
## 載入需要的套件:vcd
## 載入需要的套件:grid
## Package simPop 1.2.1 has been loaded!
## Since simPop does explicit parallelization,
##  the number of data.table threads is set to 1.

1-1 Input data

 set.seed(1234)
data("eusilcS", package = "simPop")
 origData <- eusilcS
 dim(origData)
## [1] 11725    18
head(origData)
##      db030 hsize         db040 age  rb090 pl030 pb220a netIncome   py010n
## 9292     1     2      Salzburg  72   male     5     AT  22675.48     0.00
## 9293     1     2      Salzburg  66 female     5     AT  16999.29     0.00
## 7227     2     1 Upper Austria  56 female     2     AT  19274.21 19274.21
## 5275     3     1        Styria  67 female     5     AT  13319.13     0.00
## 7866     4     3 Upper Austria  70 female     5     AT  14365.57     0.00
## 7867     4     3 Upper Austria  46   male     3     AT      0.00     0.00
##      py050n py090n   py100n py110n py120n   py130n py140n    db090    rb050
## 9292      0      0 22675.48      0      0     0.00      0 7.822929 7.822929
## 9293      0      0     0.00      0      0 16999.29      0 7.822929 7.822929
## 7227      0      0     0.00      0      0     0.00      0 8.788089 8.788089
## 5275      0      0 13319.13      0      0     0.00      0 8.108452 8.108452
## 7866      0      0 14365.57      0      0     0.00      0 7.509383 7.509383
## 7867      0      0     0.00      0      0     0.00      0 7.509383 7.509383
View(origData)
str(origData)
## 'data.frame':    11725 obs. of  18 variables:
##  $ db030    : int  1 1 2 3 4 4 4 5 5 5 ...
##  $ hsize    : int  2 2 1 1 3 3 3 5 5 5 ...
##  $ db040    : Factor w/ 9 levels "Burgenland","Carinthia",..: 4 4 7 5 7 7 7 4 4 4 ...
##  $ age      : int  72 66 56 67 70 46 37 41 35 9 ...
##  $ rb090    : Factor w/ 2 levels "male","female": 1 2 2 2 2 1 1 1 2 2 ...
##  $ pl030    : Factor w/ 7 levels "1","2","3","4",..: 5 5 2 5 5 3 1 1 3 NA ...
##  $ pb220a   : Factor w/ 3 levels "AT","EU","Other": 1 1 1 1 1 1 3 1 1 NA ...
##  $ netIncome: num  22675 16999 19274 13319 14366 ...
##  $ py010n   : num  0 0 19274 0 0 ...
##  $ py050n   : num  0 0 0 0 0 ...
##  $ py090n   : num  0 0 0 0 0 ...
##  $ py100n   : num  22675 0 0 13319 14366 ...
##  $ py110n   : num  0 0 0 0 0 0 0 0 0 NA ...
##  $ py120n   : num  0 0 0 0 0 0 0 0 0 NA ...
##  $ py130n   : num  0 16999 0 0 0 ...
##  $ py140n   : num  0 0 0 0 0 0 0 0 0 NA ...
##  $ db090    : num  7.82 7.82 8.79 8.11 7.51 ...
##  $ rb050    : num  7.82 7.82 8.79 8.11 7.51 ...
length(unique(origData$db030))
## [1] 4641
View(origData)

1-2 Austrian synthetic data

#創建一個類“dataObj”的對象,使用specifyInput()存入
inp <- specifyInput(origData, hhid ="db030", hhsize = "hsize",strata="db040", 
                    weight = "rb050")
#顯示此“dataObj”對象的內容摘要
print(inp)
## 
##  -------------- 
## survey sample of size 11725 x 19 
## 
##  Selected important variables: 
## 
##  household ID: db030
##  personal ID: pid
##  variable household size: hsize
##  sampling weight: rb050
##  strata: db040
##  --------------

1-3 Calibration of the sample

#two-dimensional table
data("totalsRG", package = "simPop")
 print(totalsRG)
##     rb090         db040   Freq
## 1    male    Burgenland 140436
## 2  female    Burgenland 146980
## 3    male     Carinthia 270084
## 4  female     Carinthia 285797
## 5    male Lower Austria 797398
## 6  female Lower Austria 828087
## 7    male      Salzburg 702539
## 8  female      Salzburg 722883
## 9    male        Styria 259595
## 10 female        Styria 274675
## 11   male         Tyrol 595842
## 12 female         Tyrol 619404
## 13   male Upper Austria 353910
## 14 female Upper Austria 368128
## 15   male        Vienna 850596
## 16 female        Vienna 916150
## 17   male    Vorarlberg 184939
## 18 female    Vorarlberg 190343
#n-dimensional table 
data("totalsRGtab", package = "simPop")
print(totalsRGtab)
##         db040
## rb090    Burgenland Carinthia Lower Austria Salzburg Styria  Tyrol
##   female     146980    285797        828087   722883 274675 619404
##   male       140436    270084        797398   702539 259595 595842
##         db040
## rb090    Upper Austria Vienna Vorarlberg
##   female        368128 916150     190343
##   male          353910 850596     184939
totalsRG$Freq <- totalsRG$Freq / 100
totalsRGtab <- totalsRGtab / 100
weights.df <- calibSample(inp, totalsRG)
weights.tab <- calibSample(inp, totalsRGtab)
 identical(weights.df, weights.tab)
## [1] TRUE
#函數將生成的校準權重添加到“simPopObj”inp添加權重()使用calibSample()
addWeights(inp) <- calibSample(inp, totalsRGtab)

1-4 Creating the household structure

synthP <- simStructure(data = inp, method = "direct",basicHHvars = c("age", "rb090", "db040"))
synthP
## 
## -------------- 
## synthetic population  of size 
##  85057 x 7 
## 
## build from a sample of size 
## 11725 x 19
## -------------- 
## 
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight

1-5 Adding categorical variables

synthP <- simCategorical(synthP, additional = c("pl030", "pb220a"), method = "multinom", nr_cpus = 1)

print(synthP)
## 
## -------------- 
## synthetic population  of size 
##  85057 x 9 
## 
## build from a sample of size 
## 11725 x 19
## -------------- 
## 
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight,pl030,pb220a
#人口普查  
censusInfo <- eusilcS[, c("age", "rb090", "pl030")]
head(censusInfo)
##      age  rb090 pl030
## 9292  72   male     5
## 9293  66 female     5
## 7227  56 female     2
## 5275  67 female     5
## 7866  70 female     5
## 7867  46   male     3
#pl030為工作狀態,有7種總類
stat <- as.numeric(levels(censusInfo$pl030))
stat
## [1] 1 2 3 4 5 6 7
#年齡和性別為條件生成這些類別的條件概率
tab1 <- prop.table(table(censusInfo))
probs <- reshape2::dcast(as.data.frame(tab1),
                         formula = rb090 + age ~ pl030, value.var = "Freq")
pop <- subset(eusilcS, eusilcS$db030 == 1, select = c("age", "rb090"))
pop
##      age  rb090
## 9292  72   male
## 9293  66 female
pacman::p_load(data.table, magrittr)

#給定年齡和 rb090 以及已知概率的經濟狀況進行抽樣,合併變量 age 和 rb090 上的對象 pop 和 probs。

pop <- merge(pop, probs, all.x = TRUE)
pop$status <- sapply(1:nrow(pop), function(x) {
    pp <- pop[x, -c(1:2)]
    ifelse(all(pp == 0), NA, sample(stat, 1, prob = pp))
 })
pop <- pop[, c("age", "rb090", "status")]
pop
##   age  rb090 status
## 1  66 female      7
## 2  72   male      5
data("eusilcS", package = "simPop")
inp <- specifyInput(data = eusilcS, hhid = "db030", hhsize = "hsize",
 strata = "db040", weight = "db090")
synthRec <- simStructure(data = inp, method = "direct",
 basicHHvars = c("age", "rb090"))


synthRec <- simCategorical(synthRec, additional = "pl030",
 method = "distribution", nr_cpus = 1)
## [1] "age"   "rb090"
synthRec <- simStructure(data = inp, method = "direct",
 basicHHvars = c("age", "rb090"))
synthRec <- simCategorical(synthRec, additional = "pl030",
 method = "distribution", nr_cpus = 1)
## [1] "age"   "rb090"

1-6 Adding continuous variables

使用 simContinuous() 生成連續變量。在應用該功能之前重,基本的家庭結構和可能的其他分類預測因素一定是分別使用 simStructure() 和 simCategorical() 進行模擬。類的對象’simPopObj’ 作為輸入提供。

synthP <- simContinuous(synthP, additional = "netIncome", upper = 200000,
 equidist = FALSE, imputeMissings = FALSE, nr_cpus = 1)
ageinc <- pop(synthP, var = c("age", "netIncome"))
ageinc$age <- as.numeric(as.character(ageinc$age))
#在回歸模型估計中不考慮 16 歲以下的人口16歲及以上人口的淨收入
ageinc[age < 16, netIncome := NA]
pop(synthP, var = "netIncome") <- ageinc$netIncome

#也可以使用以下,使用殘差的隨機抽取 synthP <- simContinuous(synthP, additional = “netIncome”, method = “lm”, nr_cpus = 1)

1-7 Simulation of components

在住戶調查中,一般不收集個人淨收入的信息,而是從其他變量(例如,從按來源收集的收入信息)得出的信息。 合成生成的連續變量可以使用函數 simComponents() 按其組件分解。

sIncome <- manageSimPopObj(synthP, var = "netIncome", sample = TRUE)
sWeight <- manageSimPopObj(synthP, var = "rb050", sample = TRUE)
pIncome <- manageSimPopObj(synthP, var = "netIncome")
breaks <- getBreaks(x = sIncome, w = sWeight, upper = Inf,equidist = FALSE)
synthP <- manageSimPopObj(synthP, var = "netIncomeCat",sample = TRUE, set = TRUE, 
                          values = getCat(x = sIncome, breaks))
synthP <- manageSimPopObj(synthP, var = "netIncomeCat",sample = FALSE, set = TRUE, 
                          values = getCat(x = pIncome, breaks))
# simulate net income components
synthP <- simComponents(simPopObj = synthP, total = "netIncome", 
                        components = c("py010n", "py050n", "py090n","py100n","py110n",
                                       "py120n", "py130n", "py140n"), 
                        conditional =c("netIncomeCat", "pl030"), 
                        replaceEmpty = "sequential", seed = 1)
synthP 
## 
## -------------- 
## synthetic population  of size 
##  85057 x 19 
## 
## build from a sample of size 
## 11725 x 20
## -------------- 
## 
## variables in the population:
## db030,hsize,age,rb090,db040,pid,weight,pl030,pb220a,netIncomeCat,netIncome,py010n,py050n,py090n,py100n,py110n,py120n,py130n,py140n

1-8 Allocating the population in small areas

#使用函數 simInitSpatial() 模擬較小的區域
simulate_districts <- function(inp) {
    hhid <- "db030"
    region <- "db040"
    a <- inp[!duplicated(inp[, hhid]), c(hhid, region)]
    spl <- split(a, a[, region])
    regions <- unique(inp[, region])
    tmpres <- lapply(1:length(spl), function(x) {
        codes <- paste(x, 1:sample(10:90, 1), sep = "")
        spl[[x]]$district <- sample(codes, nrow(spl[[x]]), replace = TRUE)
        spl[[x]]})
    tmpres <- do.call("rbind", tmpres)
    tmpres <- tmpres[, -2]
    out <- merge(inp, tmpres, by.x = hhid, by.y = hhid, all.x = TRUE)
    invisible(out)
    }
data("eusilcS", package = "simPop")
census <- simulate_districts(eusilcS)
head(table(census$district))
## 
##  11 110 111 112 113 114 
##  11   1   5   1  13   2
#生成了兩個表,一個包含按地區劃分的人數和其他與按地區分的戶籍人口
tabHH <- as.data.frame(xtabs(rb050 ~ db040 + district,
                             data = census[!duplicated(census$db030), ]))
tabP <- as.data.frame(xtabs(rb050 ~ db040 + district, data = census))
colnames(tabP) <- colnames(tabHH) <- c("db040", "district", "Freq")

函數 simInitSpatial() 需要一個“simPopObj”類的對像作為輸入。

synthP <- simInitSpatial(synthP, additional = "district",
                         region = "db040", 
                         tspatialHH = tabHH, tspatialP = tabP)
head(popData(synthP), 2)
##    db030 hsize age  rb090      db040 pid weight pl030 pb220a
## 1:     1     1  46   male Burgenland 1.1      1     3     AT
## 2:     2     1  78 female Burgenland 2.1      1     5     AT
##           netIncomeCat netIncome   py010n py050n py090n   py100n py110n py120n
## 1: (2.89e+04,3.52e+04] 33579.265 33579.27      0      0    0.000      0      0
## 2:  (5.05e+03,8.4e+03]  7692.078     0.00      0      0 7692.078      0      0
##    py130n py140n district
## 1:      0      0      134
## 2:      0      0      150

1-9 Post-calibration of the synthetic population

census <- simStructure(data = inp, method = "direct",
                       basicHHvars = c("age", "rb090", "db040"))

census <- simCategorical(census, additional = c("pl030", "pb220a"),
                         method = "multinom", nr_cpus = 1)
#these margins to our object synthP
census <- data.frame(popData(census))
margins <- as.data.frame(xtabs(~ db040 + rb090 + pl030, data = census))
margins$Freq <- as.numeric(margins$Freq)
synthP <- addKnownMargins(synthP, margins)

使用 SA 和 calibPop() 校準到這些邊距 calibPop()對(接近)最優的跌代搜索住戶的組合以填充地理區域

synthPadj <- calibPop(synthP, split = "db040", temp = 1,
                    eps.factor = 0.00005, maxiter = 200, temp.cooldown = 0.975,
                    factor.cooldown = 0.85, min.temp = 0.001, verbose = TRUE, nr_cpus = 1)
## Starting simulated Annealing for Burgenland
## Cooldown number 10
## Cooldown number 20
## Convergence NOT successfull for Burgenland 
## Starting simulated Annealing for Carinthia
## Cooldown number 10
## Convergence NOT successfull for Carinthia 
## Starting simulated Annealing for Lower Austria
## Cooldown number 10
## Cooldown number 20
## Convergence NOT successfull for Lower Austria 
## Starting simulated Annealing for Salzburg
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Cooldown number 40
## Convergence NOT successfull for Salzburg 
## Starting simulated Annealing for Styria
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Styria 
## Starting simulated Annealing for Tyrol
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Tyrol 
## Starting simulated Annealing for Upper Austria
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Upper Austria 
## Starting simulated Annealing for Vienna
## Cooldown number 10
## Cooldown number 20
## Cooldown number 30
## Convergence NOT successfull for Vienna 
## Starting simulated Annealing for Vorarlberg
## Cooldown number 10
## Convergence NOT successfull for Vorarlberg
pop <- data.frame(popData(synthP))
popadj <- data.frame(popData(synthPadj))
tab.census <- ftable(census[, c("rb090", "db040", "pl030")])
tab_afterSA <- ftable(popadj[, c("rb090", "db040", "pl030")])
tab.census - tab_afterSA
##                      pl030  1  2  3  4  5  6  7
## rb090  db040                                   
## male   Burgenland           0  0  0  0 -2  0  0
##        Carinthia           -1  0  0  0  0  0  0
##        Lower Austria        0  0  0  1  0  0  1
##        Salzburg             0  0  1  0  0  0  0
##        Styria               0  0  0  0  2  0  0
##        Tyrol                0 -1  0  0  0  0  0
##        Upper Austria        0  0  0 -1  1  0  0
##        Vienna               0  0  0  0  0  0  0
##        Vorarlberg           0  0  0  0  1  0  0
## female Burgenland           2  0  1  0 -1  0  0
##        Carinthia            0 -1  0  0  0  1  0
##        Lower Austria        0  2  0  0  0  0  0
##        Salzburg             1 -1  0  0  0  0  0
##        Styria               0  0  0  0  0  0  0
##        Tyrol                0  0  0  0 -1  1  0
##        Upper Austria        0  0  0  0  0  0  0
##        Vienna              -1  0  0  0 -1  0  0
##        Vorarlberg           0  0  1  0  0  0 -1

2 Data utility of the simulated population

模擬分類變量的多元結構是通過以下方式評估的圖形比較

tab <- spTable(synthP, select = c("rb090", "db040", "hsize"))
spMosaic(tab, labeling = labeling_border(abbreviate = c(db040 = TRUE)))

tab <- spTable(synthP, select = c("rb090", "pl030"))
spMosaic(tab, method = "color")

spCdfplot(synthP, "netIncome", cond = "rb090", layout = c(1, 2))

 spBwplot(synthP, x = "netIncome", cond = "rb090", layout = c(1, 2))

區域的樣本和總體淨收入的 CDF (db040)。幾乎完美的配合。

spCdfplot(synthP, "netIncome", cond = "db040", layout = c(3, 3))
## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation

## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation

## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation

## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation

## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation

## Warning in spCdf(x, w, ...): number of finite values in 'x' is smaller than 'n':
## no approximation