Data and methods
- Target area and regional unit: 47 Prefectures in Japan
- Target periods: from week 6 to week 12 in 2020
- Week 06: 2.3 - 2.9
- Week 07: 2.10 - 2.16
- Week 08: 2.17 - 2.23
- Week 09: 2.24 - 3.1
- Week 10: 3.2 - 3.8
- Week 11: 3.9 - 3.15
- Week 12: 3.16 - 3.22
- Baselines: week 6 to week 8
- Calculation: week 9 to week 12
- Centre of Area: Coordinates (lon, lat) of seat
- Calculate distances between areas (prefectures): Euclidean distance
- k (of knn): 10
- Expectation-based Poisson scan statistic
- Population-based Poisson scan statistic
- Expectation-based negative binomial scan statistic: thetas = 1: This value may need to be changed
- Expectation-based zero-inflated Poisson scan statistic: probs are constant values in all prefectures that are calculated from EM alogism
- To estimate omega values for individual prefectures, the number of data (period length) is too small
- May use zeroinfl to calculate baseline instead of glm
Libraries
> library(readr)
> library(dplyr)
> library(tidyr)
> library(sp)
> library(magrittr)
> library(scanstatistics)
Counts, population and probs
> JPN_popcov <- read_csv("JPN_popcov6to12.csv", locale = locale(encoding = "SHIFT-JIS"))
> tail(JPN_popcov, 5)
# A tibble: 5 x 4
week pref pop count
<dbl> <chr> <dbl> <dbl>
1 12 43.Kumamoto-ken 17.6 1
2 12 44.Oita-ken 11.4 20
3 12 45.Miyazaki-ken 10.8 2
4 12 46.Kagoshima-ken 16.1 4
5 12 47.Okinawa-ken 14.5 0
> counts <- JPN_popcov %>%
+ filter(week >= 9 & week <= 12) %>%
+ df_to_matrix(time_col = "week", location_col = "pref", value_col = "count")
>
> counts[,1:5]
01.Hokkaido 02.Aomori-ken 03.Iwate-ken 04.Miyagi-ken 05.Akita-ken
9 24 0 0 1 0
10 7 0 0 0 1
11 24 0 0 0 0
12 10 0 0 0 0
> population <- JPN_popcov %>%
+ filter(week >= 9 & week <= 12) %>%
+ df_to_matrix(time_col = "week", location_col = "pref", value_col = "pop")
>
> population[,1:5]
01.Hokkaido 02.Aomori-ken 03.Iwate-ken 04.Miyagi-ken 05.Akita-ken
9 52.86 12.63 12.41 23.16 9.81
10 52.86 12.63 12.41 23.16 9.81
11 52.86 12.63 12.41 23.16 9.81
12 52.86 12.63 12.41 23.16 9.81
> ## EM algorithm for zero inflated poisson
> emdata <- JPN_popcov %>% filter(week >= 9 & week <= 12) %>% select(count)
> init_lam <- as.numeric(emdata %>% summarize(mean=mean(count)))
> est_lam <- init_lam
> alt_lam <- 100
> init_M <- 0
> est_M <- init_M
> omega <- 0
> m <- sum(emdata > 0)
> n <- nrow(emdata)
> S <- sum(emdata)
> while(abs(est_lam-alt_lam)>=0.001){
+ alt_lam <- est_lam
+ est_M <- m/(1-exp(-est_lam))
+ est_lam <- S/est_M
+ if(abs(est_lam-alt_lam)<0.001){
+ omega <- 1-est_M/n
+ }
+ }
> omega
[1] 0.499914
> est_lam
[1] 8.668722
> probs <- mutate(JPN_popcov, prob = omega) %>%
+ filter(week >= 9 & week <= 12) %>%
+ df_to_matrix(time_col = "week", location_col = "pref", value_col = "prob")
>
> probs[,1:5]
01.Hokkaido 02.Aomori-ken 03.Iwate-ken 04.Miyagi-ken 05.Akita-ken
9 0.499914 0.499914 0.499914 0.499914 0.499914
10 0.499914 0.499914 0.499914 0.499914 0.499914
11 0.499914 0.499914 0.499914 0.499914 0.499914
12 0.499914 0.499914 0.499914 0.499914 0.499914
Spatial zones
> JPN_geo <- read_csv("JPN_geo.csv", locale = locale(encoding = "SHIFT-JIS"))
> head(JPN_geo, 5)
# A tibble: 5 x 3
pref lat lon
<chr> <dbl> <dbl>
1 01.Hokkaido 43.1 141.
2 02.Aomori-ken 40.8 141.
3 03.Iwate-ken 39.7 141.
4 04.Miyagi-ken 38.3 141.
5 05.Akita-ken 39.7 140.
> zones <- JPN_geo %>%
+ select(lon, lat) %>%
+ as.matrix %>%
+ spDists(x = ., y = ., longlat = TRUE) %>%
+ dist_to_knn(k = 10) %>%
+ knn_zones
> zones[[1]]
[1] 1
> zones[[2]]
[1] 1 2
> zones[[350]]
[1] 41 42 43 45 46
Baselines
> mod <- glm(count ~ offset(log(pop)) + 1 + I(week - 8),
+ family = poisson(link = "log"),
+ data = JPN_popcov %>% filter(week < 9))
> summary(mod)
Call:
glm(formula = count ~ offset(log(pop)) + 1 + I(week - 8), family = poisson(link = "log"),
data = JPN_popcov %>% filter(week < 9))
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4720 -1.2999 -0.8976 -0.5937 6.0792
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.6825 0.1036 -25.897 < 2e-16 ***
I(week - 8) 0.7453 0.1151 6.472 9.64e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 393.77 on 140 degrees of freedom
Residual deviance: 346.00 on 139 degrees of freedom
AIC: 423.72
Number of Fisher Scoring iterations: 6
> ebp_baselines <- JPN_popcov %>%
+ filter(week >= 9 & week <= 12) %>%
+ mutate(mu = predict(mod, newdata = ., type = "response")) %>%
+ df_to_matrix(value_col = "mu")
> ebp_baselines[,1:5]
01.Hokkaido 02.Aomori-ken 03.Iwate-ken 04.Miyagi-ken 05.Akita-ken
9 7.617563 1.820088 1.788384 3.337548 1.413702
10 16.050575 3.835013 3.768211 7.032375 2.978739
11 33.819339 8.080557 7.939803 14.817554 6.276347
12 71.258984 17.026125 16.729550 31.221303 13.224567
Calculation
Expectation-based Poisson scan statistic
> set.seed(1)
> eb_poisson_result <- scan_eb_poisson(counts = counts,
+ zones = zones,
+ baselines = ebp_baselines,
+ n_mcsim = 999,
+ max_only = FALSE)
> print(eb_poisson_result)
Data distribution: Poisson
Type of scan statistic: expectation-based
Setting: univariate
Number of locations considered: 47
Maximum duration considered: 4
Number of spatial zones: 360
Number of Monte Carlo replicates: 999
Monte Carlo P-value: 1
Gumbel P-value: 1
Most likely event duration: 1
ID of locations in MLC: 44
> prefectures <- as.character(JPN_geo$pref)
> prefectures[c(44)]
[1] "44.Oita-ken"
Population-based Poisson scan statistic
> set.seed(1)
> pb_poisson_result <- scan_pb_poisson(counts = counts,
+ zones = zones,
+ population = population,
+ n_mcsim = 999,
+ max_only = FALSE)
> print(pb_poisson_result)
Data distribution: Poisson
Type of scan statistic: population-based
Setting: univariate
Number of locations considered: 47
Maximum duration considered: 4
Number of spatial zones: 360
Number of Monte Carlo replicates: 999
Monte Carlo P-value: 0.001
Gumbel P-value: 0
Most likely event duration: 3
ID of locations in MLC: 27, 28, 30
> #prefectures <- as.character(JPN_geo$pref)
> prefectures[c(27,28,30)]
[1] "27.Osaka-fu" "28.Hyogo-ken" "30.Wakayama-ken"
Expectation-based negative binomial scan statistic
> pb_negbin_result <- scan_eb_negbin(counts = counts,
+ zones = zones,
+ baselines = ebp_baselines,
+ thetas = 1,
+ type = "emerging",
+ n_mcsim = 999,
+ max_only = FALSE)
> print(pb_negbin_result)
Data distribution: negative binomial
Type of scan statistic: expectation-based
Setting: univariate
Number of locations considered: 47
Maximum duration considered: 4
Number of spatial zones: 360
Number of Monte Carlo replicates: 999
Monte Carlo P-value: 1
Gumbel P-value: 1
Most likely event duration: 1
ID of locations in MLC: 44
> #prefectures <- as.character(JPN_geo$pref)
> prefectures[c(44)]
[1] "44.Oita-ken"
Expectation-based zero-inflated Poisson scan statistic
> eb_zip_result <- scan_eb_zip(counts = counts,
+ zones = zones,
+ baselines = ebp_baselines,
+ probs = probs,
+ n_mcsim = 999,
+ max_only = FALSE,
+ rel_tol = 1e-3)
> print(eb_zip_result)
Data distribution: zero-inflated Poisson
Type of scan statistic: expectation-based
Setting: univariate
Number of locations considered: 47
Maximum duration considered: 4
Number of spatial zones: 360
Number of Monte Carlo replicates: 999
Monte Carlo P-value: 0.012
Gumbel P-value: 0.011
Most likely event duration: 3
ID of locations in MLC: 39
> #prefectures <- as.character(JPN_geo$pref)
> prefectures[c(39)]
[1] "39.Kochi-ken"