Init
# Print numbers with 3 significant digits.
options(digits = 3)
# pacman supplies p_load(), which attaches the packages listed below.
# NOTE(review): p_load() also tries to install anything that is missing -
# confirm that is wanted. kirkegaard presumably brings in the pipes and
# tidyverse verbs used further down; verify.
library(pacman)
p_load(kirkegaard, readxl, lubridate, zoo, imputeTS)
Data
Download data for all years from Eurostat’s table at http://appsso.eurostat.ec.europa.eu/nui/show.do?query=BOOKMARK_DS-055296_QID_3D9C156D_UID_-3F171EB0&layout=TIME,C,X,0;GEO,L,Y,0;CITIZEN,L,Z,0;SEX,L,Z,1;AGE,L,Z,2;ASYL_APP,L,Z,3;UNIT,L,Z,4;INDICATORS,C,Z,5;&zSelection=DS-055296UNIT,PER;DS-055296SEX,T;DS-055296ASYL_APP,NASY_APP;DS-055296CITIZEN,EXT_EU28;DS-055296INDICATORS,OBS_FLAG;DS-055296AGE,TOTAL;&rankName1=UNIT_1_2_-1_2&rankName2=AGE_1_2_-1_2&rankName3=CITIZEN_1_2_-1_2&rankName4=INDICATORS_1_2_-1_2&rankName5=ASYL-APP_1_2_-1_2&rankName6=SEX_1_2_-1_2&rankName7=TIME_1_0_0_0&rankName8=GEO_1_2_0_1&sortC=ASC_-1_FIRST&rStp=&cStp=&rDCh=&cDCh=&rDM=true&cDM=true&footnes=false&empty=false&wai=false&time_mode=ROLLING&time_most_recent=false&lang=EN&cfo=%23%23%23%2C%23%23%23.%23%23%23
# Load the Eurostat export. The first 11 rows of the sheet are metadata, so
# they are skipped.
orig <- read_xls("data/migr_asyappctzm.xls", skip = 11)

# Work on a trimmed copy and leave `orig` untouched. Rows 46-78 are the block
# the original author identified as first-time applicants.
# NOTE(review): these row numbers are tied to the layout of this particular
# export - re-check them whenever the file is downloaded again.
d <- orig[46:78, ]

# The first column is renamed to "country", and the labels in rows 1 and 6
# are overwritten with short names.
names(d)[1] <- "country"
d$country[c(1, 6)] <- c("EU28", "Germany")
# Reshape to long form: one row per country and time column, with the EU28
# aggregate row removed so only the individual countries remain.
d <- d %>%
  gather(key = time, value = number, -country) %>%
  filter(country != "EU28")

# Convert the counts to numeric. Cells that do not hold a number become NA,
# which is why R reports "NAs introduced by coercion" at this step.
d$number <- as.numeric(d$number)

# The time labels are parsed as year + month once the letter "M" is removed
# (presumably they look like "<year>M<month>" - verify against the sheet).
d$time <- parse_date_time(str_replace(d$time, "M", ""), orders = "%Y%m")

# Separate year and month columns derived from the parsed time.
d$year <- year(d$time)
d$month <- month(d$time)
Analysis
# Impute missing monthly counts separately for each country, using three
# methods so their effect on the totals can be compared:
#   number_LOCF   - last observation carried forward, followed by a backward
#                   pass that fills any gap left at the start of the series
#   number_spline - spline interpolation (imputeTS)
#   number_Kalman - Kalman-based imputation (imputeTS)
# NOTE(review): na.interpolation()/na.kalman() are the older imputeTS names;
# newer releases spell them na_interpolation()/na_kalman() - confirm against
# the installed version before renaming.
d %<>% plyr::ddply("country", .fun = function(block) {
  # All three methods treat the rows as a time series, so make the
  # chronological order explicit instead of relying on the sheet's layout.
  block <- block[order(block$time), ]

  block$number_LOCF <- block$number %>%
    zoo::na.locf(na.rm = FALSE) %>%
    zoo::na.locf(na.rm = FALSE, fromLast = TRUE)
  block$number_spline <- block$number %>% na.interpolation(option = "spline")
  block$number_Kalman <- block$number %>% na.kalman()
  block
})
# Aggregate across countries for every month: the sum of the raw counts
# (missing values skipped), the sum under each imputation method, and the
# number of countries with no raw figure that month.
monthly_totals <- d %>% plyr::ddply("time", .fun = function(block) {
  # A plain data.frame is enough here: ddply() returns a data frame whatever
  # the pieces are, and this avoids the deprecated data_frame() constructor.
  data.frame(
    raw = sum(block$number, na.rm = TRUE),
    LOCF = sum(block$number_LOCF, na.rm = TRUE),
    spline = sum(block$number_spline, na.rm = TRUE),
    Kalman = sum(block$number_Kalman, na.rm = TRUE),
    n_missing = sum(is.na(block$number))
  )
})
# Plot, for each month, how many countries have no raw figure. Only `time`
# and `n_missing` are mapped, so the table is passed to ggplot() as is.
ggplot(monthly_totals, aes(time, n_missing)) +
  geom_path() +
  theme_bw() +
  scale_y_continuous(
    "n countries with no data",
    # Plain (non-scientific) axis labels.
    labels = function(x) format(x, scientific = FALSE)
  ) +
  labs(title = "Missing data by month")

# GG_save() comes from kirkegaard; presumably it writes the plot just drawn
# to this file - confirm.
GG_save("missing_data.png")
# Plot the monthly totals, one line per series (raw sum and each imputation
# method). The table is reshaped to long form so that `method` can drive the
# colour; n_missing is dropped first because it is not a total.
monthly_totals %>%
  select(-n_missing) %>%
  gather(key = method, value = number, -time) %>%
  ggplot(aes(time, number, color = method)) +
  geom_path() +
  theme_bw() +
  scale_y_continuous(
    "First time applicants",
    # Plain (non-scientific) axis labels.
    labels = function(x) format(x, scientific = FALSE)
  ) +
  scale_color_discrete("Imputation method") +
  labs(
    title = "First time applicants to the EU-ish countries",
    subtitle = "EU28 + Iceland, Norway, Liechtenstein, Switzerland",
    caption = "Data source: Eurostat migr_asyappctzm table"
  )

# GG_save() comes from kirkegaard; presumably it writes the plot just drawn
# to this file - confirm.
GG_save("applicants.png")
# Interactive tables for browsing the long-form data and the monthly totals.
DT::datatable(d)
DT::datatable(monthly_totals)