Init

# Print numbers with 3 significant digits.
options(digits = 3)
# Attach the packages used by this analysis via pacman's p_load().
# NOTE(review): the pipes, dplyr/tidyr/stringr verbs and ggplot2 functions used
# further down are not loaded explicitly here - presumably they are attached
# through kirkegaard; confirm.
library(pacman)
p_load(kirkegaard, readxl, lubridate, zoo, imputeTS)

Data

Download the data for all years from the Eurostat table at http://appsso.eurostat.ec.europa.eu/nui/show.do?query=BOOKMARK_DS-055296_QID_3D9C156D_UID_-3F171EB0&layout=TIME,C,X,0;GEO,L,Y,0;CITIZEN,L,Z,0;SEX,L,Z,1;AGE,L,Z,2;ASYL_APP,L,Z,3;UNIT,L,Z,4;INDICATORS,C,Z,5;&zSelection=DS-055296UNIT,PER;DS-055296SEX,T;DS-055296ASYL_APP,NASY_APP;DS-055296CITIZEN,EXT_EU28;DS-055296INDICATORS,OBS_FLAG;DS-055296AGE,TOTAL;&rankName1=UNIT_1_2_-1_2&rankName2=AGE_1_2_-1_2&rankName3=CITIZEN_1_2_-1_2&rankName4=INDICATORS_1_2_-1_2&rankName5=ASYL-APP_1_2_-1_2&rankName6=SEX_1_2_-1_2&rankName7=TIME_1_0_0_0&rankName8=GEO_1_2_0_1&sortC=ASC_-1_FIRST&rStp=&cStp=&rDCh=&cDCh=&rDM=true&cDM=true&footnes=false&empty=false&wai=false&time_mode=ROLLING&time_most_recent=false&lang=EN&cfo=%23%23%23%2C%23%23%23.%23%23%23

# Read the Eurostat export. The first 11 rows of the sheet are skipped because
# they contain metadata rather than data.
orig <- read_xls("data/migr_asyappctzm.xls", skip = 11)

# Keep the untouched import in `orig` and work on rows 46-78 only, the part of
# the sheet that holds the first time applicants.
d <- orig[46:78, ]

# Give the first column a usable name.
names(d)[1] <- "country"

# Replace the country labels in rows 1 and 6 of the subset.
d$country[c(1, 6)] <- c("EU28", "Germany")

# Reshape to long form (one row per country and time column), then drop the
# EU28 row so that only individual countries remain.
d <- gather(d, key = time, value = number, -country)
d <- filter(d, country != "EU28")

# Convert the counts to numeric. Cells that do not hold a number become NA, so
# an "NAs introduced by coercion" warning is expected here.
d$number <- as.numeric(d$number)

# Turn the time labels into date-times: remove the "M" and parse the rest as
# year followed by month.
d$time <- parse_date_time(str_replace(d$time, "M", ""), orders = "%Y%m")

# Calendar parts of each time stamp.
d$year <- year(d$time)
d$month <- month(d$time)

Analysis

# Impute missing monthly counts separately for each country, with three
# methods so their effect on the totals can be compared:
#   number_LOCF   - last observation carried forward, followed by next
#                   observation carried backward to fill any leading gap
#   number_spline - spline interpolation (imputeTS)
#   number_Kalman - Kalman-based imputation (imputeTS)
d <- plyr::ddply(d, "country", .fun = function(block) {
  # Every method below depends on row order, so make the chronological order
  # explicit instead of relying on the column order of the spreadsheet.
  block <- block[order(block$time), ]

  # imputeTS 3.0 renamed na.interpolation()/na.kalman() to
  # na_interpolation()/na_kalman(); use whichever name the installed version
  # provides so the script runs with both old and new releases.
  impute_spline <- if (exists("na_interpolation", mode = "function")) {
    na_interpolation
  } else {
    na.interpolation
  }
  impute_kalman <- if (exists("na_kalman", mode = "function")) {
    na_kalman
  } else {
    na.kalman
  }

  # na.rm = FALSE keeps the series length unchanged; the second pass
  # (fromLast = TRUE) fills values the forward pass could not reach.
  filled_forward <- zoo::na.locf(block$number, na.rm = FALSE)
  block$number_LOCF <- zoo::na.locf(
    filled_forward,
    na.rm = FALSE,
    fromLast = TRUE
  )
  block$number_spline <- impute_spline(block$number, option = "spline")
  block$number_Kalman <- impute_kalman(block$number)
  block
})

# Sum across countries for every month: one row per `time` with the total of
# the raw counts (missing values ignored), the total under each imputation
# method, and the number of countries whose raw value is missing that month.
# A base data.frame is built per month; the deprecated data_frame() is avoided.
monthly_totals <- plyr::ddply(d, "time", .fun = function(block) {
  data.frame(
    raw = sum(block$number, na.rm = TRUE),
    LOCF = sum(block$number_LOCF, na.rm = TRUE),
    spline = sum(block$number_spline, na.rm = TRUE),
    Kalman = sum(block$number_Kalman, na.rm = TRUE),
    n_missing = sum(is.na(block$number))
  )
})

# Plot, for every month, how many countries have no reported value.
ggplot(
  select(monthly_totals, time, n_missing),
  aes(x = time, y = n_missing)
) +
  geom_path() +
  theme_bw() +
  scale_y_continuous(
    name = "n countries with no data",
    # plain (non-scientific) number formatting on the axis
    labels = function(value) format(value, scientific = FALSE)
  ) +
  labs(title = "Missing data by month")

# Save via kirkegaard's GG_save() (presumably writes the most recent plot).
GG_save("missing_data.png")

# Plot the monthly totals, one line per series (raw and each imputation
# method). The missing-count column is dropped and the remaining columns are
# stacked into method/number pairs so that method can drive the line color.
ggplot(
  gather(
    select(monthly_totals, -n_missing),
    key = method,
    value = number,
    -time
  ),
  aes(x = time, y = number, color = method)
) +
  geom_path() +
  theme_bw() +
  scale_y_continuous(
    name = "First time applicants",
    # plain (non-scientific) number formatting on the axis
    labels = function(value) format(value, scientific = FALSE)
  ) +
  scale_color_discrete("Imputation method") +
  labs(
    title = "First time applicants to the EU-ish countries",
    subtitle = "EU28 + Iceland, Norway, Liechtenstein, Switzerland",
    caption = "Data source: Eurostat migr_asyappctzm table"
  )

# Save via kirkegaard's GG_save() (presumably writes the most recent plot).
GG_save("applicants.png")

# Show the long-form data and the monthly totals as DT tables for inspection.
DT::datatable(d)
DT::datatable(monthly_totals)