Here we look at the difference in the distributions of low- and high-scoring essays along two measures: mean and variance. We do two versions of this analysis: (A) pooling across all languages, and (B) by language. For each, we downsample so that there is the same number of essays in each language x prompt x score bin.

SUMMARY: Lower-scoring essays are closer to the mean (both pooling across languages and by language). Lower-scoring essays also have more variance.


Parameters

CUTOFF <- 4
N_PER_L_SCORE_PROMPT <- 10
MIN_SCORE <- 2

1 Data

Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../../../data/processed/models/all_model/doctag_indices_all_model.feather") 
docvecs <- read_feather("../../../data/processed/models/all_model/docvecs_all_model.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

metadata <- read_csv("../../../data/raw/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))
n_scores_not_na <- metadata_clean %>% 
  filter(!is.na(score)) %>% 
  nrow()

We have scores for 100% of essays. Note that, unlike Study 1, we only have whole-number scores (5 values).
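
The 100% figure comes from comparing n_scores_not_na with the total number of essays; a quick check (sketch):

round(100 * n_scores_not_na / nrow(metadata_clean), 1)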

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>% # pre-tidyr-1.0 nest(): keep the first 10 (metadata) columns, nest the docvec dims
  select(-doc_count, -offset, -test_center_country_code)

Get TSNE coordinates. These are the same for both the all-languages and by-language analyses.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../../../data/processed/tsne/tsne_dims_cached_all.csv")

2 Score distributions

tsne_dims %>%
  ggplot(aes(x = score)) +
  geom_histogram(binwidth = 1) +
  geom_vline(aes(xintercept = mean(tsne_dims$score)), color = "red") +
  ggtitle("overall score distribution") +
  theme_bw()

tsne_dims %>%
  ggplot(aes(x = score, fill = L1_code)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~L1_code) +
  #geom_density() +
  ggtitle("Score distribution by language") +
  theme_bw() +
  theme(legend.position = "none")

tsne_dims %>%
    count(score, L1_code, prompt_id) %>%
    ggplot(aes(x = n)) +
    geom_histogram(binwidth = 1) +
    ggtitle("Number of essays per language, score, prompt bin, including score = 1") +
    theme_bw() 

tsne_dims %>%
    filter(score >= MIN_SCORE) %>%
    count(score, L1_code, prompt_id) %>%
    ggplot(aes(x = n)) +
    geom_histogram(binwidth = 1) +
    ggtitle("Number of essays per language, score, prompt bin, excluding score = 1") +
    theme_bw() 

MEAN_N_PER_L_SCORE_PROMPT <- tsne_dims %>%
    filter(score >= MIN_SCORE) %>%
    count(score, L1_code, prompt_id) %>%
    summarize(n = mean(n))

3 Downsample

Scores are unevenly distributed across languages, so we need to downsample. Here we use 10 essays per language x score bin x prompt (mean bin size = 10.13).

Downsample essays.

tsne_dims_downsampled <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>% # nest the tSNE coordinates into a doc_vector list column
  filter(score >= MIN_SCORE) %>%
  group_by(L1_code, score, prompt_id) %>%
  sample_n(N_PER_L_SCORE_PROMPT, replace = T) %>% # sample with replacement, since some bins have fewer than N essays
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         score_prompt = paste(score_bin, prompt_id, sep = "_"),
         L1_score_prompt = paste(L1_code, score_bin, prompt_id, sep = "_"),
         L1_prompt = paste(L1_code, prompt_id, sep = "_")) %>%
  ungroup()

The bins:

tsne_dims_downsampled %>%
  group_by(score_bin) %>%
  distinct(score)
## # A tibble: 4 x 2
## # Groups:   score_bin [2]
##   score score_bin
##   <int>     <chr>
## 1     2       low
## 2     3       low
## 3     4      high
## 4     5      high

4 All Languages

4.1 Mean

Centroid = grand center. The grand centroid is computed hierarchically: language x score bin x prompt centroids are averaged up to language x prompt, then to prompt, then to a single grand center.

# Compute the centroid (column means of the nested doc vectors) for one group,
# returned as a one-row data frame labeled with that group's name.
get_group_centroid <- function(doc_vecs_df, group_name){
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans() 

  # the group label is constant within the group, so take it from the first row
  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>% # select_() is deprecated in current dplyr, but works here
    slice(1) %>%
    unlist()

  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(as.name(group_name))
   df
}

# L1_score_prompt centers
tsne_centroids_BL_score_prompt <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>%
  map_df(get_group_centroid, "L1_score_prompt") %>%
  separate(L1_score_prompt, c("L1_code", "score_bin", "prompt_id"), "_") %>%
  nest(4:5, .key = "doc_vector")
grand_centroids <- tsne_centroids_BL_score_prompt %>% # language x score bin x prompt centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # average over score bins: language x prompt
  separate(L1_prompt, c("L1_code", "prompt_id"), "_")  %>%
  group_by(prompt_id) %>%
  summarize(tsne_X = mean(tsne_X), # average over languages: prompt
            tsne_Y = mean(tsne_Y)) %>%
  mutate(group = "all") %>%
  group_by(group) %>%
  summarize(center_tsne_X = mean(tsne_X), # average over prompts: single grand center
            center_tsne_Y = mean(tsne_Y))

group_centroids <- tsne_centroids_BL_score_prompt %>% # language x score bin x prompt centroids
  mutate(prompt_score = paste0(prompt_id, "_" , score_bin)) %>%
  split(.$prompt_score) %>%
  map_df(get_group_centroid, "prompt_score")  %>% # average over languages: score bin x prompt
  mutate(group = "all") %>%
  separate(prompt_score, c("prompt_id", "score_bin"), "_") 
  
distance_measures <- group_centroids %>%
  left_join(grand_centroids) %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])
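
The rowwise() dist() call above is just the 2D Euclidean distance, sqrt((x1 - x2)^2 + (y1 - y2)^2). A vectorized sanity check (a sketch; distance_check and dist_vectorized are illustrative names):

# sketch: compute the same Euclidean distance without rowwise()
distance_check <- group_centroids %>%
  left_join(grand_centroids) %>%
  mutate(dist_vectorized = sqrt((center_tsne_X - tsne_X)^2 +
                                (center_tsne_Y - tsne_Y)^2))

all.equal(distance_check$dist_vectorized, distance_measures$dist_from_grand_mean)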

bin_dists <- distance_measures %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

kable(bin_dists)
score_bin ci_lower ci_upper mean
high 14.9029 20.63314 17.76401
low 13.6166 19.07722 16.35866
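
multi_boot_standard() comes from the langcog package (assumed to be loaded in the setup chunk) and returns the column mean with a nonparametric bootstrap 95% CI. A rough stand-in, as a sketch with a hypothetical boot_ci() helper:

# sketch of what multi_boot_standard() is assumed to compute:
# group mean plus a percentile bootstrap 95% CI
boot_ci <- function(x, n_boot = 1000) {
  boots <- replicate(n_boot, mean(sample(x, replace = TRUE)))
  data.frame(ci_lower = quantile(boots, .025),
             ci_upper = quantile(boots, .975),
             mean = mean(x))
}

distance_measures %>%
  group_by(score_bin) %>%
  do(boot_ci(.$dist_from_grand_mean))
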
ggplot(bin_dists, aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Distance from grand centroid (bootstrapping across prompts)") +
  ylab("distance from grand centroid") +
  theme_minimal()

Model

model_data <- tsne_dims_downsampled %>%
  mutate(group = "all") %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

# these don't converge
#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) +  (score_bin|L1_code), data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin +  (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

model <- lmer(dist_from_grand_mean ~  score_bin +  (1|L1_code) + (1|prompt_id) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## dist_from_grand_mean ~ score_bin + (1 | L1_code) + (1 | prompt_id)
##    Data: model_data
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 196041.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -8.1975 -0.3182  0.0811  0.4600  7.2130 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  L1_code   (Intercept)  0.1221  0.3494  
##  prompt_id (Intercept) 52.4049  7.2391  
##  Residual              10.9067  3.3025  
## Number of obs: 37440, groups:  L1_code, 35; prompt_id, 28
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  18.28218    1.36955   13.35
## score_binlow -1.25799    0.03418  -36.81
## 
## Correlation of Fixed Effects:
##             (Intr)
## score_binlw -0.012

Low-scoring essays are closer to the grand centroid than high-scoring essays (the score_binlow coefficient is negative).

4.2 Variance

Here are scatter plots of the downsampled essays for each prompt, colored by score bin.

ggplot(tsne_dims_downsampled %>% unnest(), 
       aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id, nrow = 4) + 
  geom_point(size = .2, alpha = .3) +
  theme_minimal()

Here, diameter is defined as the average distance from each point to all other points in the same group:
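
get_group_diameter() is defined in a shared helper script that isn't shown here; its signature is inferred from the calls below. A minimal sketch of what it is assumed to compute (the mean pairwise distance among a group's doc vectors, labeled with the group name; only the "mean_dist" variant used in this analysis is sketched):

# sketch of the assumed helper (the real definition lives in a helpers file):
# mean pairwise Euclidean distance among the doc vectors within one group
get_group_diameter <- function(doc_vecs_df, group_name, method = "mean_dist") {
  coords <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    as.matrix()

  pairwise <- dist(coords, method = "euclidean") # all pairwise distances

  out <- data.frame(diameter = mean(pairwise))
  out[[group_name]] <- doc_vecs_df[[group_name]][1]
  out[, c(group_name, "diameter")]
}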

tsne_diameters_LA_mean <- tsne_dims_downsampled %>%
  split(.$score_prompt) %>% 
  map_df(get_group_diameter, "score_prompt", "mean_dist")  %>%
  separate(score_prompt, c("score_bin","prompt_id"), sep = "_")

tsne_diameters_LA_mean %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

With this measure, low scoring essays have more variance.

# rows in `low` and `high` are aligned by prompt_id (both come from the same
# table, which is ordered by score_prompt), so the paired test pairs prompts correctly
low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")

t.test(low$diameter, high$diameter, paired = TRUE)
## 
##  Paired t-test
## 
## data:  low$diameter and high$diameter
## t = 4.2199, df = 27, p-value = 0.0002468
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4745154 1.3726586
## sample estimates:
## mean of the differences 
##                0.923587

5 By Language

5.1 Mean

By language: here the reference point is each language's own centroid rather than the grand centroid.

grand_centroids <- tsne_centroids_BL_score_prompt %>% # SPL
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # PL
  separate(L1_prompt, c("L1_code",  "prompt_id"), "_") %>%
  nest(3:4, .key = "doc_vector") %>%
  split(.$L1_code) %>%
  map_df(get_group_centroid, "L1_code")  %>%
  rename(center_tsne_X = tsne_X,
        center_tsne_Y = tsne_Y)

distance_measures <- tsne_centroids_BL_score_prompt %>%
  unnest() %>%
  left_join(grand_centroids) %>%
  rowwise () %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

bin_dists <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

ggplot(bin_dists, aes(x = L1_code, y = mean, group = score_bin, fill = score_bin)) +
  geom_bar(stat = "identity", position = position_dodge() ) +
  xlab("Native language") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), position = position_dodge(.9)) +
  ylab("Euclidean distance from language centroid") +
  theme_minimal() +
  theme(text = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_brewer(palette = "Set1", name = "Score") 

kable(bin_dists)
L1_code score_bin ci_lower ci_upper mean
ARA high 14.50854 20.31290 17.35065
ARA low 13.42778 18.60312 15.92734
BEN high 16.05779 21.90380 18.89651
BEN low 13.78415 19.82368 16.86073
BUL high 14.24274 19.81432 17.21946
BUL low 13.05452 19.16474 16.12481
CHI high 15.38462 21.06080 18.21525
CHI low 13.39906 18.50594 16.04284
DUT high 14.96819 20.59136 17.85774
DUT low 12.82454 18.80713 15.88590
ENG high 15.35845 21.17618 18.16705
ENG low 14.13516 19.40165 16.76981
FAS high 14.14614 20.07018 17.29799
FAS low 12.82309 18.41262 15.52350
FRE high 14.61921 20.40549 17.65681
FRE low 13.23382 18.72757 15.97212
GER high 14.97270 20.68973 17.82031
GER low 13.62390 19.17379 16.39672
GRE high 14.45674 20.36418 17.56723
GRE low 12.71918 18.38236 15.73442
GUJ high 14.36836 20.57892 17.63308
GUJ low 14.19541 19.59639 16.82177
HIN high 15.41619 21.09405 18.33358
HIN low 14.59187 20.14962 17.35096
IBO high 15.07161 21.07239 18.06111
IBO low 13.64845 18.74564 16.27427
IND high 13.95436 20.12811 17.13321
IND low 13.07838 18.61289 15.93580
ITA high 14.40358 20.30740 17.37258
ITA low 13.40478 18.74980 16.14432
JPN high 14.33399 20.21688 17.25782
JPN low 13.47457 19.01434 16.17036
KAN high 15.60864 21.75202 18.74677
KAN low 13.54344 19.61928 16.61888
KOR high 15.36967 20.57560 17.91790
KOR low 13.56424 18.81132 16.32295
MAL high 14.89661 20.97647 18.02225
MAL low 14.38193 20.32834 17.22895
MAR high 15.17457 21.11376 18.06121
MAR low 14.38692 20.03161 17.23579
NEP high 15.03110 21.51021 18.32608
NEP low 14.85312 20.22407 17.50173
PAN high 14.79346 20.67256 17.98117
PAN low 14.38446 19.61306 17.18925
POL high 14.55938 20.46271 17.58511
POL low 13.63650 18.82258 16.33064
POR high 14.66998 20.24575 17.65120
POR low 13.03345 18.45268 15.69171
RUM high 14.69363 20.56084 17.67140
RUM low 12.95566 18.20951 15.76591
RUS high 15.07623 20.55559 17.84764
RUS low 14.01565 19.44334 16.70459
SPA high 14.78474 20.35636 17.62049
SPA low 12.85185 18.51199 15.65550
TAM high 15.09655 21.02593 18.22260
TAM low 14.46361 19.94942 17.12117
TEL high 15.01908 20.73831 17.98919
TEL low 14.13862 19.65750 16.88112
TGL high 14.65664 20.06857 17.51809
TGL low 14.23585 19.65155 16.85779
THA high 15.28873 20.75167 17.81894
THA low 13.81847 19.33315 16.60962
TUR high 14.37129 20.26415 17.40229
TUR low 14.12079 19.27540 16.55622
URD high 15.20931 21.26715 18.21548
URD low 14.28098 19.57639 17.00679
VIE high 14.82609 20.32882 17.67133
VIE low 12.98558 18.37910 15.69448
YOR high 15.40881 21.29999 18.31916
YOR low 13.54515 19.39071 16.53499

Model

model_data <- tsne_dims_downsampled %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])
model_data %>%
  group_by(L1_code) %>%
  do(tidy(lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id), data=.,control=lmerControl(optimizer = "bobyqa")))) %>%
  filter(term == "score_binlow") %>%
  arrange(statistic) %>%
  kable()
L1_code term estimate std.error statistic group
BEN score_binlow -2.0482763 0.3133251 -6.5372246 fixed
JPN score_binlow -1.3008434 0.2276088 -5.7152607 fixed
POR score_binlow -1.9791854 0.3535199 -5.5985123 fixed
KOR score_binlow -1.1535076 0.2146429 -5.3740769 fixed
FRE score_binlow -1.0834534 0.2105391 -5.1460904 fixed
SPA score_binlow -1.3773475 0.2989218 -4.6077185 fixed
RUM score_binlow -1.8094388 0.4096531 -4.4170021 fixed
ENG score_binlow -1.8065191 0.4389514 -4.1155333 fixed
DUT score_binlow -1.3683001 0.3400837 -4.0234216 fixed
CHI score_binlow -1.6130749 0.4252086 -3.7936086 fixed
VIE score_binlow -1.5968721 0.4245418 -3.7614013 fixed
YOR score_binlow -1.7928780 0.4818305 -3.7209723 fixed
BUL score_binlow -1.4105576 0.3802423 -3.7096282 fixed
TAM score_binlow -1.3301323 0.3595957 -3.6989661 fixed
URD score_binlow -1.1841224 0.3247621 -3.6461227 fixed
GER score_binlow -1.2124774 0.3427081 -3.5379300 fixed
ARA score_binlow -1.4748536 0.4258047 -3.4636853 fixed
POL score_binlow -1.1744431 0.3450250 -3.4039360 fixed
GRE score_binlow -1.6208330 0.4967683 -3.2627547 fixed
TEL score_binlow -1.1784583 0.3646043 -3.2321566 fixed
FAS score_binlow -1.4194862 0.4558191 -3.1141435 fixed
IBO score_binlow -1.3715324 0.4635671 -2.9586492 fixed
THA score_binlow -1.0404292 0.3799435 -2.7383789 fixed
KAN score_binlow -1.5238304 0.5570942 -2.7353189 fixed
HIN score_binlow -0.9098248 0.3566241 -2.5512152 fixed
TUR score_binlow -0.7708002 0.3064908 -2.5149213 fixed
ITA score_binlow -0.9183046 0.3806489 -2.4124713 fixed
TGL score_binlow -0.6183860 0.2580783 -2.3961182 fixed
NEP score_binlow -0.8070802 0.3420564 -2.3594948 fixed
GUJ score_binlow -1.0060198 0.4411161 -2.2806237 fixed
MAL score_binlow -0.8865862 0.4395371 -2.0170907 fixed
RUS score_binlow -0.7835864 0.3926972 -1.9953957 fixed
IND score_binlow -0.9541267 0.5379901 -1.7735024 fixed
MAR score_binlow -0.6941345 0.4249078 -1.6336121 fixed
PAN score_binlow -0.2638990 0.3569011 -0.7394177 fixed

Across all languages, lower-scoring essays are closer to their language's centroid: the score_binlow estimate is negative for every language, though not reliably so for a few (e.g., PAN, MAR, IND).

5.1.0.1 Effect sizes:

bin_dists_low <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "low") %>%
  rename(low_sd = sd,
         low_mean = mean,
         low_n = n)

bin_dists_high <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "high") %>%
  rename(high_sd = sd,
         high_mean = mean,
         high_n = n)

all_dists = inner_join(bin_dists_low, bin_dists_high, by = "L1_code")

effect_sizes = all_dists %>%
  rowwise()  %>%
  mutate(d = compute.es::mes( high_mean, low_mean, high_sd, low_sd,  high_n * 10, low_n * 10, verbose = FALSE)$d)
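
compute.es::mes() reports Cohen's d; assuming it uses the standard pooled-SD formula, a direct computation should match up to rounding (a sketch; cohens_d() and d_manual are illustrative names):

# sketch: Cohen's d with a pooled standard deviation, for comparison with mes()
cohens_d <- function(m1, m2, s1, s2, n1, n2) {
  sd_pooled <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))
  (m1 - m2) / sd_pooled
}

all_dists %>%
  mutate(d_manual = cohens_d(high_mean, low_mean, high_sd, low_sd,
                             high_n * 10, low_n * 10)) %>%
  select(L1_code, d_manual)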

effect_sizes %>%
  select(L1_code, d) %>%
  arrange(-d) %>%
  kable()
L1_code d
CHI 0.29
KAN 0.26
VIE 0.26
BEN 0.25
DUT 0.25
POR 0.25
RUM 0.25
SPA 0.25
GRE 0.24
FAS 0.23
YOR 0.23
IBO 0.22
FRE 0.21
KOR 0.21
ENG 0.19
ARA 0.18
GER 0.18
ITA 0.16
POL 0.16
RUS 0.16
THA 0.16
IND 0.15
URD 0.15
JPN 0.14
TAM 0.14
TEL 0.14
BUL 0.13
HIN 0.12
TUR 0.11
GUJ 0.10
MAL 0.10
MAR 0.10
NEP 0.10
PAN 0.10
TGL 0.09

5.2 Variance

Diameter as the average distance to all other points:

tsne_diameters_LB_mean <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>% 
  map_df(get_group_diameter, "L1_score_prompt", "mean_dist")  %>%
  separate(L1_score_prompt, c("L1_code" ,"score_bin", "prompt_id"), sep = "_") 

tsne_diameters_LB_mean %>%
  group_by(score_bin, L1_code) %>%
  multi_boot_standard(col = "diameter", na.rm = T) %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  facet_wrap(~L1_code, nrow = 5) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal()

# Paired t-test comparing high vs. low diameter across prompts for one language,
# keeping only prompts that have both score bins.
get_uneven_tscore <- function(current_lang, tsne_diameters_LB_mean){
  m <- tsne_diameters_LB_mean %>% 
    filter(L1_code == current_lang)

  # prompts with both a high and a low bin for this language
  good_ids <- m %>%
    count(prompt_id) %>%
    as.data.frame() %>%
    filter(n == 2)

  k <- m %>%
    filter(prompt_id %in% good_ids$prompt_id) 

  t.value <- t.test(k$diameter ~ k$score_bin, paired = TRUE)$statistic
  
  data.frame(t.value, L1_code = current_lang)
}


map_df(unique(tsne_diameters_LB_mean$L1_code), get_uneven_tscore, tsne_diameters_LB_mean) %>%
  arrange(t.value) %>%
  kable()
t.value L1_code
-4.3482652 CHI
-2.7806569 IBO
-2.4520653 ITA
-2.4110134 FAS
-1.9970025 BEN
-1.9844677 URD
-1.9796668 RUM
-1.9622424 VIE
-1.9094973 FRE
-1.8241657 KAN
-1.7478173 SPA
-1.6417110 YOR
-1.4032851 GRE
-1.3187022 RUS
-1.2189869 PAN
-1.1763872 MAR
-1.1432129 KOR
-0.9390124 POL
-0.8545246 ARA
-0.8229936 GUJ
-0.7858954 HIN
-0.6728167 IND
-0.6575782 TAM
-0.5593251 TUR
-0.5472320 TEL
-0.5067167 NEP
-0.4851766 JPN
-0.2190358 THA
-0.1851714 GER
-0.0495870 MAL
0.0962739 ENG
0.1134695 POR
0.4225591 TGL
0.5786750 DUT
2.5375746 BUL

By language, there is little evidence of a variance difference: most t-values are small, with only a few languages showing a clear effect (e.g., CHI, IBO, ITA, and FAS at the negative end, BUL at the positive end).