Start

First we load the packages we want to use. We also set the number of digits in the options, so that the long tables we output later have nicer formatting.

library(pacman)
p_load(readr, kirkegaard, plyr, dplyr, gtools, purrr, curry, glmnetUtils, assertthat, rms, broom, doMC, relaxo, writexl)
options(digits = 3)

#print first n lines
print_n_lines = function(expr, n = 15) {
  #quote
  quoted_expr = quote(expr)
  
  #eval and print
  out = capture.output(print(eval(quoted_expr))) %>% str_split("\n", simplify = T)
  
  #concat again
  #don't try to print more lines than there are
  if (length(out) < n) n = length(out)
  out2 = out[1:n, ] %>% str_c(collapse = "\n")
  
  #print
  cat(out2)
  
  invisible(NULL)
}

#extract best lasso features
#coef() on a cv.glmnet fit returns a sparse matrix; slot @i holds the
#0-based row indices of the nonzero coefficients and @x their values
#https://stackoverflow.com/questions/27801130/extracting-coefficient-variable-names-from-glmnet-into-a-data-frame
extract_lasso_features = function(x) {
  tmp_coeffs <- coef(x, s = "lambda.min")
  data.frame(name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = tmp_coeffs@x)
}

#calc R2 manually
calc_r2 = function(fitted, true) {
    #R2 from sums of squares
    #sum of squared errors
    SSE = sum((true - fitted)^2)
    
    #total sum of squares
    SSTO = sum((mean(true) - true)^2)
    
    #R2
    1 - (SSE / SSTO)
}
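
As a quick sanity check, this should reproduce the R2 that lm reports (the toy fit is illustrative only, not study data):

#verify calc_r2 against lm's R2 on a toy fit
fit_check = lm(Sepal.Length ~ Sepal.Width, data = iris)
all.equal(calc_r2(fitted(fit_check), iris$Sepal.Length),
          summary(fit_check)$r.squared)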

#NA predictors of a model
NA_preds = function(x) {
  names(coef(x)[is.na(coef(x))])
}

require(doMC)
registerDoMC(cores = detectCores() - 1)

Data

We load the data from the previous study. Only a little cleaning is needed. A few of the names are unisex and have data for both sexes independently. These are marked by a suffix indicating the sex, which is redundant because there is a separate column for sex as well, so we remove it. This leaves some duplicated names. We could leave these cases in; they would act as two cases with exactly the same predictors but two different outcomes. Instead I decided to keep only the more common name of each pair. Usually one is much more common than the other.

#load data
d_orig = silence(read_csv("data/names.csv"))
origins = silence(read_csv("data/names_origin.csv"))

#merge
d = left_join(d_orig, origins, by = c("name" = "name"))

#remove those with no data
d = d[!is.na(d$S.both.no.age), ]

#rename
d$S = d$S.both.no.age

#add row ids
d$..row = 1:nrow(d)

#lowercase names
d$name_orig = d$name
d$name = str_to_lower(d$name)

#remove sex marker
d$name %<>% str_replace(pattern = " \\(\\w\\)$", "")
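
To illustrate with a hypothetical name:

#the regex strips the trailing single-letter sex marker in parentheses
str_replace("kim (m)", " \\(\\w\\)$", "") #"kim"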

#remove smallest of duplicate name pairs
find_dups = function(x) {
  dups = duplicated(x) | duplicated(x, fromLast = T)

  split(which(dups), x[dups])
}

#keep largest of the pairs
invisible(map_lgl(names(find_dups(d$name)), function(x) {
  # browser()
  
  #find where the dups are
  drows = d %>% dplyr::filter(name == x)
  
  #which is smallest?
  smallest = which.min(drows$number)
  
  #which row is that?
  row_to_del = drows[smallest, "..row"] %>% unlist
  
  #delete that row
  d <<- dplyr::filter(d, ..row != row_to_del)
  
  T
}))
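
A quick check that the deduplication worked:

#no duplicated names should remain
assert_that(!any(duplicated(d$name)))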

#recode to logical
d$Danish = as.logical(d$Danish)

Origin data

Recode origins data.

#recode
table2(d$main_origin_detailed) %>% print(n = Inf)
## # A tibble: 104 x 3
##     Group                            Count Percent
##     <chr>                            <dbl>   <dbl>
##   1 English                           333. 17.6   
##   2 NaN                               199. 10.5   
##   3 Danish                            195. 10.3   
##   4 German                            163.  8.62  
##   5 Swedish                           157.  8.31  
##   6 Arabic                            116.  6.14  
##   7 Turkish                            72.  3.81  
##   8 French                             70.  3.70  
##   9 Polish                             67.  3.54  
##  10 Italian                            56.  2.96  
##  11 Norwegian                          44.  2.33  
##  12 Spanish                            38.  2.01  
##  13 Russian                            27.  1.43  
##  14 Dutch                              25.  1.32  
##  15 Finnish                            19.  1.01  
##  16 Bosnian                            17.  0.899 
##  17 Irish                              17.  0.899 
##  18 Romanian                           16.  0.847 
##  19 Croatian                           14.  0.741 
##  20 Persian                            13.  0.688 
##  21 Scottish                           12.  0.635 
##  22 Czech                              11.  0.582 
##  23 Greek                              10.  0.529 
##  24 Danish (Rare)                       8.  0.423 
##  25 Hungarian                           8.  0.423 
##  26 Ukrainian                           8.  0.423 
##  27 Welsh                               8.  0.423 
##  28 Ancient Scandinavian                6.  0.317 
##  29 Biblical                            6.  0.317 
##  30 Low German                          6.  0.317 
##  31 Serbian                             6.  0.317 
##  32 Vietnamese                          6.  0.317 
##  33 Ancient Roman                       5.  0.265 
##  34 Bulgarian                           5.  0.265 
##  35 Icelandic                           5.  0.265 
##  36 Lithuanian                          5.  0.265 
##  37 Portuguese                          5.  0.265 
##  38 Scandinavian                        5.  0.265 
##  39 Various                             5.  0.265 
##  40 English (Modern)                    4.  0.212 
##  41 English (Rare)                      4.  0.212 
##  42 Hebrew                              4.  0.212 
##  43 Japanese                            4.  0.212 
##  44 Norse Mythology                     4.  0.212 
##  45 Old Danish                          4.  0.212 
##  46 Somali                              4.  0.212 
##  47 Arabic (Maghrebi)                   3.  0.159 
##  48 Chinese                             3.  0.159 
##  49 Frisian                             3.  0.159 
##  50 West Frisian                        3.  0.159 
##  51 English (Modern, Rare)              2.  0.106 
##  52 Greek Mythology                     2.  0.106 
##  53 Hindi                               2.  0.106 
##  54 Hinduism                            2.  0.106 
##  55 Indian                              2.  0.106 
##  56 Latvian                             2.  0.106 
##  57 Roman Mythology                     2.  0.106 
##  58 Slovene                             2.  0.106 
##  59 Urdu                                2.  0.106 
##  60 Afrikaans (Rare)                    1.  0.0529
##  61 Ancient Germanic                    1.  0.0529
##  62 Ancient Greek                       1.  0.0529
##  63 Ancient Persian                     1.  0.0529
##  64 Arabic (Egyptian)                   1.  0.0529
##  65 Azerbaijani                         1.  0.0529
##  66 Basque                              1.  0.0529
##  67 Belarusian                          1.  0.0529
##  68 Biblical Greek                      1.  0.0529
##  69 Biblical Latin                      1.  0.0529
##  70 Catalan                             1.  0.0529
##  71 Danish (Modern)                     1.  0.0529
##  72 Danish (Modern, Rare)               1.  0.0529
##  73 Danish (Rare, Archaic)              1.  0.0529
##  74 Dutch (Modern)                      1.  0.0529
##  75 English (American)                  1.  0.0529
##  76 English (American, Modern, Rare)    1.  0.0529
##  77 English (American, Rare)            1.  0.0529
##  78 English (Archaic)                   1.  0.0529
##  79 Estonian                            1.  0.0529
##  80 Faroese                             1.  0.0529
##  81 Fijian                              1.  0.0529
##  82 Flemish (Rare)                      1.  0.0529
##  83 Galician                            1.  0.0529
##  84 German (Archaic)                    1.  0.0529
##  85 Greek (Rare)                        1.  0.0529
##  86 Hawaiian                            1.  0.0529
##  87 Irish Mythology                     1.  0.0529
##  88 Japanese (Modern)                   1.  0.0529
##  89 Judeo-Christian Legend              1.  0.0529
##  90 Macedonian                          1.  0.0529
##  91 Manx                                1.  0.0529
##  92 Marathi                             1.  0.0529
##  93 Near Eastern Mythology              1.  0.0529
##  94 Norwegian (Rare)                    1.  0.0529
##  95 Old Swedish                         1.  0.0529
##  96 Ottoman Turkish                     1.  0.0529
##  97 Portuguese (Brazilian)              1.  0.0529
##  98 Sami                                1.  0.0529
##  99 Slovak                              1.  0.0529
## 100 South American                      1.  0.0529
## 101 Swedish (Rare)                      1.  0.0529
## 102 Thai                                1.  0.0529
## 103 West Frisian (Rare)                 1.  0.0529
## 104 <NA>                                0.  0.
#recode
#collapse temporal and other variants within the same origin
d$main_origin = d$main_origin_detailed %>% 
  #old versions in parens
  str_replace(" \\([\\w, ]+\\)", "") %>%
  #old versions in front
  str_replace("Old|Ancient ", "") %>% 
  #NaN to Unknown
  mapvalues("NaN", "Unknown") %>% 
  #gather biblicals
  mapvalues(c("Judeo-Christian Legend", "Biblical Greek", "Biblical Latin"), c("Biblical", "Biblical", "Biblical")) %>% 
  #clean whitespace
  str_trim() %>% 
  #turkish
  mapvalues("Ottoman Turkish", "Turkish") %>% 
  #Indian
  mapvalues(c("Hindi", "Hinduism", "Marathi", "Urdu"), rep("Indian", 4)) %>% 
  #Irish
  mapvalues("Irish Mythology", "Irish") %>% 
  #Greek
  mapvalues("Greek Mythology", "Greek") %>% 
  #Frisian
  mapvalues("West Frisian", "Frisian") %>% 
  #Baltic
  mapvalues(c("Lithuanian", "Latvian", "Estonian"), rep("Baltic", 3)) %>% 
  #Dutch
  mapvalues("Afrikaans", "Dutch") %>% 
  #remainder
  fct_lump(prop = 1/length(.), other_level = "Other") %>% 
  #set Danish as comparison
  fct_relevel("Danish")

table2(d$main_origin) %>% print(n = Inf)
## # A tibble: 48 x 3
##    Group           Count Percent
##    <chr>           <dbl>   <dbl>
##  1 English          347.  18.4  
##  2 Danish           210.  11.1  
##  3 Unknown          199.  10.5  
##  4 German           164.   8.68 
##  5 Swedish          159.   8.41 
##  6 Arabic           120.   6.35 
##  7 Turkish           73.   3.86 
##  8 French            70.   3.70 
##  9 Polish            67.   3.54 
## 10 Italian           56.   2.96 
## 11 Norwegian         45.   2.38 
## 12 Spanish           38.   2.01 
## 13 Dutch             27.   1.43 
## 14 Russian           27.   1.43 
## 15 Finnish           19.   1.01 
## 16 Irish             18.   0.952
## 17 Bosnian           17.   0.899
## 18 Other             17.   0.899
## 19 Romanian          16.   0.847
## 20 Croatian          14.   0.741
## 21 Greek             14.   0.741
## 22 Persian           14.   0.741
## 23 Scottish          12.   0.635
## 24 Czech             11.   0.582
## 25 Scandinavian      11.   0.582
## 26 Biblical           9.   0.476
## 27 Indian             9.   0.476
## 28 Baltic             8.   0.423
## 29 Hungarian          8.   0.423
## 30 Ukrainian          8.   0.423
## 31 Welsh              8.   0.423
## 32 Frisian            7.   0.370
## 33 Low German         6.   0.317
## 34 Portuguese         6.   0.317
## 35 Serbian            6.   0.317
## 36 Vietnamese         6.   0.317
## 37 Bulgarian          5.   0.265
## 38 Icelandic          5.   0.265
## 39 Japanese           5.   0.265
## 40 Roman              5.   0.265
## 41 Various            5.   0.265
## 42 Hebrew             4.   0.212
## 43 Norse Mythology    4.   0.212
## 44 Somali             4.   0.212
## 45 Chinese            3.   0.159
## 46 Roman Mythology    2.   0.106
## 47 Slovene            2.   0.106
## 48 <NA>               0.   0.
#distributions
GG_denhist(d, "S", "Danish") +
  scale_x_continuous("General socioeconomic score (S)")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/S_Danish_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#summary stats
describeBy(d$S, d$Danish)
## 
##  Descriptive statistics by group 
## group: FALSE
##    vars   n  mean   sd median trimmed  mad   min  max range skew kurtosis
## X1    1 591 -0.81 1.04  -0.93   -0.83 1.16 -3.33 1.86   5.2 0.23    -0.86
##      se
## X1 0.04
## -------------------------------------------------------- 
## group: TRUE
##    vars    n mean   sd median trimmed  mad   min  max range  skew kurtosis
## X1    1 1299 0.37 0.72   0.46    0.43 0.62 -3.15 2.39  5.54 -0.77     1.16
##      se
## X1 0.02
#top 10 origins, minus unknown
#have to do some recoding
main_origin_table = table2(d$main_origin)
commonest_10_origins = main_origin_table %>% .$Group %>% .[1:11] %>% setdiff("Unknown")
d10 = d %>% 
  filter(main_origin %in% commonest_10_origins)
d10 = d10 %>% 
  mutate(main_origin = fct_reorder(main_origin, d10$S))
main_origin_table10 = table2(d10$main_origin, sort_descending = NULL)

ggplot(d10 %>% filter(!is.na(main_origin)), aes(main_origin, S)) +
  geom_boxplot() +
  theme_bw() +
  scale_x_discrete("10 largest main origins", label = function(x) {x + "\nn = " + main_origin_table10 %>% .$Count}) +
  scale_y_continuous("General socioeconomic score (S)")
## Warning in stri_c(..., sep = sep, collapse = collapse, ignore_null = TRUE):
## longer object length is not a multiple of shorter object length

GG_save("figures/S_origin10_dist.png")
## Warning in stri_c(..., sep = sep, collapse = collapse, ignore_null = TRUE):
## longer object length is not a multiple of shorter object length
#S by origin
d %>% 
  group_by(main_origin) %>% 
  dplyr::summarize(
    S = median(S),
    n = n()
    ) %>%
  write_clipboard()
##        Main origin     S   N
## 1           Danish  0.74 210
## 2           Arabic -1.79 120
## 3           Baltic -0.28   8
## 4         Biblical -0.74   9
## 5          Bosnian -1.22  17
## 6        Bulgarian -0.33   5
## 7          Chinese -0.18   3
## 8         Croatian -0.44  14
## 9            Czech -0.74  11
## 10           Dutch  0.70  27
## 11         English  0.23 347
## 12         Finnish  0.64  19
## 13          French  0.35  70
## 14         Frisian  0.24   7
## 15          German  0.39 164
## 16           Greek  0.28  14
## 17          Hebrew -0.54   4
## 18       Hungarian  0.00   8
## 19       Icelandic -0.56   5
## 20          Indian -0.21   9
## 21           Irish  0.14  18
## 22         Italian  0.04  56
## 23        Japanese -0.02   5
## 24      Low German  1.27   6
## 25 Norse Mythology  0.43   4
## 26       Norwegian  0.64  45
## 27         Persian -1.36  14
## 28          Polish -1.55  67
## 29      Portuguese -0.03   6
## 30           Roman  1.09   5
## 31        Romanian -1.58  16
## 32 Roman Mythology  0.35   2
## 33         Russian -0.27  27
## 34    Scandinavian  0.34  11
## 35        Scottish  0.58  12
## 36         Serbian -0.97   6
## 37         Slovene  0.33   2
## 38          Somali -2.79   4
## 39         Spanish -0.54  38
## 40         Swedish  0.57 159
## 41         Turkish -1.21  73
## 42       Ukrainian -0.85   8
## 43         Unknown  0.60 199
## 44         Various  0.20   5
## 45      Vietnamese -0.14   6
## 46           Welsh  0.16   8
## 47           Other  0.70  17
#figure comparison
matched = readxl::read_xlsx("data/origin_national_group.xlsx")

GG_scatter(matched, "S_names", "S_official", case_names = "Main origin", repel_names = T)

GG_save("figures/matched.png")

n-gram preparation

We want to use n-grams to predict the outcomes for the names. First we make a vector of the Danish letters, which are the same as the English letters plus three additional vowels (æ, ø, å). After that, we make a function that outputs all the n-grams of the given lengths based on combinatorics (permutations with replacement).

#letters
danish_letters = c(letters[1:26], "æ", "ø", "å")

#ngram func
make_ngrams = function(n, letters = danish_letters) {
  
  inner_func = function(n) {
    #make all strings of length n over the alphabet (repeats allowed, order matters)
    df = gtools::permutations(n = length(letters), r = n, v = letters, repeats.allowed = T)
    
    #collapser
    collapser = curry::partial(str_c, list(collapse = ""))
    
    #collapse rows
    plyr::aaply(df, .margins = 1, .fun = function(row) {
      do.call(collapser, args = list(x = row))
    })
  }
  
  #vectorize over the requested lengths
  do.call(c, map(n, inner_func))
}

#example
make_ngrams(1:2)[1:100]
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  "a"  "å"  "æ"  "b"  "c"  "d"  "e"  "f"  "g"  "h"  "i"  "j"  "k"  "l"  "m" 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29    1 
##  "n"  "o"  "ø"  "p"  "q"  "r"  "s"  "t"  "u"  "v"  "w"  "x"  "y"  "z" "aa" 
##    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## "aå" "aæ" "ab" "ac" "ad" "ae" "af" "ag" "ah" "ai" "aj" "ak" "al" "am" "an" 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
## "ao" "aø" "ap" "aq" "ar" "as" "at" "au" "av" "aw" "ax" "ay" "az" "åa" "åå" 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46 
## "åæ" "åb" "åc" "åd" "åe" "åf" "åg" "åh" "åi" "åj" "åk" "ål" "åm" "ån" "åo" 
##   47   48   49   50   51   52   53   54   55   56   57   58   59   60   61 
## "åø" "åp" "åq" "år" "ås" "åt" "åu" "åv" "åw" "åx" "åy" "åz" "æa" "æå" "ææ" 
##   62   63   64   65   66   67   68   69   70   71 
## "æb" "æc" "æd" "æe" "æf" "æg" "æh" "æi" "æj" "æk"

n-gram analysis

First, we set a tuning parameter that determines how we prune the n-grams. The idea is that most of the possible n-grams never appear in the names we have, and so are useless (zero variance). The parameter excludes all n-grams that do not appear at least n times.

To avoid testing a very large number of n-grams, we restrict ourselves to combinations of length 1-3, and also examine those that occur at the beginning or end of names, such as names that begin with “chr” or end with “er”. To get there, we generate all the possible n-grams up to the desired length. Then we prune the ones that do not occur often enough according to our tuning parameter. Then, as a simple initial test for any predictive validity, we run a t-test for each n-gram on the data using social status as the outcome. Finally, we plot the distributions of the p and d values. The null hypothesis of no relevant variation is a flat (uniform) p-curve and a normal distribution of d values symmetric around 0.
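
As a check on the counts below: with 29 letters there are 29^n possible n-grams of length n, and the set is tripled by adding the ^- and $-anchored variants.

#29 + 841 + 24389 = 25259 unanchored n-grams, times 3 = 75777
sum(29^(1:3)) * 3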

#maximum n-gram length to consider
ngram_length = 3

#make list of all relevant n-grams
ngrams = c(make_ngrams(1:ngram_length),
           "^" + make_ngrams(1:ngram_length),
           make_ngrams(1:ngram_length) + "$"
           )

#how many?
length(ngrams)
## [1] 75777
#inspect a random selection
set.seed(1)
sample(ngrams, size = 30)
##  19250                       14413                               3812 
##  "twt" "^æli" "^rnu" "rså$"  "øbz" "qvh$" "vae$" "^zlm" "^wør"  "cnk" 
##  14737  12507                                                         
##  "ønc"  "mwf" "atx$" "^bnm" "gfe$" "^lte" "bns$" "zff$" "^bcu" "gxy$" 
##         15202          8642  19373           145                      
## "uæs$"  "pæd" "^yph"  "ifz"  "uåa" "^bro"  "acz" "^biu" "øeq$"  "^oz"
#df
ngram_df = data_frame(
  ngram = ngrams,
  
  #count matches
  times = map_int(ngrams, function(x) {
  sum(stringr::str_detect(d$name, pattern = x))
  })
)

#subset: keep only n-grams that occur at least 5 times
ngrams_chosen = ngram_df %>% filter(times >= 5) %>% .$ngram

#list of functions to score a given name for each feature
l_feature_funcs = list()

#add ngram functions
for (ng in ngrams_chosen) {
  #replace illegal chrs
  colname = str_replace(ng, "\\^", "_") %>% str_replace("\\$", "_")
  
  #prepend a dot to avoid syntactically invalid names
  colname = "." + colname
  
  #save
  l_feature_funcs[[colname]] = curry::partial(str_detect, args = list(pattern = ng))
}
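
For instance, the anchored pattern "^chr" becomes the feature function "._chr" (it survives pruning and reappears in the coefficient listings further down):

#example: an anchored pattern flags names that start with "chr"
l_feature_funcs[["._chr"]]("christian") #TRUE
l_feature_funcs[["._chr"]]("kristian") #FALSE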

#other feature funcs
l_feature_funcs[["length"]] = str_length

l_feature_funcs[["vowel_frac"]] = function(x) {
  #count vowels
  .matches = str_match_all(x, pattern = "[aeiouæøå]")
  
  #fraction of length
  nrow(.matches[[1]]) / str_length(x)
}

l_feature_funcs[["stop_frac"]] = function(x) {
  #count stops
  .matches = str_match_all(x, pattern = "[tdpbkg]")
  
  #fraction of length
  nrow(.matches[[1]]) / str_length(x)
}

l_feature_funcs[["nasal_frac"]] = function(x) {
  #count nasals
  .matches = str_match_all(x, pattern = "[mn]")
  
  #fraction of length
  nrow(.matches[[1]]) / str_length(x)
}

l_feature_funcs[["has_dash"]] = function(x) {
  str_detect(x, pattern = "-")
}

#vector
other_features = c("length", "vowel_frac", "stop_frac", "nasal_frac", "has_dash")

#test
assert_that(l_feature_funcs[["vowel_frac"]]("emil") == 0.5)
## [1] TRUE
assert_that(are_equal(l_feature_funcs[["vowel_frac"]]("abc"), 1/3))
## [1] TRUE
#prepare a second dataset with syntactically legal column names for each regex feature
d2 = data_frame(name = d$name,
                S = d$S,
                Danish = d$Danish,
                main_origin = d$main_origin
                )

#really compact function to make the feature data frame
#loops over each name, then applies each feature function to that name
#then collects and constructs as data frame
d2 = cbind(d2, map_df(d$name, function(.name) {
  plyr::each(l_feature_funcs)(.name)
}))

#test each n-gram (and other logical features) using a t-test
ngram_tests = map2_df(l_feature_funcs, names(l_feature_funcs), function(x, y) {
  #check if output is logical, if not, skip
  if (!is.logical(x(d$name))) return(NULL)
  
  #split to 2 vectors
  cur = d[x(d$name), "S"] %>% unlist()
  rest = d[!x(d$name), "S"] %>% unlist()
  
  #t-test
  t = t.test(cur, rest)
  data_frame(pattern = y, #the pattern name
             #the p value
             p = t$p.value,
             #Bonferroni-corrected p value
             p_cor = pmin(p * length(ngrams_chosen), 1),
             #d value to other names
             d = round(t$estimate[1] - t$estimate[2], 2),
             #number of names with pattern
             n = length(cur)
             )
})

#plots
#p
GG_denhist(ngram_tests$p, vline = NULL) + 
  scale_x_continuous("p value") + 
  theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/p_val_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#under cutoffs
mean(ngram_tests$p < .05)
## [1] 0.426
mean(ngram_tests$p < .01)
## [1] 0.294
mean(ngram_tests$p < .005)
## [1] 0.246
mean(ngram_tests$p < .001)
## [1] 0.16
#describe
describe(ngram_tests$p)
##    vars    n mean  sd median trimmed  mad min max range skew kurtosis   se
## X1    1 1095 0.25 0.3   0.09     0.2 0.14   0   1     1 1.01    -0.35 0.01
#effect size distribution
GG_denhist(ngram_tests$d, vline = NULL) + 
  scale_x_continuous("d value") + 
  theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/d_val_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#correlation of rarity with effect size
ngram_tests$abs_d = abs(ngram_tests$d)

GG_scatter(ngram_tests, "n", "d")

GG_scatter(ngram_tests, "n", "abs_d")

ngram_tests %>% filter(abs_d > 1) %>% pull(n) %>% max()
## [1] 71
ngram_tests %>% filter(abs_d > 1) %>% pull(n) %>% length()
## [1] 93
#bottom and top 10 names
d %>% select(name, S) %>% arrange(S) %>% head(10) %>% write_clipboard()
##         Name     S
## 1     fadumo -3.33
## 2       abdi -3.15
## 3  abdullahi -2.99
## 4      fatme -2.96
## 5     mehmed -2.88
## 6   mustapha -2.86
## 7     karima -2.81
## 8      souad -2.78
## 9      salah -2.72
## 10    halima -2.64
d %>% select(name, S) %>% arrange(desc(S)) %>% head(10) %>% write_clipboard()
##        Name    S
## 1   lauritz 2.39
## 2   alberte 2.31
## 3    gustav 2.19
## 4   villads 2.18
## 5    eskild 2.16
## 6     lauge 2.13
## 7    trille 2.02
## 8  valdemar 2.00
## 9    alfred 2.00
## 10 jens-ole 1.99
#very negative patterns
ngram_tests %>% arrange(d) %>% head(20) %>% pull(n) %>% mean()
## [1] 6.15
ngram_tests %>% arrange(d) %>% head(100) %>% pull(n) %>% mean()
## [1] 9.68
ngram_tests$n %>% mean()
## [1] 28.4
#test all name features using OLS with control for main origin
ngram_tests_control = map_df(ngram_tests$pattern, function(x) {
  #model
  fit = ols(as.formula(sprintf("S ~ %s + main_origin", x)), data = d2)
  params = broom::tidy(summary.lm(fit))
  
  data_frame(pattern = x, #the pattern name
             #the p value
             p = params[2, 5],
             #Bonferroni-corrected p value
             p_cor = pmin(p * nrow(ngram_tests), 1),
             #regression estimate for the pattern (controlled effect size)
             es = params[2, 2],
             #standard error of the estimate
             se = params[2, 3]
             )
})

#describe
describe(ngram_tests_control$p)
##    vars    n mean  sd median trimmed  mad min max range skew kurtosis   se
## X1    1 1095 0.38 0.3   0.32    0.36 0.38   0   1     1 0.42    -1.12 0.01
mean(ngram_tests_control$p < .05)
## [1] 0.163
#before and after?
GG_scatter(data_frame(controlled_es = ngram_tests_control$es,
                      bivariate = ngram_tests$d),
           "bivariate", "controlled_es"
           )

Straightforward regression

Having seen that there seems to be some validity, we want to quantify it more precisely: how well can we predict social status from all the features combined? We first try OLS, chiefly to see how large the overfitting is.
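
The gap between R2 and adjusted R2 in the outputs below is a first indication of the overfitting. The adjusted value reported for the first model (0.582, versus a raw R2 of 0.825) can be reproduced by hand:

#adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
#n = 1890 observations, p = 1099 predictors
1 - (1 - .825) * (1890 - 1) / (1890 - 1099 - 1) #about 0.582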

#make a model
#difficult due to syntactically illegal column names
#http://stackoverflow.com/questions/29555473/creating-formula-using-very-long-strings-in-r
preds_legal = ("." + ngrams_chosen) %>% str_replace("[\\^\\$]", "_")
preds_legal_quoted = sprintf("`%s`", preds_legal)
f = formula(paste("S ~ ", str_c(c(preds_legal_quoted, other_features), collapse = " + ")))
length(c(preds_legal_quoted, other_features))
## [1] 1099
#OLS
ols_fit = rms::ols(f, data = d2, x = T, y = T)
ols_fit %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f, data = d2, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1890    LR chi2    3294.63    R2       0.825    
##  sigma0.6183    d.f.          1099    R2 adj   0.582    
##  d.f.    790    Pr(> chi2)  0.0000    g        1.009    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.675e+00 -2.448e-01  6.440e-15  2.646e-01  1.738e+00 
## 
# validate(ols_fit)

#remove preds with NA estimates
#tidy(summary.lm(ols_fit)) #skips NA for some reason
(NA_coefs = names(coef(ols_fit))[is.na(coef(ols_fit))])
##  [1] ".abd"  ".ius"  ".jør"  ".kla"  ".lex"  ".moh"  ".nja"  ".ohn" 
##  [9] ".oph"  ".pau"  ".rha"  ".uis"  "._ju"  "._kr"  "._pa"  "._sv" 
## [17] "._abd" "._cat" "._chr" "._cla" "._far" "._fat" "._fra" "._fre"
## [25] "._fri" "._gun" "._hil" "._jac" "._jea" "._jen" "._jes" "._jim"
## [33] "._joh" "._jon" "._jos" "._jul" "._kam" "._kri" "._mag" "._maj"
## [41] "._mat" "._mic" "._mik" "._moh" "._pau" "._sam" "._sig" "._sus"
## [49] "._sve" "._tom" "._vic" ".ll_"  ".ud_"  ".eta_" ".ias_" ".ida_"
## [57] ".ild_" ".ita_" ".lie_" ".lly_" ".med_" ".ndy_" ".ner_" ".nia_"
## [65] ".nny_" ".ørn_" ".sie_" ".tta_" ".usz_" ".yna_"
# if (interactive()) ngram_tests %>% filter(pattern %in% NA_coefs) %>% View()

#do they differ numerically?
ngram_tests %>% filter(pattern %in% NA_coefs) %>% .$n %>% describe()
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 70 7.51 2.89      6    7.04 1.48   5  16    11 1.17     0.43 0.35
ngram_tests %>% filter(!pattern %in% NA_coefs) %>% .$n %>% describe()
##    vars    n mean   sd median trimmed  mad min  max range skew kurtosis
## X1    1 1025 29.8 80.8     10    14.5 7.41   5 1154  1149 8.48       89
##      se
## X1 2.53
ngram_tests %>% filter(pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 70 0.64 0.59   0.48    0.53 0.33   0 2.44  2.44 1.56     1.72 0.07
ngram_tests %>% filter(!pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
##    vars    n mean   sd median trimmed  mad min  max range skew kurtosis
## X1    1 1025 0.47 0.39   0.38    0.41 0.34   0 2.44  2.44 1.62     3.48
##      se
## X1 0.01
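
So the predictors with NA estimates are much rarer on average (mean n of about 7.5 versus about 30), which fits rank deficiency among rare, overlapping patterns; their bivariate effect sizes are, if anything, somewhat larger (mean |d| 0.64 versus 0.47), so dropping them is a concession to estimability rather than irrelevance.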
nonNA_preds = preds_legal %>% setdiff(NA_coefs)
nonNA_preds_quoted = "`" + nonNA_preds + "`"

#refit without NA preds
f2 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + "))
length(c(nonNA_preds_quoted, other_features))
## [1] 1029
ols_fit2 = rms::ols(f2, data = d2, x = T, y = T)
ols_fit2 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f2, data = d2, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1890    LR chi2    3294.63    R2       0.825    
##  sigma0.6183    d.f.          1029    R2 adj   0.616    
##  d.f.    860    Pr(> chi2)  0.0000    g        1.009    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.675e+00 -2.448e-01  6.440e-15  2.646e-01  1.738e+00 
## 
# validate(ols_fit2)

#confirm no NA?
assert_that(!anyNA(coef(ols_fit2)))
## [1] TRUE
#control for Danish status
f3 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + Danish")
ols_fit3 = rms::ols(f3, data = d2, x = T, y = T)
ols_fit3 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f3, data = d2, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1890    LR chi2    3439.35    R2       0.838    
##  sigma0.5954    d.f.          1030    R2 adj   0.644    
##  d.f.    859    Pr(> chi2)  0.0000    g        1.016    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.677e+00 -2.345e-01  1.569e-15  2.541e-01  1.586e+00 
## 
#control for main origin status
f4 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit4 = rms::ols(f4, data = d2, x = T, y = T)
ols_fit4 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f4, data = d2, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1890    LR chi2    3781.13    R2       0.865    
##  sigma0.5588    d.f.          1075    R2 adj   0.686    
##  d.f.    814    Pr(> chi2)  0.0000    g        1.028    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1.46487 -0.22749  0.00952  0.24098  1.28850 
## 
#origin only
ols_fit5 = rms::ols(S ~ Danish, data = d2, x = T, y = T)
ols_fit5 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = S ~ Danish, data = d2, x = T, y = T)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1890    LR chi2    683.09    R2       0.303    
##  sigma0.8327    d.f.            1    R2 adj   0.303    
##  d.f.   1888    Pr(> chi2) 0.0000    g        0.509    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.52190 -0.53105  0.04953  0.54654  2.67433 
## 
ols_fit6 = rms::ols(S ~ main_origin, data = d2, x = T, y = T)
ols_fit6 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = S ~ main_origin, data = d2, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1890    LR chi2    1457.66    R2       0.538    
##  sigma0.6867    d.f.            46    R2 adj   0.526    
##  d.f.   1843    Pr(> chi2)  0.0000    g        0.737    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.42287 -0.36799  0.04452  0.44856  2.70205 
## 
#delta R2 adj.
glance(summary.lm(ols_fit4))$adj.r.squared - glance(summary.lm(ols_fit6))$adj.r.squared
## [1] 0.16
#how much can we overfit pure noise?
d2$S_random = sample(d2$S)
f7 = as.formula("S_random ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit7 = rms::ols(f7, data = d2)
ols_fit7 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f7, data = d2)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1890    LR chi2    1671.86    R2       0.587    
##  sigma0.9763    d.f.          1075    R2 adj   0.042    
##  d.f.    814    Pr(> chi2)  0.0000    g        0.861    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -2.926655 -0.365621  0.007881  0.421093  2.207214 
## 

Penalized regression

Intrigued by the OLS results, we try a better method: LASSO. The approach is repeated train/test splitting: 1) split the data into training and test subsets, 2) use cross-validation on the training set to estimate the optimal shrinkage parameter and fit the model, and 3) evaluate that model on the held-out test set. Within each run we also refit OLS using only the LASSO-selected features (a hand-rolled relaxed LASSO) and evaluate it the same way. Because each test set is fairly small, we repeat the entire procedure a number of times.
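
The runs also cycle over five elastic net mixing values (alpha = 1 being the pure LASSO), which appear as the groups in the summaries further down:

#the alpha values cycled over in the loop below
seq(.1, 1, length.out = 5) #0.100 0.325 0.550 0.775 1.000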

#matrix form
x_mat = stats::model.matrix(f4, data = d2) %>% 
  #remove intercept
  #pointless when Y is scaled
  .[, -1]
x_mat_scaled = scale(x_mat)

#ordinary lasso
LASSO_fit = glmnet::glmnet(x_mat, y = d2$S)
plot(LASSO_fit)

#cv
set.seed(1)
LASSO_cv = cv.glmnet(x_mat, y = d2$S, parallel = T)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loaded glmnet 2.0-16
## 
## Attaching package: 'glmnet'
## The following objects are masked from 'package:glmnetUtils':
## 
##     cv.glmnet, glmnet
plot(LASSO_cv)

#optimal penalty
set.seed(1)
runs_results = list()
d2[["...id"]] = seq_along_rows(d2)
runs = 5000
f5 = as.formula("S ~ " + str_c(c(preds_legal_quoted, other_features), collapse = " + ") + " + main_origin")

#load from disk to save time, e.g. when re-knitting for Rpubs
if (file.exists("data/lasso_runs.rds")) {
  runs_results = read_rds("data/lasso_runs.rds")
} else {
  time_start = lubridate::now()
  print(sprintf("Loop started at %s", as.character(time_start)))
  for (i in 1:runs) {
    message(sprintf("run: %d of %d", i, runs))
    #estimate shrinkage param
    
    #pick alpha value
    run_alpha = seq(.1, 1, length.out = 5)[(i %% 5) + 1]
  
    #test and train samples
    test_rows = sample(nrow(d2), 50)
    run_test = d2[test_rows, ]
    run_train = d2[setdiff(1:nrow(d2), test_rows), ]
    
    #make matrix for train
    run_x_mat_train = stats::model.matrix(f4, data = run_train)
    run_x_mat_test = stats::model.matrix(f4, data = run_test)
    
    #determine optimal lambda
    lambda_est = cv.glmnet(run_x_mat_train, y = run_train$S, alpha = run_alpha, parallel = T)
    
    #predict into test set
    run_test$pred_lasso = predict(lambda_est, newx = run_x_mat_test) %>% as.vector()
    
    #also fit OLS on selected variables
    #extract features, no intercept
    run_lasso_features = extract_lasso_features(lambda_est) %>% 
      .[-1, ] %>% 
      #no need to deal with main origin since we always include these
      filter(!str_detect(name, "main_origin")) %>% 
      #fix names
      mutate(name = str_replace(name, "TRUE", ""))
    
    #build formula
    run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name, collapse = " + ") + " + main_origin")

    #fit
    run_ols_fit = ols(run_lasso_chosen, data = run_train)
    
    #any NA? then refit without the offending predictor
    if (anyNA(coef(run_ols_fit))) {
      #find NA preds
      run_NA_preds_lasso_ols = NA_preds(run_ols_fit)
      
      #remove from formula
      run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name %>% setdiff(run_NA_preds_lasso_ols), collapse = " + ") + " + main_origin")
      
      #refit
      run_ols_fit = ols(run_lasso_chosen, data = run_train)
      
      #any NA again?
      if (anyNA(coef(run_ols_fit))) browser()
    }

    #try predict
    trial = try({run_test$pred_lasso_ols = predict(run_ols_fit, newdata = run_test)})
    #if failed, fill in NAs
    if (is_error(trial)) run_test$pred_lasso_ols = NA
    
    # #full OLS
    # all_preds = c(preds_legal_quoted, other_features, "main_origin")
    # run_f_all = as.formula("S ~ " + str_c(all_preds, collapse = " + "))
    # run_full_ols_fit = ols(run_f_all, data = run_train)
    # 
    # #remove any NA preds and refit
    # if (anyNA(coef(run_full_ols_fit))) {
    #   #nonNA preds
    #   run_NA_preds_full_ols = NA_preds(run_full_ols_fit)
    #   
    #   #non-NA
    #   run_nonNA_preds_full_ols = all_preds %>% setdiff(run_NA_preds_full_ols)
    #   
    #   #formula
    #   run_f_all2 = as.formula("S ~ " + str_c(run_nonNA_preds_full_ols, collapse = " + "))
    #   
    #   #refit
    #   run_full_ols_fit = ols(run_f_all2, data = run_train)
    #   
    #   #more NA?
    #   if (anyNA(coef(run_full_ols_fit))) browser()
    # }
    # 
    # #try predict
    # trial = try({run_test$pred_ols = predict(run_full_ols_fit, newdata = run_test)})
    # #if failed, fill in NAs
    # if (is_error(trial)) run_test$pred_ols = NA
    
    #save info
    runs_results[[i]] = list(
      run_cv = lambda_est,
      alpha = run_alpha,
      #standard lasso
      R2_test_lasso = calc_r2(run_test$pred_lasso, run_test$S),
      #ols lasso
      R2_test_lasso_ols = calc_r2(run_test$pred_lasso_ols, run_test$S),
      #predicted values
      oos_preds = run_test %>% select(pred_lasso, pred_lasso_ols, `...id`) %>% mutate(...run = i)
    )
  }
  
  time_end = lubridate::now()
  running_time = time_end - time_start
  cat(sprintf("Loop finished at %s\nrunning time of %s", as.character(time_end), capture.output(print(running_time))))
  
  #save to disk
  write_rds(runs_results, "data/lasso_runs.rds")
}

#summary stats
penalized_reg_summary = map_df(runs_results, function(x) {
  data_frame(
    R2_test_lasso = x$R2_test_lasso,
    R2_test_lasso_ols = x$R2_test_lasso_ols,
    alpha = x$alpha,
    lambda_1se = x$run_cv$lambda.1se,
    lambda_optimal = x$run_cv$lambda.min
  )
})

#results
penalized_reg_summary %>% 
  gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>% 
  mutate(alpha = factor(alpha)) %>% 
  GG_group_means("R2_test", "alpha", "predict_method") +
  scale_y_continuous(breaks = seq(0, 1, .1)) +
  scale_fill_discrete("Predict method", labels = c("LASSO", "OLS"))
## Missing values were removed.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.

GG_save("figures/lasso_cv.png")

penalized_reg_summary %>% 
  gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>% 
  mutate(alpha = factor(alpha)) %>% 
  unite(method, alpha, predict_method) %$% 
  describeBy(R2_test, method, mat = T) %>% print(digits = 3)
##      item                  group1 vars    n  mean     sd median trimmed
## X11     1       0.1_R2_test_lasso    1 1000 0.520 0.0957  0.530   0.525
## X12     2   0.1_R2_test_lasso_ols    1  999 0.461 0.1407  0.479   0.470
## X13     3     0.325_R2_test_lasso    1 1000 0.530 0.1060  0.541   0.536
## X14     4 0.325_R2_test_lasso_ols    1  999 0.493 0.1399  0.503   0.503
## X15     5      0.55_R2_test_lasso    1 1000 0.528 0.1050  0.540   0.534
## X16     6  0.55_R2_test_lasso_ols    1  996 0.497 0.1351  0.513   0.505
## X17     7     0.775_R2_test_lasso    1 1000 0.533 0.1018  0.545   0.539
## X18     8 0.775_R2_test_lasso_ols    1  998 0.503 0.1265  0.518   0.511
## X19     9         1_R2_test_lasso    1 1000 0.531 0.1035  0.539   0.536
## X110   10     1_R2_test_lasso_ols    1  998 0.501 0.1261  0.515   0.508
##         mad     min   max range   skew kurtosis      se
## X11  0.0933  0.1276 0.733 0.606 -0.581  0.38181 0.00303
## X12  0.1330 -0.1345 0.791 0.925 -0.783  1.04040 0.00445
## X13  0.1060  0.0045 0.753 0.749 -0.635  0.67044 0.00335
## X14  0.1351 -0.4046 0.790 1.194 -0.869  1.86951 0.00443
## X15  0.1059 -0.0221 0.799 0.821 -0.603  0.63959 0.00332
## X16  0.1274 -0.1053 0.819 0.924 -0.715  0.76864 0.00428
## X17  0.1001  0.1568 0.757 0.600 -0.585  0.21067 0.00322
## X18  0.1258 -0.1074 0.772 0.880 -0.674  0.56308 0.00400
## X19  0.0997  0.1329 0.799 0.666 -0.546  0.50924 0.00327
## X110 0.1179  0.0693 0.781 0.712 -0.513 -0.00968 0.00399
#one time rename
# for (i in seq_along(runs_results_dk)) {
#   runs_results_dk[[i]][["oos_preds"]] = runs_results_dk[[i]][["standard_lasso_preds"]]
#   runs_results_dk[[i]][["standard_lasso_preds"]] = NULL
# }

#oos predictions from runs
oos_data = map_df(runs_results, "oos_preds")

#take one random prediction per observation
set.seed(1)
oos_data_rand1 = oos_data %>% 
  #group by id
  group_by(...id) %>% 
  #select 1 value at random
  do(sample_n(., 1)) %>% 
  #join with true values
  full_join(d2 %>% select(S, ...id, Danish), by = "...id")

oos_data_mean = oos_data %>% 
  #group by id
  group_by(...id) %>% 
  #average the predictions
  dplyr::summarize(pred_lasso = mean(pred_lasso)) %>% 
  #join with true values
  full_join(d2 %>% select(S, ...id, Danish), by = "...id")

#plot
GG_scatter(oos_data_rand1, "pred_lasso", "S", color = "Danish") +
  scale_x_continuous("Predicted S") +
  scale_color_discrete("Is Danish?")

GG_scatter(oos_data_mean, "pred_lasso", "S", color = "Danish") +
  scale_x_continuous("Predicted S") +
  scale_color_discrete("Is Danish?")

GG_save("figures/lasso_oos_scatter.png")

#predict from full lasso
d2$S_lasso_full = predict(LASSO_cv, newx = x_mat) %>% as.vector()

GG_scatter(d2, "S_lasso_full", "S", color = "Danish", alpha = .5) +
  scale_x_continuous("Predicted S score") +
  scale_color_discrete("Is Danish?")

GG_save("figures/lasso_full_cv.png")

#R2 from cor
cor(d2$S_lasso_full, d2$S)^2
## [1] 0.636
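
Note that this R2 is based on predictions from a model fit to the full data, so it is unsurprisingly higher than the out-of-sample estimates from the repeated splits (around .52-.53 for the LASSO).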

Relaxed LASSO
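
The relaxed LASSO refits the variables selected by the LASSO under a second, weaker penalty, which reduces the shrinkage bias in the coefficients; here we try Meinshausen's relaxo implementation.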

#this takes forever because it's not parallelized!
#relaxed lasso

#load from disk to save time, e.g. when re-knitting for Rpubs
if (file.exists("data/relaxo_runs.rds")) {
  RLASSO_cv = read_rds("data/relaxo_runs.rds")
} else {
  RLASSO_fit = relaxo::relaxo(x_mat_scaled, d2$S)
  plot(RLASSO_fit)
  
  #cv
  set.seed(1)
  RLASSO_cv = cvrelaxo(x_mat_scaled, d2$S)
  
  #save
  write_rds(RLASSO_cv, "data/relaxo_runs.rds")
}

Danish subset analysis

Since we only have 2 groups, we can be lazy and copy-paste code instead of abstracting all the functionality.

Data preparation

The predictor set needs to be reduced because of the smaller sample: many n-grams that previously matched 5 or more names no longer do.

#dk subset
dk = d %>% filter(Danish)

#how many
nrow(dk)
## [1] 1299
#make list of all relevant n-grams
ngrams_dk = c(make_ngrams(1:ngram_length),
           "^" + make_ngrams(1:ngram_length),
           make_ngrams(1:ngram_length) + "$"
           )

#df
ngram_df_dk = data_frame(
  ngram = ngrams_dk,
  
  #count matches
  times = map_int(ngrams_dk, function(x) {
  sum(stringr::str_detect(dk$name, pattern = x))
  })
)

#subset: keep only n-grams that occur at least 5 times
ngrams_chosen_dk = ngram_df_dk %>% filter(times >= 5) %>% .$ngram
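
The pruning bites noticeably harder here than in the full data:

#number of surviving n-grams, Danish subset vs. full data
length(ngrams_chosen_dk) #805, per the 810 predictors below minus the 5 other features
length(ngrams_chosen) #1094, per the 1099 predictors earlier minus the 5 other features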

#list of functions to score a given name for each feature
l_feature_funcs_dk = list()

#add ngram functions
for (ng in ngrams_chosen_dk) {
  #replace illegal chrs
  colname = str_replace(ng, "\\^", "_") %>% str_replace("\\$", "_")
  
  #prepend a dot to avoid syntactically invalid names
  colname = "." + colname
  
  #save
  l_feature_funcs_dk[[colname]] = curry::partial(str_detect, args = list(pattern = ng))
}

#other feature funcs
l_feature_funcs_dk[["length"]] = str_length

l_feature_funcs_dk[["vowel_frac"]] = function(x) {
  #count vowels
  .matches = str_match_all(x, pattern = "[aeiouæøå]")
  
  #fraction of length
  nrow(.matches[[1]]) / str_length(x)
}

l_feature_funcs_dk[["stop_frac"]] = function(x) {
  #count stops
  .matches = str_match_all(x, pattern = "[tdpbkg]")
  
  #fraction of length
  nrow(.matches[[1]]) / str_length(x)
}

l_feature_funcs_dk[["nasal_frac"]] = function(x) {
  #count nasals
  .matches = str_match_all(x, pattern = "[mn]")
  
  #fraction of length
  nrow(.matches[[1]]) / str_length(x)
}

l_feature_funcs_dk[["has_dash"]] = function(x) {
  str_detect(x, pattern = "-")
}

#vector
other_features = c("length", "vowel_frac", "stop_frac", "nasal_frac", "has_dash")

#test
assert_that(l_feature_funcs_dk[["vowel_frac"]]("emil") == 0.5)
## [1] TRUE
assert_that(are_equal(l_feature_funcs_dk[["vowel_frac"]]("abc"), 1/3))
## [1] TRUE
#prepare a second dataset with syntactically legal column names for each regex feature
d2_dk = data_frame(name = dk$name,
                S = dk$S,
                Danish = dk$Danish,
                main_origin = dk$main_origin
                )

#really compact function to make the feature data frame
#loops over each name, then applies each feature function to that name
#then collects and constructs as data frame
d2_dk = cbind(d2_dk, 
              map_df(dk$name, function(.name) {
  plyr::each(l_feature_funcs_dk)(.name)
}))

#test each n-gram (and other logical features) using a t-test
ngram_tests_dk = map2_df(l_feature_funcs_dk, names(l_feature_funcs_dk), function(x, y) {
  #check if output is logical, if not, skip
  if (!is.logical(x(dk$name))) return(NULL)
  
  #split to 2 vectors
  cur = dk[x(dk$name), "S"] %>% unlist()
  rest = dk[!x(dk$name), "S"] %>% unlist()
  
  #t-test
  t = t.test(cur, rest)
  data_frame(pattern = y, #the pattern name
             #the p value
             p = t$p.value,
             #Bonferroni-corrected p value
             p_cor = pmin(p * length(ngrams_chosen_dk), 1),
             #d value to other names
             d = round(t$estimate[1] - t$estimate[2], 2),
             #number of names with pattern
             n = length(cur)
             )
})

#plots
#p
GG_denhist(ngram_tests_dk$p, vline = NULL) + 
  scale_x_continuous("p value") + 
  theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/p_val_dist_dk.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#under cutoffs
mean(ngram_tests_dk$p < .05)
## [1] 0.259
mean(ngram_tests_dk$p < .01)
## [1] 0.136
mean(ngram_tests_dk$p < .005)
## [1] 0.105
mean(ngram_tests_dk$p < .001)
## [1] 0.0596
#describe
describe(ngram_tests_dk$p)
##    vars   n mean  sd median trimmed mad min max range skew kurtosis   se
## X1    1 806 0.32 0.3   0.22    0.28 0.3   0   1     1 0.77    -0.65 0.01
#d
GG_denhist(ngram_tests_dk$d, vline = NULL) + 
  scale_x_continuous("d value") + 
  theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/d_val_dist_dk.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#bottom and top 10 names
dk %>% select(name, S) %>% arrange(S) %>% head(10) %>% write_clipboard()
##        Name     S
## 1      abdi -3.15
## 2      amal -2.54
## 3      esme -2.14
## 4     hanan -2.12
## 5     manal -2.12
## 6   valborg -2.11
## 7    mohsen -2.02
## 8     layla -1.94
## 9  nielsine -1.93
## 10  anelise -1.80
dk %>% select(name, S) %>% arrange(desc(S)) %>% head(10) %>% write_clipboard()
##        Name    S
## 1   lauritz 2.39
## 2   alberte 2.31
## 3    gustav 2.19
## 4   villads 2.18
## 5    eskild 2.16
## 6     lauge 2.13
## 7    trille 2.02
## 8  valdemar 2.00
## 9    alfred 2.00
## 10 jens-ole 1.99

OLS regression

#make a model
#difficult due to syntactically illegal column names
#http://stackoverflow.com/questions/29555473/creating-formula-using-very-long-strings-in-r
preds_legal = ("." + ngrams_chosen_dk) %>% str_replace("[\\^\\$]", "_")
preds_legal_quoted = sprintf("`%s`", preds_legal)
f = formula(paste("S ~ ", str_c(c(preds_legal_quoted, other_features), collapse = " + ")))
length(c(preds_legal_quoted, other_features))
## [1] 810
#OLS
ols_fit = rms::ols(f, data = d2_dk, x = T, y = T)
ols_fit %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f, data = d2_dk, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1299    LR chi2    1909.72    R2       0.770    
##  sigma0.5312    d.f.           810    R2 adj   0.389    
##  d.f.    488    Pr(> chi2)  0.0000    g        0.700    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.624e+00 -2.134e-01  1.521e-15  2.231e-01  1.509e+00 
## 
# validate(ols_fit)

#remove preds with NA estimates
#tidy(summary.lm(ols_fit)) #skips NA for some reason
(NA_coefs = names(coef(ols_fit))[is.na(coef(ols_fit))])
##  [1] ".alt"  ".bri"  ".jør"  ".liz"  ".lou"  ".nja"  ".oha"  ".ohn" 
##  [9] ".oph"  ".oui"  ".pau"  ".sve"  ".uis"  "._do"  "._ji"  "._ju" 
## [17] "._kr"  "._pa"  "._sv"  "._bri" "._cat" "._cla" "._emi" "._fra"
## [25] "._fre" "._gun" "._hil" "._isa" "._jea" "._jen" "._jes" "._jim"
## [33] "._joh" "._jon" "._jos" "._kir" "._kri" "._lil" "._mag" "._mai"
## [41] "._maj" "._mat" "._mic" "._pau" "._sig" "._sil" "._sus" "._sve"
## [49] "._vic" ".ll_"  ".my_"  ".ias_" ".ild_" ".ita_" ".lie_" ".lly_"
## [57] ".nda_" ".ndy_" ".ner_" ".nny_" ".ørn_" ".sie_" ".tta_"
# if (interactive()) ngram_tests %>% filter(pattern %in% NA_coefs) %>% View()

#do they differ numerically?
ngram_tests_dk %>% filter(pattern %in% NA_coefs) %>% .$n %>% describe()
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 63    7 2.76      6    6.49 1.48   5  21    16 2.59     9.15 0.35
ngram_tests_dk %>% filter(!pattern %in% NA_coefs) %>% .$n %>% describe()
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 743 27.7 64.3     11    14.8 7.41   5 723   718 7.13     61.6 2.36
ngram_tests_dk %>% filter(pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 63 0.29 0.24   0.23    0.25 0.19   0 1.19  1.19 1.62     3.03 0.03
ngram_tests_dk %>% filter(!pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
##    vars   n mean   sd median trimmed  mad min  max range skew kurtosis
## X1    1 743 0.26 0.22   0.21    0.23 0.19   0 1.38  1.38 1.49        3
##      se
## X1 0.01
nonNA_preds = preds_legal %>% setdiff(NA_coefs)
nonNA_preds_quoted = "`" + nonNA_preds + "`"

#refit without NA preds
f2 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + "))
length(c(nonNA_preds_quoted, other_features))
## [1] 747
ols_fit2 = rms::ols(f2, data = d2_dk, x = T, y = T)
ols_fit2 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f2, data = d2_dk, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1299    LR chi2    1909.72    R2       0.770    
##  sigma0.5312    d.f.           747    R2 adj   0.458    
##  d.f.    551    Pr(> chi2)  0.0000    g        0.700    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.624e+00 -2.134e-01  1.521e-15  2.231e-01  1.509e+00 
## 
# validate(ols_fit2)

#confirm no NA?
assert_that(!anyNA(coef(ols_fit2)))
## [1] TRUE
#control for main origin status
f4 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit4 = rms::ols(f4, data = d2_dk, x = T, y = T)
ols_fit4 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f4, data = d2_dk, x = T, y = T)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1299    LR chi2    2104.35    R2       0.802    
##  sigma0.5117    d.f.           788    R2 adj   0.496    
##  d.f.    510    Pr(> chi2)  0.0000    g        0.712    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.233e+00 -1.944e-01  3.036e-16  2.047e-01  1.537e+00 
## 
#origin only
ols_fit6 = rms::ols(S ~ main_origin, data = d2_dk, x = T, y = T)
ols_fit6 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = S ~ main_origin, data = d2_dk, x = T, y = T)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1299    LR chi2    242.88    R2       0.171    
##  sigma0.6680    d.f.           41    R2 adj   0.143    
##  d.f.   1257    Pr(> chi2) 0.0000    g        0.306    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.68948 -0.32220  0.04868  0.44072  1.83790 
## 
#how much can we overfit pure noise?
d2_dk$S_random = sample(d2_dk$S)
f7 = as.formula("S_random ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit7 = rms::ols(f7, data = d2_dk)
ols_fit7 %>% print_n_lines()
## Linear Regression Model
##  
##  rms::ols(formula = f7, data = d2_dk)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1299    LR chi2    1171.85    R2       0.594    
##  sigma0.7327    d.f.           788    R2 adj  -0.033    
##  d.f.    510    Pr(> chi2)  0.0000    g        0.614    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -1.916e+00 -2.461e-01  1.813e-15  2.828e-01  1.537e+00 
## 

LASSO

#matrix form
x_mat = stats::model.matrix(f4, data = d2_dk) %>% 
  #remove intercept
  #pointless when Y is scaled
  .[, -1]
x_mat_scaled = scale(x_mat)

#ordinary lasso
LASSO_fit_dk = glmnet::glmnet(x_mat, y = d2_dk$S)
plot(LASSO_fit_dk)

#cv
set.seed(1)
LASSO_cv_dk = cv.glmnet(x_mat, y = d2_dk$S, parallel = T)
plot(LASSO_cv_dk)

#optimal penalty
set.seed(1)
runs_results_dk = list()
d2_dk[["...id"]] = seq_along_rows(d2_dk)
runs = 5000
f5 = as.formula("S ~ " + str_c(c(preds_legal_quoted, other_features), collapse = " + ") + " + main_origin")

#load from disk to save time, e.g. when re-knitting for Rpubs
if (file.exists("data/lasso_runs_dk.rds")) {
  runs_results_dk = read_rds("data/lasso_runs_dk.rds")
} else {
  time_start = lubridate::now()
  print(sprintf("Loop started at %s", as.character(time_start)))
  for (i in 1:runs) {
    message(sprintf("run: %d of %d", i, runs))
    #estimate shrinkage param
    
    #pick alpha value
    run_alpha = seq(.1, 1, length.out = 5)[(i %% 5) + 1]
  
    #test and train samples
    test_rows = sample(nrow(d2_dk), 50)
    run_test = d2_dk[test_rows, ]
    run_train = d2_dk[setdiff(1:nrow(d2_dk), test_rows), ]
    
    #make matrix for train
    run_x_mat_train = stats::model.matrix(f4, data = run_train)
    run_x_mat_test = stats::model.matrix(f4, data = run_test)
    
    #determine optimal lambda
    lambda_est = cv.glmnet(run_x_mat_train, y = run_train$S, alpha = run_alpha, parallel = T)
    
    #predict into test set
    run_test$pred_lasso = predict(lambda_est, newx = run_x_mat_test) %>% as.vector()
    
    #also fit OLS on selected variables
    #extract features, no intercept
    run_lasso_features = extract_lasso_features(lambda_est) %>% 
      .[-1, ] %>% 
      #no need to deal with main origin since we always include these
      filter(!str_detect(name, "main_origin")) %>% 
      #fix names
      mutate(name = str_replace(name, "TRUE", ""))
    
    #build formula
    run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name, collapse = " + ") + " + main_origin")

    #fit
    run_ols_fit = ols(run_lasso_chosen, data = run_train)
    
    #any NA? then refit without the offending predictor
    if (anyNA(coef(run_ols_fit))) {
      #find NA preds
      run_NA_preds_lasso_ols = NA_preds(run_ols_fit)
      
      #remove from formula
      run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name %>% setdiff(run_NA_preds_lasso_ols), collapse = " + ") + " + main_origin")
      
      #refit
      run_ols_fit = ols(run_lasso_chosen, data = run_train)
      
      #any NA again?
      if (anyNA(coef(run_ols_fit))) browser()
    }

    #try predict
    trial = try({run_test$pred_lasso_ols = predict(run_ols_fit, newdata = run_test)})
    #if failed, fill in NAs
    if (is_error(trial)) run_test$pred_lasso_ols = NA
    
    # #full OLS
    # all_preds = c(preds_legal_quoted, other_features, "main_origin")
    # run_f_all = as.formula("S ~ " + str_c(all_preds, collapse = " + "))
    # run_full_ols_fit = ols(run_f_all, data = run_train)
    # 
    # #remove any NA preds and refit
    # if (anyNA(coef(run_full_ols_fit))) {
    #   #nonNA preds
    #   run_NA_preds_full_ols = NA_preds(run_full_ols_fit)
    #   
    #   #non-NA
    #   run_nonNA_preds_full_ols = all_preds %>% setdiff(run_NA_preds_full_ols)
    #   
    #   #formula
    #   run_f_all2 = as.formula("S ~ " + str_c(run_nonNA_preds_full_ols, collapse = " + "))
    #   
    #   #refit
    #   run_full_ols_fit = ols(run_f_all2, data = run_train)
    #   
    #   #more NA?
    #   if (anyNA(coef(run_full_ols_fit))) browser()
    # }
    # 
    # #try predict
    # trial = try({run_test$pred_ols = predict(run_full_ols_fit, newdata = run_test)})
    # #if failed, fill in NAs
    # if (is_error(trial)) run_test$pred_ols = NA
    
    #save info
    runs_results_dk[[i]] = list(
      run_cv = lambda_est,
      alpha = run_alpha,
      #standard lasso
      R2_test_lasso = calc_r2(run_test$pred_lasso, run_test$S),
      #ols lasso
      R2_test_lasso_ols = calc_r2(run_test$pred_lasso_ols, run_test$S),
      #predicted values
      oos_preds = run_test %>% select(pred_lasso, pred_lasso_ols, `...id`) %>% mutate(...run = i)
    )
  }
  
  time_end = lubridate::now()
  running_time = time_end - time_start
  cat(sprintf("Loop finished at %s\nrunning time of %s", as.character(time_end), capture.output(print(running_time))))
  
  #save to disk
  write_rds(runs_results_dk, "data/lasso_runs_dk.rds")
}

#summary stats
penalized_reg_summary_dk = map_df(runs_results_dk, function(x) {
  data_frame(
    R2_test_lasso = x$R2_test_lasso,
    R2_test_lasso_ols = x$R2_test_lasso_ols,
    alpha = x$alpha,
    lambda_1se = x$run_cv$lambda.1se,
    lambda_optimal = x$run_cv$lambda.min
  )
})

#results
penalized_reg_summary_dk %>% 
  gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>% 
  mutate(alpha = factor(alpha)) %>% 
  GG_group_means("R2_test", "alpha", "predict_method") +
  scale_y_continuous(breaks = seq(0, 1, .1)) +
  scale_fill_discrete("Predict method", labels = c("LASSO", "OLS"))
## Missing values were removed.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.

GG_save("figures/lasso_cv_dk.png")

penalized_reg_summary_dk %>% 
  gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>% 
  mutate(alpha = factor(alpha)) %>% 
  unite(method, alpha, predict_method) %$% 
  describeBy(R2_test, method, mat = T) %>% print(digits = 3)
##      item                  group1 vars    n   mean     sd median trimmed
## X11     1       0.1_R2_test_lasso    1 1000 0.1724 0.0837 0.1770  0.1749
## X12     2   0.1_R2_test_lasso_ols    1  786 0.0390 0.2514 0.0753  0.0615
## X13     3     0.325_R2_test_lasso    1 1000 0.1722 0.0913 0.1723  0.1741
## X14     4 0.325_R2_test_lasso_ols    1  741 0.0577 0.2434 0.1008  0.0830
## X15     5      0.55_R2_test_lasso    1 1000 0.1687 0.0909 0.1716  0.1711
## X16     6  0.55_R2_test_lasso_ols    1  762 0.0726 0.2161 0.0931  0.0894
## X17     7     0.775_R2_test_lasso    1 1000 0.1708 0.0934 0.1727  0.1726
## X18     8 0.775_R2_test_lasso_ols    1  772 0.0804 0.2208 0.1164  0.1017
## X19     9         1_R2_test_lasso    1 1000 0.1697 0.0925 0.1716  0.1705
## X110   10     1_R2_test_lasso_ols    1  759 0.0802 0.2234 0.1022  0.0972
##         mad    min   max range   skew kurtosis      se
## X11  0.0789 -0.168 0.423 0.590 -0.414    0.779 0.00265
## X12  0.2105 -1.577 0.571 2.147 -1.456    4.965 0.00897
## X13  0.0856 -0.354 0.451 0.805 -0.424    1.423 0.00289
## X14  0.2031 -1.492 0.609 2.101 -1.614    5.255 0.00894
## X15  0.0890 -0.176 0.443 0.619 -0.276    0.253 0.00287
## X16  0.1983 -1.158 0.542 1.700 -1.008    2.286 0.00783
## X17  0.0923 -0.337 0.486 0.823 -0.280    0.649 0.00295
## X18  0.1838 -1.165 0.536 1.700 -1.203    2.692 0.00795
## X19  0.0865 -0.203 0.432 0.635 -0.176    0.326 0.00292
## X110 0.2173 -0.994 0.607 1.602 -0.852    1.293 0.00811
#oos predictions from runs
oos_data_dk = map_df(runs_results_dk, "oos_preds")

#take one random prediction per observation
set.seed(1)
oos_data_rand1_dk = oos_data_dk %>% 
  #group by id
  group_by(...id) %>% 
  #select 1 value at random
  do(sample_n(., 1)) %>% 
  #join with true values
  full_join(d2_dk %>% select(S, ...id), by = "...id")

oos_data_mean_dk = oos_data_dk %>% 
  #group by id
  group_by(...id) %>% 
  #average the predictions
  dplyr::summarize(pred_lasso = mean(pred_lasso)) %>% 
  #join with true values
  full_join(d2_dk %>% select(S, ...id), by = "...id")

#plot
GG_scatter(oos_data_rand1_dk, "pred_lasso", "S") +
  scale_x_continuous("Predicted S")

GG_scatter(oos_data_mean_dk, "pred_lasso", "S") +
  scale_x_continuous("Predicted S")

GG_save("figures/lasso_oos_scatter_dk.png")

#predict from full lasso
d2_dk$S_lasso_full = predict(LASSO_cv_dk, newx = x_mat) %>% as.vector()

GG_scatter(d2_dk, "S_lasso_full", "S", alpha = .5) +
  scale_x_continuous("Predicted S score")

GG_save("figures/lasso_full_cv_dk.png")

Output data for reuse