First, we load the packages we want to use. We also set the number of digits in the options so that long tables printed later have nicer formatting.
library(pacman)
p_load(readr, kirkegaard, plyr, dplyr, gtools, purrr, curry, glmnetUtils, assertthat, rms, broom, doMC, relaxo, writexl)
options(digits = 3)
#print first n lines
print_n_lines = function(expr, n = 15) {
#quote
quoted_expr = quote(expr)
#eval and print
out = capture.output(print(eval(quoted_expr))) %>% str_split("\n", simplify = T)
#concat again
#dont try to print more than possible
if (length(out) < n) n = length(out)
out2 = out[1:n, ] %>% str_c(collapse = "\n")
#print
cat(out2)
invisible(NULL)
}
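#e.g. print_n_lines(mtcars, n = 3) would show only the first 3 lines of the printout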
#extract best lasso features
#https://stackoverflow.com/questions/27801130/extracting-coefficient-variable-names-from-glmnet-into-a-data-frame
extract_lasso_features = function(x) {
tmp_coeffs <- coef(x, s = "lambda.min")
data.frame(name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = tmp_coeffs@x)
}
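#e.g. applied to a fitted cv.glmnet object (such as LASSO_cv below), this returns a data frame with one row
#per term that has a non-zero coefficient at lambda.min (the intercept included)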
#calc R2 manually
calc_r2 = function(fitted, true) {
#R2 from sums of squares
#sum of squared errors
SSE = sum((true - fitted)^2)
#total sum of squares
SSTO = sum((mean(true) - true)^2)
#R2
1 - (SSE / SSTO)
}
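#quick sanity checks on toy data (the built-in cars dataset): a perfect fit gives R2 = 1, and for an
#OLS fit with an intercept the result matches the R2 reported by summary.lm
assert_that(calc_r2(c(1, 2, 3), c(1, 2, 3)) == 1)
toy_fit = lm(dist ~ speed, data = cars)
assert_that(are_equal(calc_r2(fitted(toy_fit), cars$dist), summary(toy_fit)$r.squared))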
#NA predictors of a model
NA_preds = function(x) {
names(coef(x)[is.na(coef(x))])
}
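#e.g. with a perfectly collinear predictor, lm() gives an NA coefficient and NA_preds() reports it:
# toy = data.frame(y = rnorm(10), a = 1:10); toy$b = toy$a * 2
# NA_preds(lm(y ~ a + b, data = toy)) #returns "b"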
require(doMC)
registerDoMC(cores = detectCores() - 1)
We load the data from the previous study. Only a little cleaning is needed. A few of the names are unisex and have data for both sexes independently. These are marked by a suffix in the name indicating the sex, which is redundant because there is a separate column for sex, so we remove the suffix. This leaves us with some duplicated names. We could leave these cases in; they would basically act as two cases with exactly the same predictors but two different outcomes. Instead, I decided to keep only the larger (more common) member of each duplicated pair; usually one is much more common than the other.
#load data
d_orig = silence(read_csv("data/names.csv"))
origins = silence(read_csv("data/names_origin.csv"))
#merge
d = left_join(d_orig, origins, by = c("name" = "name"))
#remove those with no data
d = d[!is.na(d$S.both.no.age), ]
#rename
d$S = d$S.both.no.age
#add row ids
d$..row = 1:nrow(d)
#lowercase names
d$name_orig = d$name
d$name = str_to_lower(d$name)
#remove sex marker
d$name %<>% str_replace(pattern = " \\(\\w\\)$", "")
#remove smallest of duplicate name pairs
find_dups = function(x) {
dups = duplicated(x) | duplicated(x, fromLast = T)
split(which(dups), x[dups])
}
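#e.g. find_dups(c("anna", "bo", "anna")) returns list(anna = c(1, 3))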
#keep largest of the pairs
invisible(map_lgl(names(find_dups(d$name)), function(x) {
# browser()
#find where the dups are
drows = d %>% dplyr::filter(name == x)
#which is smallest?
smallest = which.min(drows$number)
#which row is that?
row_to_del = drows[smallest, "..row"] %>% unlist
#delete that row
d <<- dplyr::filter(d, ..row != row_to_del)
T
}))
#recode to logical
d$Danish = as.logical(d$Danish)
Recode origins data.
#inspect the detailed origins before recoding
table2(d$main_origin_detailed) %>% print(n = Inf)
## # A tibble: 104 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 English 333. 17.6
## 2 NaN 199. 10.5
## 3 Danish 195. 10.3
## 4 German 163. 8.62
## 5 Swedish 157. 8.31
## 6 Arabic 116. 6.14
## 7 Turkish 72. 3.81
## 8 French 70. 3.70
## 9 Polish 67. 3.54
## 10 Italian 56. 2.96
## 11 Norwegian 44. 2.33
## 12 Spanish 38. 2.01
## 13 Russian 27. 1.43
## 14 Dutch 25. 1.32
## 15 Finnish 19. 1.01
## 16 Bosnian 17. 0.899
## 17 Irish 17. 0.899
## 18 Romanian 16. 0.847
## 19 Croatian 14. 0.741
## 20 Persian 13. 0.688
## 21 Scottish 12. 0.635
## 22 Czech 11. 0.582
## 23 Greek 10. 0.529
## 24 Danish (Rare) 8. 0.423
## 25 Hungarian 8. 0.423
## 26 Ukrainian 8. 0.423
## 27 Welsh 8. 0.423
## 28 Ancient Scandinavian 6. 0.317
## 29 Biblical 6. 0.317
## 30 Low German 6. 0.317
## 31 Serbian 6. 0.317
## 32 Vietnamese 6. 0.317
## 33 Ancient Roman 5. 0.265
## 34 Bulgarian 5. 0.265
## 35 Icelandic 5. 0.265
## 36 Lithuanian 5. 0.265
## 37 Portuguese 5. 0.265
## 38 Scandinavian 5. 0.265
## 39 Various 5. 0.265
## 40 English (Modern) 4. 0.212
## 41 English (Rare) 4. 0.212
## 42 Hebrew 4. 0.212
## 43 Japanese 4. 0.212
## 44 Norse Mythology 4. 0.212
## 45 Old Danish 4. 0.212
## 46 Somali 4. 0.212
## 47 Arabic (Maghrebi) 3. 0.159
## 48 Chinese 3. 0.159
## 49 Frisian 3. 0.159
## 50 West Frisian 3. 0.159
## 51 English (Modern, Rare) 2. 0.106
## 52 Greek Mythology 2. 0.106
## 53 Hindi 2. 0.106
## 54 Hinduism 2. 0.106
## 55 Indian 2. 0.106
## 56 Latvian 2. 0.106
## 57 Roman Mythology 2. 0.106
## 58 Slovene 2. 0.106
## 59 Urdu 2. 0.106
## 60 Afrikaans (Rare) 1. 0.0529
## 61 Ancient Germanic 1. 0.0529
## 62 Ancient Greek 1. 0.0529
## 63 Ancient Persian 1. 0.0529
## 64 Arabic (Egyptian) 1. 0.0529
## 65 Azerbaijani 1. 0.0529
## 66 Basque 1. 0.0529
## 67 Belarusian 1. 0.0529
## 68 Biblical Greek 1. 0.0529
## 69 Biblical Latin 1. 0.0529
## 70 Catalan 1. 0.0529
## 71 Danish (Modern) 1. 0.0529
## 72 Danish (Modern, Rare) 1. 0.0529
## 73 Danish (Rare, Archaic) 1. 0.0529
## 74 Dutch (Modern) 1. 0.0529
## 75 English (American) 1. 0.0529
## 76 English (American, Modern, Rare) 1. 0.0529
## 77 English (American, Rare) 1. 0.0529
## 78 English (Archaic) 1. 0.0529
## 79 Estonian 1. 0.0529
## 80 Faroese 1. 0.0529
## 81 Fijian 1. 0.0529
## 82 Flemish (Rare) 1. 0.0529
## 83 Galician 1. 0.0529
## 84 German (Archaic) 1. 0.0529
## 85 Greek (Rare) 1. 0.0529
## 86 Hawaiian 1. 0.0529
## 87 Irish Mythology 1. 0.0529
## 88 Japanese (Modern) 1. 0.0529
## 89 Judeo-Christian Legend 1. 0.0529
## 90 Macedonian 1. 0.0529
## 91 Manx 1. 0.0529
## 92 Marathi 1. 0.0529
## 93 Near Eastern Mythology 1. 0.0529
## 94 Norwegian (Rare) 1. 0.0529
## 95 Old Swedish 1. 0.0529
## 96 Ottoman Turkish 1. 0.0529
## 97 Portuguese (Brazilian) 1. 0.0529
## 98 Sami 1. 0.0529
## 99 Slovak 1. 0.0529
## 100 South American 1. 0.0529
## 101 Swedish (Rare) 1. 0.0529
## 102 Thai 1. 0.0529
## 103 West Frisian (Rare) 1. 0.0529
## 104 <NA> 0. 0.
#recode
#merge temporal and regional variants into their main origin
d$main_origin = d$main_origin_detailed %>%
#old versions in parens
str_replace(" \\([\\w, ]+\\)", "") %>%
#old versions in front
str_replace("Old|Ancient ", "") %>%
#NaN to Unknown_other
mapvalues("NaN", "Unknown") %>%
#gather biblicals
mapvalues(c("Judeo-Christian Legend", "Biblical Greek", "Biblical Latin"), c("Biblical", "Biblical", "Biblical")) %>%
#clean whitespace
str_trim() %>%
#turkish
mapvalues("Ottoman Turkish", "Turkish") %>%
#Indian
mapvalues(c("Hindi", "Hinduism", "Marathi", "Urdu"), rep("Indian", 4)) %>%
#Irish
mapvalues("Irish Mythology", "Irish") %>%
#Greek
mapvalues("Greek Mythology", "Greek") %>%
#Frisian
mapvalues("West Frisian", "Frisian") %>%
#Baltic
mapvalues(c("Lithuanian", "Latvian", "Estonian"), rep("Baltic", 3)) %>%
#Dutch
mapvalues("Afrikaans", "Dutch") %>%
#remainder
fct_lump(prop = 1/length(.), other_level = "Other") %>%
#set Danish as comparison
fct_relevel("Danish")
table2(d$main_origin) %>% print(n = Inf)
## # A tibble: 48 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 English 347. 18.4
## 2 Danish 210. 11.1
## 3 Unknown 199. 10.5
## 4 German 164. 8.68
## 5 Swedish 159. 8.41
## 6 Arabic 120. 6.35
## 7 Turkish 73. 3.86
## 8 French 70. 3.70
## 9 Polish 67. 3.54
## 10 Italian 56. 2.96
## 11 Norwegian 45. 2.38
## 12 Spanish 38. 2.01
## 13 Dutch 27. 1.43
## 14 Russian 27. 1.43
## 15 Finnish 19. 1.01
## 16 Irish 18. 0.952
## 17 Bosnian 17. 0.899
## 18 Other 17. 0.899
## 19 Romanian 16. 0.847
## 20 Croatian 14. 0.741
## 21 Greek 14. 0.741
## 22 Persian 14. 0.741
## 23 Scottish 12. 0.635
## 24 Czech 11. 0.582
## 25 Scandinavian 11. 0.582
## 26 Biblical 9. 0.476
## 27 Indian 9. 0.476
## 28 Baltic 8. 0.423
## 29 Hungarian 8. 0.423
## 30 Ukrainian 8. 0.423
## 31 Welsh 8. 0.423
## 32 Frisian 7. 0.370
## 33 Low German 6. 0.317
## 34 Portuguese 6. 0.317
## 35 Serbian 6. 0.317
## 36 Vietnamese 6. 0.317
## 37 Bulgarian 5. 0.265
## 38 Icelandic 5. 0.265
## 39 Japanese 5. 0.265
## 40 Roman 5. 0.265
## 41 Various 5. 0.265
## 42 Hebrew 4. 0.212
## 43 Norse Mythology 4. 0.212
## 44 Somali 4. 0.212
## 45 Chinese 3. 0.159
## 46 Roman Mythology 2. 0.106
## 47 Slovene 2. 0.106
## 48 <NA> 0. 0.
#distributions
GG_denhist(d, "S", "Danish") +
scale_x_continuous("General socioeconomic score (S)")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figures/S_Danish_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#summary stats
describeBy(d$S, d$Danish)
##
## Descriptive statistics by group
## group: FALSE
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 591 -0.81 1.04 -0.93 -0.83 1.16 -3.33 1.86 5.2 0.23 -0.86
## se
## X1 0.04
## --------------------------------------------------------
## group: TRUE
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1299 0.37 0.72 0.46 0.43 0.62 -3.15 2.39 5.54 -0.77 1.16
## se
## X1 0.02
#top 10 origins, minus unknown
#have to do some recoding
main_origin_table = table2(d$main_origin)
commonest_10_origins = main_origin_table %>% .$Group %>% .[1:11] %>% setdiff("Unknown")
d10 = d %>%
filter(main_origin %in% commonest_10_origins)
d10 = d10 %>%
mutate(main_origin = fct_reorder(main_origin, d10$S))
main_origin_table10 = table2(d10$main_origin, sort_descending = NULL)
ggplot(d10 %>% filter(!is.na(main_origin)), aes(main_origin, S)) +
geom_boxplot() +
theme_bw() +
scale_x_discrete("10 largest main origins", label = function(x) {x + "\nn = " + main_origin_table10 %>% .$Count}) +
scale_y_continuous("General socioeconomic score (S)")
## Warning in stri_c(..., sep = sep, collapse = collapse, ignore_null = TRUE):
## longer object length is not a multiple of shorter object length
GG_save("figures/S_origin10_dist.png")
## Warning in stri_c(..., sep = sep, collapse = collapse, ignore_null = TRUE):
## longer object length is not a multiple of shorter object length
#S by origin
d %>%
group_by(main_origin) %>%
dplyr::summarize(
S = median(S),
n = n()
) %>%
write_clipboard()
## Main origin S N
## 1 Danish 0.74 210
## 2 Arabic -1.79 120
## 3 Baltic -0.28 8
## 4 Biblical -0.74 9
## 5 Bosnian -1.22 17
## 6 Bulgarian -0.33 5
## 7 Chinese -0.18 3
## 8 Croatian -0.44 14
## 9 Czech -0.74 11
## 10 Dutch 0.70 27
## 11 English 0.23 347
## 12 Finnish 0.64 19
## 13 French 0.35 70
## 14 Frisian 0.24 7
## 15 German 0.39 164
## 16 Greek 0.28 14
## 17 Hebrew -0.54 4
## 18 Hungarian 0.00 8
## 19 Icelandic -0.56 5
## 20 Indian -0.21 9
## 21 Irish 0.14 18
## 22 Italian 0.04 56
## 23 Japanese -0.02 5
## 24 Low German 1.27 6
## 25 Norse Mythology 0.43 4
## 26 Norwegian 0.64 45
## 27 Persian -1.36 14
## 28 Polish -1.55 67
## 29 Portuguese -0.03 6
## 30 Roman 1.09 5
## 31 Romanian -1.58 16
## 32 Roman Mythology 0.35 2
## 33 Russian -0.27 27
## 34 Scandinavian 0.34 11
## 35 Scottish 0.58 12
## 36 Serbian -0.97 6
## 37 Slovene 0.33 2
## 38 Somali -2.79 4
## 39 Spanish -0.54 38
## 40 Swedish 0.57 159
## 41 Turkish -1.21 73
## 42 Ukrainian -0.85 8
## 43 Unknown 0.60 199
## 44 Various 0.20 5
## 45 Vietnamese -0.14 6
## 46 Welsh 0.16 8
## 47 Other 0.70 17
#figure comparison
matched = readxl::read_xlsx("data/origin_national_group.xlsx")
GG_scatter(matched, "S_names", "S_official", case_names = "Main origin", repel_names = T)
GG_save("figures/matched.png")
We want to use n-grams to predict the outcomes for the names. We first make a vector of the Danish letters, which are the same as the English alphabet plus three additional vowels (æ, ø, å). After that, we make a function that outputs all possible n-grams based on combinatorics (permutations with repetition).
#letters
danish_letters = c(letters[1:26], "æ", "ø", "å")
#ngram func
make_ngrams = function(n, letters = danish_letters) {
inner_func = function(n) {
#make all ordered combinations (permutations with repetition)
df = gtools::permutations(n = length(letters), r = n, v = letters, repeats.allowed = T)
#collapser
collapser = curry::partial(str_c, list(collapse = ""))
#collapse rows
plyr::aaply(df, .margins = 1, .fun = function(row) {
do.call(collapser, args = list(x = row))
})
}
#vectorize
do.call(c, map(n, inner_func))
}
#example
make_ngrams(1:2)[1:100]
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## "a" "å" "æ" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m"
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 1
## "n" "o" "ø" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z" "aa"
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## "aå" "aæ" "ab" "ac" "ad" "ae" "af" "ag" "ah" "ai" "aj" "ak" "al" "am" "an"
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## "ao" "aø" "ap" "aq" "ar" "as" "at" "au" "av" "aw" "ax" "ay" "az" "åa" "åå"
## 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## "åæ" "åb" "åc" "åd" "åe" "åf" "åg" "åh" "åi" "åj" "åk" "ål" "åm" "ån" "åo"
## 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
## "åø" "åp" "åq" "år" "ås" "åt" "åu" "åv" "åw" "åx" "åy" "åz" "æa" "æå" "ææ"
## 62 63 64 65 66 67 68 69 70 71
## "æb" "æc" "æd" "æe" "æf" "æg" "æh" "æi" "æj" "æk"
First, we set a tuning parameter that determines how we prune the n-grams. The idea is that most of the possible n-grams never appear in the names we have, and so are useless (zero variance). The parameter excludes all n-grams that do not appear at least n times.
To avoid testing a very large number of n-grams, we restrict ourselves to n-grams of length 1-3, and we also examine those that occur at the beginning or end of names, such as names that begin with “chr” or end with “er”. To get there, we generate all the possible n-grams up to the desired length. Then we prune those that don’t occur often enough according to our tuning parameter. Then, as a simple initial test for any predictive validity, we run a t-test for each n-gram on the data using social status as the outcome. Finally, we plot the distributions of p and d values. The null hypothesis of no relevant variation is a flat (uniform) p-curve and a normal distribution of d values symmetric around 0.
#maximum n-gram length to consider
ngram_length = 3
#make list of all relevant n-grams
ngrams = c(make_ngrams(1:ngram_length),
"^" + make_ngrams(1:ngram_length),
make_ngrams(1:ngram_length) + "$"
)
#how many?
length(ngrams)
## [1] 75777
#inspect a random selection
set.seed(1)
sample(ngrams, size = 30)
## 19250 14413 3812
## "twt" "^æli" "^rnu" "rså$" "øbz" "qvh$" "vae$" "^zlm" "^wør" "cnk"
## 14737 12507
## "ønc" "mwf" "atx$" "^bnm" "gfe$" "^lte" "bns$" "zff$" "^bcu" "gxy$"
## 15202 8642 19373 145
## "uæs$" "pæd" "^yph" "ifz" "uåa" "^bro" "acz" "^biu" "øeq$" "^oz"
#df
ngram_df = data_frame(
ngram = ngrams,
#count matches
times = map_int(ngrams, function(x) {
sum(stringr::str_detect(d$name, pattern = x))
})
)
#keep only n-grams that occur at least 5 times
ngrams_chosen = ngram_df %>% filter(times >= 5) %>% .$ngram
#list of functions to score a given name for each feature
l_feature_funcs = list()
#add ngram functions
for (ng in ngrams_chosen) {
#replace illegal chrs
colname = str_replace(ng, "\\^", "_") %>% str_replace("\\$", "_")
#prepend a dot to avoid syntactically invalid names
colname = "." + colname
#save
l_feature_funcs[[colname]] = curry::partial(str_detect, args = list(pattern = ng))
}
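#each stored function is just a fixed-pattern detector, e.g. (illustrative)
# curry::partial(str_detect, args = list(pattern = "^chr"))("christian") #TRUE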
#other feature funcs
l_feature_funcs[["length"]] = str_length
l_feature_funcs[["vowel_frac"]] = function(x) {
#count vowels
.matches = str_match_all(x, pattern = "[aeiouæøå]")
#fraction of length
nrow(.matches[[1]]) / str_length(x)
}
l_feature_funcs[["stop_frac"]] = function(x) {
#count stops
.matches = str_match_all(x, pattern = "[tdpbkg]")
#fraction of length
nrow(.matches[[1]]) / str_length(x)
}
l_feature_funcs[["nasal_frac"]] = function(x) {
#count nasals
.matches = str_match_all(x, pattern = "[mn]")
#fraction of length
nrow(.matches[[1]]) / str_length(x)
}
l_feature_funcs[["has_dash"]] = function(x) {
str_detect(x, pattern = "-")
}
#vector
other_features = c("length", "vowel_frac", "stop_frac", "nasal_frac", "has_dash")
#test
assert_that(l_feature_funcs[["vowel_frac"]]("emil") == 0.5)
## [1] TRUE
assert_that(are_equal(l_feature_funcs[["vowel_frac"]]("abc"), 1/3))
## [1] TRUE
#prepare a second dataset with syntactically legal column names for each regex
d2 = data_frame(name = d$name,
S = d$S,
Danish = d$Danish,
main_origin = d$main_origin
)
#really compact function to make the feature data frame
#loops over each name, then applies each feature function to that name
#then collects and constructs as data frame
d2 = cbind(d2, map_df(d$name, function(.name) {
plyr::each(l_feature_funcs)(.name)
}))
#test all n-grams using t-tests
ngram_tests = map2_df(l_feature_funcs, names(l_feature_funcs), function(x, y) {
#check if output is logical, if not, skip
if (!is.logical(x(d$name))) return(NULL)
#split to 2 vectors
cur = d[x(d$name), "S"] %>% unlist()
rest = d[!x(d$name), "S"] %>% unlist()
#t-test
t = t.test(cur, rest)
data_frame(pattern = y, #the pattern name
#the p value
p = t$p.value,
#Bonferroni-corrected p value (capped at 1)
p_cor = pmin(p * length(ngrams_chosen), 1),
#d value to other names
d = round(t$estimate[1] - t$estimate[2], 2),
#number of names with pattern
n = length(cur)
)
})
#plots
#p
GG_denhist(ngram_tests$p, vline = NULL) +
scale_x_continuous("p value") +
theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figures/p_val_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#under cutoffs
mean(ngram_tests$p < .05)
## [1] 0.426
mean(ngram_tests$p < .01)
## [1] 0.294
mean(ngram_tests$p < .005)
## [1] 0.246
mean(ngram_tests$p < .001)
## [1] 0.16
#describe
describe(ngram_tests$p)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1095 0.25 0.3 0.09 0.2 0.14 0 1 1 1.01 -0.35 0.01
#effect size distribution
GG_denhist(ngram_tests$d, vline = NULL) +
scale_x_continuous("d value") +
theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figures/d_val_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#correlation of rarity with effect size
ngram_tests$abs_d = abs(ngram_tests$d)
GG_scatter(ngram_tests, "n", "d")
GG_scatter(ngram_tests, "n", "abs_d")
ngram_tests %>% filter(abs_d > 1) %>% pull(n) %>% max()
## [1] 71
ngram_tests %>% filter(abs_d > 1) %>% pull(n) %>% length()
## [1] 93
#top bottom 10
d %>% select(name, S) %>% arrange(S) %>% head(10) %>% write_clipboard()
## Name S
## 1 fadumo -3.33
## 2 abdi -3.15
## 3 abdullahi -2.99
## 4 fatme -2.96
## 5 mehmed -2.88
## 6 mustapha -2.86
## 7 karima -2.81
## 8 souad -2.78
## 9 salah -2.72
## 10 halima -2.64
d %>% select(name, S) %>% arrange(desc(S)) %>% head(10) %>% write_clipboard()
## Name S
## 1 lauritz 2.39
## 2 alberte 2.31
## 3 gustav 2.19
## 4 villads 2.18
## 5 eskild 2.16
## 6 lauge 2.13
## 7 trille 2.02
## 8 valdemar 2.00
## 9 alfred 2.00
## 10 jens-ole 1.99
#very negative patterns
ngram_tests %>% arrange(d) %>% head(20) %>% pull(n) %>% mean()
## [1] 6.15
ngram_tests %>% arrange(d) %>% head(100) %>% pull(n) %>% mean()
## [1] 9.68
ngram_tests$n %>% mean()
## [1] 28.4
#test all name features using OLS with control for main origin
ngram_tests_control = map_df(ngram_tests$pattern, function(x) {
#model
fit = ols(as.formula(sprintf("S ~ %s + main_origin", x)), data = d2)
params = broom::tidy(summary.lm(fit))
data_frame(pattern = x, #the pattern name
#the p value
p = params[2, 5],
#Bonferroni-corrected p value (capped at 1)
p_cor = pmin(p * nrow(ngram_tests), 1),
#regression coefficient (effect size)
es = params[2, 2],
#standard error of the coefficient
se = params[2, 3]
)
})
#describe
describe(ngram_tests_control$p)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1095 0.38 0.3 0.32 0.36 0.38 0 1 1 0.42 -1.12 0.01
mean(ngram_tests_control$p < .05)
## [1] 0.163
#before and after?
GG_scatter(data_frame(controlled_es = ngram_tests_control$es,
bivariate = ngram_tests$d),
"bivariate", "controlled_es"
)
Having seen that there seems to be some validity, we want to determine more precisely how much. How well can we predict social status from the features combined? We first try OLS, mainly to see how large the overfitting is.
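As a rough guide to the outputs below, the adjusted R2 penalizes the raw R2 for the number of predictors, which is where the overfitting shows up:
#adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
#e.g. with n = 1890 names and p = 1099 predictors, a raw R2 of 0.825 shrinks to about 0.58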
#make a model
#difficult due to syntactically illegal column names
#http://stackoverflow.com/questions/29555473/creating-formula-using-very-long-strings-in-r
preds_legal = ("." + ngrams_chosen) %>% str_replace("[\\^\\$]", "_")
preds_legal_quoted = sprintf("`%s`", preds_legal)
f = formula(paste("S ~ ", str_c(c(preds_legal_quoted, other_features), collapse = " + ")))
length(c(preds_legal_quoted, other_features))
## [1] 1099
#OLS
ols_fit = rms::ols(f, data = d2, x = T, y = T)
ols_fit %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f, data = d2, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1890 LR chi2 3294.63 R2 0.825
## sigma0.6183 d.f. 1099 R2 adj 0.582
## d.f. 790 Pr(> chi2) 0.0000 g 1.009
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.675e+00 -2.448e-01 6.440e-15 2.646e-01 1.738e+00
##
# validate(ols_fit)
#remove preds with NA estimates
#tidy(summary.lm(ols_fit)) #skips NA for some reason
(NA_coefs = names(coef(ols_fit))[is.na(coef(ols_fit))])
## [1] ".abd" ".ius" ".jør" ".kla" ".lex" ".moh" ".nja" ".ohn"
## [9] ".oph" ".pau" ".rha" ".uis" "._ju" "._kr" "._pa" "._sv"
## [17] "._abd" "._cat" "._chr" "._cla" "._far" "._fat" "._fra" "._fre"
## [25] "._fri" "._gun" "._hil" "._jac" "._jea" "._jen" "._jes" "._jim"
## [33] "._joh" "._jon" "._jos" "._jul" "._kam" "._kri" "._mag" "._maj"
## [41] "._mat" "._mic" "._mik" "._moh" "._pau" "._sam" "._sig" "._sus"
## [49] "._sve" "._tom" "._vic" ".ll_" ".ud_" ".eta_" ".ias_" ".ida_"
## [57] ".ild_" ".ita_" ".lie_" ".lly_" ".med_" ".ndy_" ".ner_" ".nia_"
## [65] ".nny_" ".ørn_" ".sie_" ".tta_" ".usz_" ".yna_"
# if (interactive()) ngram_tests %>% filter(pattern %in% NA_coefs) %>% View()
#do they differ numerically?
ngram_tests %>% filter(pattern %in% NA_coefs) %>% .$n %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 70 7.51 2.89 6 7.04 1.48 5 16 11 1.17 0.43 0.35
ngram_tests %>% filter(!pattern %in% NA_coefs) %>% .$n %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1025 29.8 80.8 10 14.5 7.41 5 1154 1149 8.48 89
## se
## X1 2.53
ngram_tests %>% filter(pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 70 0.64 0.59 0.48 0.53 0.33 0 2.44 2.44 1.56 1.72 0.07
ngram_tests %>% filter(!pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1025 0.47 0.39 0.38 0.41 0.34 0 2.44 2.44 1.62 3.48
## se
## X1 0.01
nonNA_preds = preds_legal %>% setdiff(NA_coefs)
nonNA_preds_quoted = "`" + nonNA_preds + "`"
#refit without NA preds
f2 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + "))
length(c(nonNA_preds_quoted, other_features))
## [1] 1029
ols_fit2 = rms::ols(f2, data = d2, x = T, y = T)
ols_fit2 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f2, data = d2, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1890 LR chi2 3294.63 R2 0.825
## sigma0.6183 d.f. 1029 R2 adj 0.616
## d.f. 860 Pr(> chi2) 0.0000 g 1.009
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.675e+00 -2.448e-01 6.440e-15 2.646e-01 1.738e+00
##
# validate(ols_fit2)
#confirm no NA?
assert_that(!anyNA(coef(ols_fit2)))
## [1] TRUE
#control for Danish status
f3 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + Danish")
ols_fit3 = rms::ols(f3, data = d2, x = T, y = T)
ols_fit3 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f3, data = d2, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1890 LR chi2 3439.35 R2 0.838
## sigma0.5954 d.f. 1030 R2 adj 0.644
## d.f. 859 Pr(> chi2) 0.0000 g 1.016
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.677e+00 -2.345e-01 1.569e-15 2.541e-01 1.586e+00
##
#control for main origin status
f4 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit4 = rms::ols(f4, data = d2, x = T, y = T)
ols_fit4 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f4, data = d2, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1890 LR chi2 3781.13 R2 0.865
## sigma0.5588 d.f. 1075 R2 adj 0.686
## d.f. 814 Pr(> chi2) 0.0000 g 1.028
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.46487 -0.22749 0.00952 0.24098 1.28850
##
#Danish status only
ols_fit5 = rms::ols(S ~ Danish, data = d2, x = T, y = T)
ols_fit5 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = S ~ Danish, data = d2, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1890 LR chi2 683.09 R2 0.303
## sigma0.8327 d.f. 1 R2 adj 0.303
## d.f. 1888 Pr(> chi2) 0.0000 g 0.509
##
## Residuals
##
## Min 1Q Median 3Q Max
## -3.52190 -0.53105 0.04953 0.54654 2.67433
##
ols_fit6 = rms::ols(S ~ main_origin, data = d2, x = T, y = T)
ols_fit6 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = S ~ main_origin, data = d2, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1890 LR chi2 1457.66 R2 0.538
## sigma0.6867 d.f. 46 R2 adj 0.526
## d.f. 1843 Pr(> chi2) 0.0000 g 0.737
##
## Residuals
##
## Min 1Q Median 3Q Max
## -3.42287 -0.36799 0.04452 0.44856 2.70205
##
#delta R2 adj.
glance(summary.lm(ols_fit4))$adj.r.squared - glance(summary.lm(ols_fit6))$adj.r.squared
## [1] 0.16
#how well can we fit pure noise? (permuted outcome)
d2$S_random = sample(d2$S)
f7 = as.formula("S_random ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit7 = rms::ols(f7, data = d2)
ols_fit7 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f7, data = d2)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1890 LR chi2 1671.86 R2 0.587
## sigma0.9763 d.f. 1075 R2 adj 0.042
## d.f. 814 Pr(> chi2) 0.0000 g 0.861
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.926655 -0.365621 0.007881 0.421093 2.207214
##
Intrigued by the OLS results, we try a better method: LASSO. We use a repeated procedure where we 1) split the data into training and test subsets, 2) use cross-validation on the training set to estimate the optimal shrinkage parameter (lambda) and fit the penalized model on the full training set, and 3) test that model on the held-out test set. Because each split relies on somewhat small samples, we repeat the entire procedure a number of times.
#matrix form
x_mat = stats::model.matrix(f4, data = d2) %>%
#remove intercept
#glmnet fits its own intercept
.[, -1]
x_mat_scaled = scale(x_mat)
#ordinary lasso
LASSO_fit = glmnet::glmnet(x_mat, y = d2$S)
plot(LASSO_fit)
#cv
set.seed(1)
LASSO_cv = cv.glmnet(x_mat, y = d2$S, parallel = T)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loaded glmnet 2.0-16
##
## Attaching package: 'glmnet'
## The following objects are masked from 'package:glmnetUtils':
##
## cv.glmnet, glmnet
plot(LASSO_cv)
#optimal penalty
set.seed(1)
runs_results = list()
d2[["...id"]] = seq_along_rows(d2)
runs = 5000
f5 = as.formula("S ~ " + str_c(c(preds_legal_quoted, other_features), collapse = " + ") + " + main_origin")
#load from disk to save time, e.g. when re-knitting for Rpubs
if (file.exists("data/lasso_runs.rds")) {
runs_results = read_rds("data/lasso_runs.rds")
} else {
time_start = lubridate::now()
print(sprintf("Loop started at %s", as.character(time_start)))
for (i in 1:runs) {
message(sprintf("run: %d of %d", i, runs))
#pick the elastic net mixing parameter (alpha) for this run, cycling over a small grid
run_alpha = seq(.1, 1, length.out = 5)[(i %% 5) + 1]
#test and train samples
test_rows = sample(nrow(d2), 50)
run_test = d2[test_rows, ]
run_train = d2[setdiff(1:nrow(d2), test_rows), ]
#make matrix for train
run_x_mat_train = stats::model.matrix(f4, data = run_train)
run_x_mat_test = stats::model.matrix(f4, data = run_test)
#determine optimal lambda
lambda_est = cv.glmnet(run_x_mat_train, y = run_train$S, alpha = run_alpha, parallel = T)
#predict into test set
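#note: predict() on a cv.glmnet object defaults to s = "lambda.1se", whereas extract_lasso_features() below uses lambda.min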
run_test$pred_lasso = predict(lambda_est, newx = run_x_mat_test) %>% as.vector()
#also fit OLS on selected variables
#extract features, no intercept
run_lasso_features = extract_lasso_features(lambda_est) %>%
.[-1, ] %>%
#no need to deal with main origin since we always include these
filter(!str_detect(name, "main_origin")) %>%
#fix names
mutate(name = str_replace(name, "TRUE", ""))
#build formula
run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name, collapse = " + ") + " + main_origin")
#fit
run_ols_fit = ols(run_lasso_chosen, data = run_train)
#any NA? then refit without the offending predictor
if (anyNA(coef(run_ols_fit))) {
#find NA preds
run_NA_preds_lasso_ols = NA_preds(run_ols_fit)
#remove from formula
run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name %>% setdiff(run_NA_preds_lasso_ols), collapse = " + ") + " + main_origin")
#refit
run_ols_fit = ols(run_lasso_chosen, data = run_train)
#any NA again?
if (anyNA(coef(run_ols_fit))) browser()
}
#try predict
trial = try({run_test$pred_lasso_ols = predict(run_ols_fit, newdata = run_test)})
#if failed, fill in NAs
if (is_error(trial)) run_test$pred_lasso_ols = NA
# #full OLS
# all_preds = c(preds_legal_quoted, other_features, "main_origin")
# run_f_all = as.formula("S ~ " + str_c(all_preds, collapse = " + "))
# run_full_ols_fit = ols(run_f_all, data = run_train)
#
# #remove any NA preds and refit
# if (anyNA(coef(run_full_ols_fit))) {
# #nonNA preds
# run_NA_preds_full_ols = NA_preds(run_full_ols_fit)
#
# #non-NA
# run_nonNA_preds_full_ols = all_preds %>% setdiff(run_NA_preds_full_ols)
#
# #formula
# run_f_all2 = as.formula("S ~ " + str_c(run_nonNA_preds_full_ols, collapse = " + "))
#
# #refit
# run_full_ols_fit = ols(run_f_all2, data = run_train)
#
# #more NA?
# if (anyNA(coef(run_full_ols_fit))) browser()
# }
#
# #try predict
# trial = try({run_test$pred_ols = predict(run_full_ols_fit, newdata = run_test)})
# #if failed, fill in NAs
# if (is_error(trial)) run_test$pred_ols = NA
#save info
runs_results[[i]] = list(
run_cv = lambda_est,
alpha = run_alpha,
#standard lasso
R2_test_lasso = calc_r2(run_test$pred_lasso, run_test$S),
#ols lasso
R2_test_lasso_ols = calc_r2(run_test$pred_lasso_ols, run_test$S),
#predicted values
oos_preds = run_test %>% select(pred_lasso, pred_lasso_ols, `...id`) %>% mutate(...run = i)
)
}
time_end = lubridate::now()
running_time = time_end - time_start
cat(sprintf("Loop finished at %s\nrunning time of %s", as.character(time_end), capture.output(print(running_time))))
#save to disk
write_rds(runs_results, "data/lasso_runs.rds")
}
#summary stats
penalized_reg_summary = map_df(runs_results, function(x) {
data_frame(
R2_test_lasso = x$R2_test_lasso,
R2_test_lasso_ols = x$R2_test_lasso_ols,
alpha = x$alpha,
lambda_1se = x$run_cv$lambda.1se,
lambda_optimal = x$run_cv$lambda.min
)
})
#results
penalized_reg_summary %>%
gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>%
mutate(alpha = factor(alpha)) %>%
GG_group_means("R2_test", "alpha", "predict_method") +
scale_y_continuous(breaks = seq(0, 1, .1)) +
scale_fill_discrete("Predict method", labels = c("LASSO", "OLS"))
## Missing values were removed.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
GG_save("figures/lasso_cv.png")
penalized_reg_summary %>%
gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>%
mutate(alpha = factor(alpha)) %>%
unite(method, alpha, predict_method) %$%
describeBy(R2_test, method, mat = T) %>% print(digits = 3)
## item group1 vars n mean sd median trimmed
## X11 1 0.1_R2_test_lasso 1 1000 0.520 0.0957 0.530 0.525
## X12 2 0.1_R2_test_lasso_ols 1 999 0.461 0.1407 0.479 0.470
## X13 3 0.325_R2_test_lasso 1 1000 0.530 0.1060 0.541 0.536
## X14 4 0.325_R2_test_lasso_ols 1 999 0.493 0.1399 0.503 0.503
## X15 5 0.55_R2_test_lasso 1 1000 0.528 0.1050 0.540 0.534
## X16 6 0.55_R2_test_lasso_ols 1 996 0.497 0.1351 0.513 0.505
## X17 7 0.775_R2_test_lasso 1 1000 0.533 0.1018 0.545 0.539
## X18 8 0.775_R2_test_lasso_ols 1 998 0.503 0.1265 0.518 0.511
## X19 9 1_R2_test_lasso 1 1000 0.531 0.1035 0.539 0.536
## X110 10 1_R2_test_lasso_ols 1 998 0.501 0.1261 0.515 0.508
## mad min max range skew kurtosis se
## X11 0.0933 0.1276 0.733 0.606 -0.581 0.38181 0.00303
## X12 0.1330 -0.1345 0.791 0.925 -0.783 1.04040 0.00445
## X13 0.1060 0.0045 0.753 0.749 -0.635 0.67044 0.00335
## X14 0.1351 -0.4046 0.790 1.194 -0.869 1.86951 0.00443
## X15 0.1059 -0.0221 0.799 0.821 -0.603 0.63959 0.00332
## X16 0.1274 -0.1053 0.819 0.924 -0.715 0.76864 0.00428
## X17 0.1001 0.1568 0.757 0.600 -0.585 0.21067 0.00322
## X18 0.1258 -0.1074 0.772 0.880 -0.674 0.56308 0.00400
## X19 0.0997 0.1329 0.799 0.666 -0.546 0.50924 0.00327
## X110 0.1179 0.0693 0.781 0.712 -0.513 -0.00968 0.00399
#one time rename
# for (i in seq_along(runs_results_dk)) {
# runs_results_dk[[i]][["oos_preds"]] = runs_results_dk[[i]][["standard_lasso_preds"]]
# runs_results_dk[[i]][["standard_lasso_preds"]] = NULL
# }
#oos predictions from runs
oos_data = map_df(runs_results, "oos_preds")
#take one prediction per observation at random
set.seed(1)
oos_data_rand1 = oos_data %>%
#group by id
group_by(...id) %>%
#select 1 value at random
do(sample_n(., 1)) %>%
#join with true values
full_join(d2 %>% select(S, ...id, Danish), by = "...id")
oos_data_mean = oos_data %>%
#group by id
group_by(...id) %>%
#average the predictions across runs
dplyr::summarize(pred_lasso = mean(pred_lasso)) %>%
#join with true values
full_join(d2 %>% select(S, ...id, Danish), by = "...id")
#plot
GG_scatter(oos_data_rand1, "pred_lasso", "S", color = "Danish") +
scale_x_continuous("Predicted S") +
scale_color_discrete("Is Danish?")
GG_scatter(oos_data_mean, "pred_lasso", "S", color = "Danish") +
scale_x_continuous("Predicted S") +
scale_color_discrete("Is Danish?")
GG_save("figures/lasso_oos_scatter.png")
#predict from full lasso
d2$S_lasso_full = predict(LASSO_cv, newx = x_mat) %>% as.vector()
GG_scatter(d2, "S_lasso_full", "S", color = "Danish", alpha = .5) +
scale_x_continuous("Predicted S score") +
scale_color_discrete("Is Danish?")
GG_save("figures/lasso_full_cv.png")
#R2 from cor
cor(d2$S_lasso_full, d2$S)^2
## [1] 0.636
#this takes forever because it's not parallelized!
#relaxed lasso
#load from disk to save time, e.g. when re-knitting for Rpubs
if (file.exists("data/relaxo_runs.rds")) {
relaxo_runs = read_rds("data/relaxo_runs.rds")
RLASSO_fit = relaxo_runs$RLASSO_fit
RLASSO_cv = relaxo_runs$RLASSO_cv
} else {
RLASSO_fit = relaxo::relaxo(x_mat_scaled, d2$S)
plot(RLASSO_fit)
#cv
set.seed(1)
RLASSO_cv = cvrelaxo(x_mat_scaled, d2$S)
#save the relaxed lasso fits (not the lasso runs)
write_rds(list(RLASSO_fit = RLASSO_fit, RLASSO_cv = RLASSO_cv), "data/relaxo_runs.rds")
}
Since we only have two groups, we can be lazy and copy-paste the code instead of abstracting all the functionality.
The predictor set needs to be reduced because the sample is smaller: many n-grams that previously matched at least 5 names no longer do.
#dk subset
dk = d %>% filter(Danish)
#how many
nrow(dk)
## [1] 1299
#make list of all relevant n-grams
ngrams_dk = c(make_ngrams(1:ngram_length),
"^" + make_ngrams(1:ngram_length),
make_ngrams(1:ngram_length) + "$"
)
#df
ngram_df_dk = data_frame(
ngram = ngrams_dk,
#count matches
times = map_int(ngrams_dk, function(x) {
sum(stringr::str_detect(dk$name, pattern = x))
})
)
#keep only n-grams that occur at least 5 times
ngrams_chosen_dk = ngram_df_dk %>% filter(times >= 5) %>% .$ngram
#list of functions to score a given name for each feature
l_feature_funcs_dk = list()
#add ngram functions
for (ng in ngrams_chosen_dk) {
#replace illegal chrs
colname = str_replace(ng, "\\^", "_") %>% str_replace("\\$", "_")
#prepend a dot to avoid syntactically invalid names
colname = "." + colname
#save
l_feature_funcs_dk[[colname]] = curry::partial(str_detect, args = list(pattern = ng))
}
#other feature funcs
l_feature_funcs_dk[["length"]] = str_length
l_feature_funcs_dk[["vowel_frac"]] = function(x) {
#count vowels
.matches = str_match_all(x, pattern = "[aeiouæøå]")
#fraction of length
nrow(.matches[[1]]) / str_length(x)
}
l_feature_funcs_dk[["stop_frac"]] = function(x) {
#count stops
.matches = str_match_all(x, pattern = "[tdpbkg]")
#fraction of length
nrow(.matches[[1]]) / str_length(x)
}
l_feature_funcs_dk[["nasal_frac"]] = function(x) {
#count nasals
.matches = str_match_all(x, pattern = "[mn]")
#fraction of length
nrow(.matches[[1]]) / str_length(x)
}
l_feature_funcs_dk[["has_dash"]] = function(x) {
str_detect(x, pattern = "-")
}
#vector
other_features = c("length", "vowel_frac", "stop_frac", "nasal_frac", "has_dash")
#test
assert_that(l_feature_funcs_dk[["vowel_frac"]]("emil") == 0.5)
## [1] TRUE
assert_that(are_equal(l_feature_funcs_dk[["vowel_frac"]]("abc"), 1/3))
## [1] TRUE
#prepare a second dataset with syntactically legal column names for each regex
d2_dk = data_frame(name = dk$name,
S = dk$S,
Danish = dk$Danish,
main_origin = dk$main_origin
)
#really compact function to make the feature data frame
#loops over each name, then applies each feature function to that name
#then collects and constructs as data frame
d2_dk = cbind(d2_dk,
map_df(dk$name, function(.name) {
plyr::each(l_feature_funcs_dk)(.name)
}))
#test all n-grams using t-tests
ngram_tests_dk = map2_df(l_feature_funcs_dk, names(l_feature_funcs_dk), function(x, y) {
#check if output is logical, if not, skip
if (!is.logical(x(dk$name))) return(NULL)
#split to 2 vectors
cur = dk[x(dk$name), "S"] %>% unlist()
rest = dk[!x(dk$name), "S"] %>% unlist()
#t-test
t = t.test(cur, rest)
data_frame(pattern = y, #the pattern name
#the p value
p = t$p.value,
#Bonferroni-corrected p value (capped at 1)
p_cor = pmin(p * length(ngrams_chosen_dk), 1),
#d value to other names
d = round(t$estimate[1] - t$estimate[2], 2),
#number of names with pattern
n = length(cur)
)
})
#plots
#p
GG_denhist(ngram_tests_dk$p, vline = NULL) +
scale_x_continuous("p value") +
theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figures/p_val_dist_dk.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#under cutoffs
mean(ngram_tests_dk$p < .05)
## [1] 0.259
mean(ngram_tests_dk$p < .01)
## [1] 0.136
mean(ngram_tests_dk$p < .005)
## [1] 0.105
mean(ngram_tests_dk$p < .001)
## [1] 0.0596
#describe
describe(ngram_tests_dk$p)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 806 0.32 0.3 0.22 0.28 0.3 0 1 1 0.77 -0.65 0.01
#d
GG_denhist(ngram_tests_dk$d, vline = NULL) +
scale_x_continuous("d value") +
theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figures/d_val_dist_dk.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#top bottom 10
dk %>% select(name, S) %>% arrange(S) %>% head(10) %>% write_clipboard()
## Name S
## 1 abdi -3.15
## 2 amal -2.54
## 3 esme -2.14
## 4 hanan -2.12
## 5 manal -2.12
## 6 valborg -2.11
## 7 mohsen -2.02
## 8 layla -1.94
## 9 nielsine -1.93
## 10 anelise -1.80
dk %>% select(name, S) %>% arrange(desc(S)) %>% head(10) %>% write_clipboard()
## Name S
## 1 lauritz 2.39
## 2 alberte 2.31
## 3 gustav 2.19
## 4 villads 2.18
## 5 eskild 2.16
## 6 lauge 2.13
## 7 trille 2.02
## 8 valdemar 2.00
## 9 alfred 2.00
## 10 jens-ole 1.99
#make a model
#difficult due to syntactically illegal column names
#http://stackoverflow.com/questions/29555473/creating-formula-using-very-long-strings-in-r
preds_legal = ("." + ngrams_chosen_dk) %>% str_replace("[\\^\\$]", "_")
preds_legal_quoted = sprintf("`%s`", preds_legal)
f = formula(paste("S ~ ", str_c(c(preds_legal_quoted, other_features), collapse = " + ")))
length(c(preds_legal_quoted, other_features))
## [1] 810
#OLS
ols_fit = rms::ols(f, data = d2_dk, x = T, y = T)
ols_fit %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f, data = d2_dk, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1299 LR chi2 1909.72 R2 0.770
## sigma0.5312 d.f. 810 R2 adj 0.389
## d.f. 488 Pr(> chi2) 0.0000 g 0.700
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.624e+00 -2.134e-01 1.521e-15 2.231e-01 1.509e+00
##
# validate(ols_fit)
#remove preds with NA estimates
#tidy(summary.lm(ols_fit)) #skips NA for some reason
(NA_coefs = names(coef(ols_fit))[is.na(coef(ols_fit))])
## [1] ".alt" ".bri" ".jør" ".liz" ".lou" ".nja" ".oha" ".ohn"
## [9] ".oph" ".oui" ".pau" ".sve" ".uis" "._do" "._ji" "._ju"
## [17] "._kr" "._pa" "._sv" "._bri" "._cat" "._cla" "._emi" "._fra"
## [25] "._fre" "._gun" "._hil" "._isa" "._jea" "._jen" "._jes" "._jim"
## [33] "._joh" "._jon" "._jos" "._kir" "._kri" "._lil" "._mag" "._mai"
## [41] "._maj" "._mat" "._mic" "._pau" "._sig" "._sil" "._sus" "._sve"
## [49] "._vic" ".ll_" ".my_" ".ias_" ".ild_" ".ita_" ".lie_" ".lly_"
## [57] ".nda_" ".ndy_" ".ner_" ".nny_" ".ørn_" ".sie_" ".tta_"
# if (interactive()) ngram_tests %>% filter(pattern %in% NA_coefs) %>% View()
#do they differ numerically?
ngram_tests_dk %>% filter(pattern %in% NA_coefs) %>% .$n %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 63 7 2.76 6 6.49 1.48 5 21 16 2.59 9.15 0.35
ngram_tests_dk %>% filter(!pattern %in% NA_coefs) %>% .$n %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 743 27.7 64.3 11 14.8 7.41 5 723 718 7.13 61.6 2.36
ngram_tests_dk %>% filter(pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 63 0.29 0.24 0.23 0.25 0.19 0 1.19 1.19 1.62 3.03 0.03
ngram_tests_dk %>% filter(!pattern %in% NA_coefs) %>% .$d %>% abs() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 743 0.26 0.22 0.21 0.23 0.19 0 1.38 1.38 1.49 3
## se
## X1 0.01
nonNA_preds = preds_legal %>% setdiff(NA_coefs)
nonNA_preds_quoted = "`" + nonNA_preds + "`"
#refit without NA preds
f2 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + "))
length(c(nonNA_preds_quoted, other_features))
## [1] 747
ols_fit2 = rms::ols(f2, data = d2_dk, x = T, y = T)
ols_fit2 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f2, data = d2_dk, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1299 LR chi2 1909.72 R2 0.770
## sigma0.5312 d.f. 747 R2 adj 0.458
## d.f. 551 Pr(> chi2) 0.0000 g 0.700
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.624e+00 -2.134e-01 1.521e-15 2.231e-01 1.509e+00
##
# validate(ols_fit2)
#confirm no NA?
assert_that(!anyNA(coef(ols_fit2)))
## [1] TRUE
#control for main origin status
f4 = as.formula("S ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit4 = rms::ols(f4, data = d2_dk, x = T, y = T)
ols_fit4 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f4, data = d2_dk, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1299 LR chi2 2104.35 R2 0.802
## sigma0.5117 d.f. 788 R2 adj 0.496
## d.f. 510 Pr(> chi2) 0.0000 g 0.712
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.233e+00 -1.944e-01 3.036e-16 2.047e-01 1.537e+00
##
#main origin only
ols_fit6 = rms::ols(S ~ main_origin, data = d2_dk, x = T, y = T)
ols_fit6 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = S ~ main_origin, data = d2_dk, x = T, y = T)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1299 LR chi2 242.88 R2 0.171
## sigma0.6680 d.f. 41 R2 adj 0.143
## d.f. 1257 Pr(> chi2) 0.0000 g 0.306
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.68948 -0.32220 0.04868 0.44072 1.83790
##
#how well can we fit pure noise? (permuted outcome)
d2_dk$S_random = sample(d2_dk$S)
f7 = as.formula("S_random ~ " + str_c(c(nonNA_preds_quoted, other_features), collapse = " + ") + " + main_origin")
ols_fit7 = rms::ols(f7, data = d2_dk)
ols_fit7 %>% print_n_lines()
## Linear Regression Model
##
## rms::ols(formula = f7, data = d2_dk)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1299 LR chi2 1171.85 R2 0.594
## sigma0.7327 d.f. 788 R2 adj -0.033
## d.f. 510 Pr(> chi2) 0.0000 g 0.614
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.916e+00 -2.461e-01 1.813e-15 2.828e-01 1.537e+00
##
#matrix form
x_mat = stats::model.matrix(f4, data = d2_dk) %>%
#remove intercept
#glmnet fits its own intercept
.[, -1]
x_mat_scaled = scale(x_mat)
#ordinary lasso
LASSO_fit_dk = glmnet::glmnet(x_mat, y = d2_dk$S)
plot(LASSO_fit_dk)
#cv
set.seed(1)
LASSO_cv_dk = cv.glmnet(x_mat, y = d2_dk$S, parallel = T)
plot(LASSO_cv_dk)
#optimal penalty
set.seed(1)
runs_results_dk = list()
d2_dk[["...id"]] = seq_along_rows(d2_dk)
runs = 5000
f5 = as.formula("S ~ " + str_c(c(preds_legal_quoted, other_features), collapse = " + ") + " + main_origin")
#load from disk to save time, e.g. when re-knitting for Rpubs
if (file.exists("data/lasso_runs_dk.rds")) {
runs_results_dk = read_rds("data/lasso_runs_dk.rds")
} else {
time_start = lubridate::now()
print(sprintf("Loop started at %s", as.character(time_start)))
for (i in 1:runs) {
message(sprintf("run: %d of %d", i, runs))
#pick the elastic net mixing parameter (alpha) for this run, cycling over a small grid
run_alpha = seq(.1, 1, length.out = 5)[(i %% 5) + 1]
#test and train samples
test_rows = sample(nrow(d2_dk), 50)
run_test = d2_dk[test_rows, ]
run_train = d2_dk[setdiff(1:nrow(d2_dk), test_rows), ]
#make matrix for train
run_x_mat_train = stats::model.matrix(f4, data = run_train)
run_x_mat_test = stats::model.matrix(f4, data = run_test)
#determine optimal lambda
lambda_est = cv.glmnet(run_x_mat_train, y = run_train$S, alpha = run_alpha, parallel = T)
#predict into test set
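#note: predict() on a cv.glmnet object defaults to s = "lambda.1se", whereas extract_lasso_features() below uses lambda.min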
run_test$pred_lasso = predict(lambda_est, newx = run_x_mat_test) %>% as.vector()
#also fit OLS on selected variables
#extract features, no intercept
run_lasso_features = extract_lasso_features(lambda_est) %>%
.[-1, ] %>%
#no need to deal with main origin since we always include these
filter(!str_detect(name, "main_origin")) %>%
#fix names
mutate(name = str_replace(name, "TRUE", ""))
#build formula
run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name, collapse = " + ") + " + main_origin")
#fit
run_ols_fit = ols(run_lasso_chosen, data = run_train)
#any NA? then refit without the offending predictor
if (anyNA(coef(run_ols_fit))) {
#find NA preds
run_NA_preds_lasso_ols = NA_preds(run_ols_fit)
#remove from formula
run_lasso_chosen = as.formula("S ~ " + str_c(run_lasso_features$name %>% setdiff(run_NA_preds_lasso_ols), collapse = " + ") + " + main_origin")
#refit
run_ols_fit = ols(run_lasso_chosen, data = run_train)
#any NA again?
if (anyNA(coef(run_ols_fit))) browser()
}
#try predict
trial = try({run_test$pred_lasso_ols = predict(run_ols_fit, newdata = run_test)})
#if failed, fill in NAs
if (is_error(trial)) run_test$pred_lasso_ols = NA
# #full OLS
# all_preds = c(preds_legal_quoted, other_features, "main_origin")
# run_f_all = as.formula("S ~ " + str_c(all_preds, collapse = " + "))
# run_full_ols_fit = ols(run_f_all, data = run_train)
#
# #remove any NA preds and refit
# if (anyNA(coef(run_full_ols_fit))) {
# #nonNA preds
# run_NA_preds_full_ols = NA_preds(run_full_ols_fit)
#
# #non-NA
# run_nonNA_preds_full_ols = all_preds %>% setdiff(run_NA_preds_full_ols)
#
# #formula
# run_f_all2 = as.formula("S ~ " + str_c(run_nonNA_preds_full_ols, collapse = " + "))
#
# #refit
# run_full_ols_fit = ols(run_f_all2, data = run_train)
#
# #more NA?
# if (anyNA(coef(run_full_ols_fit))) browser()
# }
#
# #try predict
# trial = try({run_test$pred_ols = predict(run_full_ols_fit, newdata = run_test)})
# #if failed, fill in NAs
# if (is_error(trial)) run_test$pred_ols = NA
#save info
runs_results_dk[[i]] = list(
run_cv = lambda_est,
alpha = run_alpha,
#standard lasso
R2_test_lasso = calc_r2(run_test$pred_lasso, run_test$S),
#ols lasso
R2_test_lasso_ols = calc_r2(run_test$pred_lasso_ols, run_test$S),
#predicted values
oos_preds = run_test %>% select(pred_lasso, pred_lasso_ols, `...id`) %>% mutate(...run = i)
)
}
time_end = lubridate::now()
running_time = time_end - time_start
cat(sprintf("Loop finished at %s\nrunning time of %s", as.character(time_end), capture.output(print(running_time))))
#save to disk
write_rds(runs_results_dk, "data/lasso_runs_dk.rds")
}
#summary stats
penalized_reg_summary_dk = map_df(runs_results_dk, function(x) {
data_frame(
R2_test_lasso = x$R2_test_lasso,
R2_test_lasso_ols = x$R2_test_lasso_ols,
alpha = x$alpha,
lambda_1se = x$run_cv$lambda.1se,
lambda_optimal = x$run_cv$lambda.min
)
})
#results
penalized_reg_summary_dk %>%
gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>%
mutate(alpha = factor(alpha)) %>%
GG_group_means("R2_test", "alpha", "predict_method") +
scale_y_continuous(breaks = seq(0, 1, .1)) +
scale_fill_discrete("Predict method", labels = c("LASSO", "OLS"))
## Missing values were removed.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
GG_save("figures/lasso_cv_dk.png")
penalized_reg_summary_dk %>%
gather(key = predict_method, value = R2_test, R2_test_lasso, R2_test_lasso_ols) %>%
mutate(alpha = factor(alpha)) %>%
unite(method, alpha, predict_method) %$%
describeBy(R2_test, method, mat = T) %>% print(digits = 3)
## item group1 vars n mean sd median trimmed
## X11 1 0.1_R2_test_lasso 1 1000 0.1724 0.0837 0.1770 0.1749
## X12 2 0.1_R2_test_lasso_ols 1 786 0.0390 0.2514 0.0753 0.0615
## X13 3 0.325_R2_test_lasso 1 1000 0.1722 0.0913 0.1723 0.1741
## X14 4 0.325_R2_test_lasso_ols 1 741 0.0577 0.2434 0.1008 0.0830
## X15 5 0.55_R2_test_lasso 1 1000 0.1687 0.0909 0.1716 0.1711
## X16 6 0.55_R2_test_lasso_ols 1 762 0.0726 0.2161 0.0931 0.0894
## X17 7 0.775_R2_test_lasso 1 1000 0.1708 0.0934 0.1727 0.1726
## X18 8 0.775_R2_test_lasso_ols 1 772 0.0804 0.2208 0.1164 0.1017
## X19 9 1_R2_test_lasso 1 1000 0.1697 0.0925 0.1716 0.1705
## X110 10 1_R2_test_lasso_ols 1 759 0.0802 0.2234 0.1022 0.0972
## mad min max range skew kurtosis se
## X11 0.0789 -0.168 0.423 0.590 -0.414 0.779 0.00265
## X12 0.2105 -1.577 0.571 2.147 -1.456 4.965 0.00897
## X13 0.0856 -0.354 0.451 0.805 -0.424 1.423 0.00289
## X14 0.2031 -1.492 0.609 2.101 -1.614 5.255 0.00894
## X15 0.0890 -0.176 0.443 0.619 -0.276 0.253 0.00287
## X16 0.1983 -1.158 0.542 1.700 -1.008 2.286 0.00783
## X17 0.0923 -0.337 0.486 0.823 -0.280 0.649 0.00295
## X18 0.1838 -1.165 0.536 1.700 -1.203 2.692 0.00795
## X19 0.0865 -0.203 0.432 0.635 -0.176 0.326 0.00292
## X110 0.2173 -0.994 0.607 1.602 -0.852 1.293 0.00811
#oos predictions from runs
oos_data_dk = map_df(runs_results_dk, "oos_preds")
#take one prediction per observation at random
set.seed(1)
oos_data_rand1_dk = oos_data_dk %>%
#group by id
group_by(...id) %>%
#select 1 value at random
do(sample_n(., 1)) %>%
#join with true values
full_join(d2_dk %>% select(S, ...id), by = "...id")
oos_data_mean_dk = oos_data_dk %>%
#group by id
group_by(...id) %>%
#average the predictions across runs
dplyr::summarize(pred_lasso = mean(pred_lasso)) %>%
#join with true values
full_join(d2_dk %>% select(S, ...id), by = "...id")
#plot
GG_scatter(oos_data_rand1_dk, "pred_lasso", "S") +
scale_x_continuous("Predicted S")
GG_scatter(oos_data_mean_dk, "pred_lasso", "S") +
scale_x_continuous("Predicted S")
GG_save("figures/lasso_oos_scatter_dk.png")
#predict from full lasso
d2_dk$S_lasso_full = predict(LASSO_cv_dk, newx = x_mat) %>% as.vector()
GG_scatter(d2_dk, "S_lasso_full", "S", alpha = .5) +
scale_x_continuous("Predicted S score")
GG_save("figures/lasso_full_cv_dk.png")