Dataset from https://projecteuler.net/locations. You need to create a free account to see this page.
Project Euler is a site for solving math problems for fun. It describes itself as follows:
Project Euler is a series of challenging mathematical/computer programming problems that will require more than just mathematical insights to solve. Although mathematics will help you arrive at elegant and efficient methods, the use of a computer and programming skills will be required to solve most problems.
The motivation for starting Project Euler, and its continuation, is to provide a platform for the inquiring mind to delve into unfamiliar areas and learn new concepts in a fun and recreational context.
Who are the problems aimed at?
The intended audience include students for whom the basic curriculum is not feeding their hunger to learn, adults whose background was not primarily mathematics but had an interest in things mathematical, and professionals who want to keep their problem solving and mathematics on the cutting edge.
Currently we have 1033865 registered members who have solved at least one problem, representing 219 locations throughout the world, and collectively using 105 different programming languages to solve the problems.
#global printing defaults for the session: 2 significant digits, and a scipen
#of 10 so that fixed notation is preferred over scientific notation
options(digits = 2, scipen = 10)
library(pacman)
p_load(
kirkegaard,
rms,
googlesheets4
)
#use the black-and-white ggplot2 theme as the default for all plots
theme_set(theme_bw())

#auth
#authenticate with this Google account before reading the sheets
gs4_auth(email = "the.dfx@gmail.com")
#get data
#read the Project Euler locations sheet and pass it through df_legalize_names()
euler = read_sheet("https://docs.google.com/spreadsheets/d/1uGXNwgAyCWvhCsIaaFCm0Sg7tlYwEnjvqXVoSWJzW1U/edit#gid=0") %>%
  df_legalize_names() %>%
  #mutate data
  mutate(
    #pull the leading number out of the text value and make it numeric;
    #\\d* (not \\d?) keeps every decimal digit instead of cutting off after the first
    prop_l1 = Proportion_of_Members_at_Level_1plus %>% str_extract("\\d+\\.?\\d*") %>% as.numeric(),
    #each location's share of all members;
    #na.rm so a single missing count does not turn every share into NA
    fraction_members = Total_Number_of_Members / sum(Total_Number_of_Members, na.rm = TRUE),
    #translate the country name into the ISO column that the later joins use as key
    ISO = pu_translate(country)
  )
## Reading from "Project Euler dataset"
## Range "Sheet1"
## New names:
## * `` -> ...5
## No exact match: North Macedonia
## No exact match: St Vincent Grenadines
## No exact match: Trinidad Tobago
## No exact match: US Virgin Islands
## Best fuzzy match found: North Macedonia -> FYR Macedonia with distance 5.00
## Best fuzzy match found: St Vincent Grenadines -> St. Vincent & Grenadines with distance 3.00
## Best fuzzy match found: Trinidad Tobago -> Trinidad & Tobago with distance 2.00
## Best fuzzy match found: US Virgin Islands -> U.S. Virgin Islands with distance 2.00
#national IQ data
#the datafile from the GMAT project is reused here
GMAT = read_sheet("https://docs.google.com/spreadsheets/d/12lMDL_8sf7ugsT86_VSwfus7bgA-9BxoqRwBRrzjGMk/edit#gid=0")
#add the ISO column, translated from the Country_Rename column
GMAT = mutate(GMAT, ISO = pu_translate(Country_Rename))
## Reading from "GMAT datafile"
## Range "data_path"
## No exact match: Congo DRC (Zaire)
## No exact match: Congo Republic
## No exact match: Ivory Coast (Cote d'Ivore)
## No exact match: Netherland Antilles
## No exact match: Papua-New Guinea
## No exact match: Tajjikistan
## Best fuzzy match found: Congo DRC (Zaire) -> Congo (Zaire) with distance 4.00
## Best fuzzy match found: Congo Republic -> Congo Republic of with distance 3.00
## Best fuzzy match found: Ivory Coast (Cote d'Ivore) -> Ivory Coast with distance 15.00
## Best fuzzy match found: Netherland Antilles -> Netherlands Antilles with distance 1.00
## Best fuzzy match found: Papua-New Guinea -> Papua New Guinea with distance 1.00
## Best fuzzy match found: Tajjikistan -> Tajikistan with distance 1.00
#smart fraction
sf_data = read_rds("data/smart fraction data.rds")

#merge
#the join key is spelled out so that neither join silently depends on which
#column names the tables happen to have in common
#full join: keeps every row of euler and of the GMAT selection
#left join: only adds the smart fraction columns to the rows already present
d = full_join(
  euler,
  GMAT %>% select(ISO, GMAT, TOEFL, English_Proficiency),
  by = "ISO"
) %>% left_join(
  sf_data %>% select(ISO, LyVa_IQ, LyVa_IQc, CA_totc, x95pct_IQc, IC_reg, population2017, UN_macroregion),
  by = "ISO"
)
## Joining, by = "ISO"
## Joining, by = "ISO"
#sanity check: stop here if any ISO value occurs more than once in the merged data
assert_that(!any(duplicated(d$ISO)))
## [1] TRUE
#add missing population counts
#the entries below are all dependent nations of sorts; figures are from Wikipedia
d$population2017 = local({
  #ISO code -> population
  known_pops = c(
    ABW = 116600,
    CXR = 1843,
    ENG = 56286961,
    FRO = 52110,
    HKG = 7500700,
    JEY = 107800,
    KSV = 1810463,
    MAC = 696100,
    NIR = 1810863,
    USA_PRI = 3193694,
    SCO = 5463300,
    SXM = 41486,
    WAL = 3153000
  )
  pops = d$population2017
  #set the population of every row whose ISO code is listed above
  for (code in names(known_pops)) {
    pops[d$ISO == code] = known_pops[[code]]
  }
  pops
})
#mutate again
#derived variables used by the analyses below
d = d %>% mutate(
  #members divided by population
  members_pc = Total_Number_of_Members / population2017,
  #shorter name for LyVa_IQc
  IQ = LyVa_IQc,
  #base-10 logs of population and of member count
  log_pop = log10(population2017),
  log_n_members = log10(Total_Number_of_Members)
)
#correlations for main variables
#no weights are passed here
wtd.cors(
  select(d, IQ, members_pc, prop_l1, Mean_Percentage_of_Problems_Solved, GMAT, English_Proficiency, TOEFL)
)
## IQ members_pc prop_l1
## IQ 1.00 0.447 0.44
## members_pc 0.45 1.000 -0.10
## prop_l1 0.44 -0.101 1.00
## Mean_Percentage_of_Problems_Solved 0.39 -0.075 0.65
## GMAT 0.76 0.400 0.50
## English_Proficiency 0.31 0.448 0.32
## TOEFL 0.62 0.503 0.44
## Mean_Percentage_of_Problems_Solved GMAT
## IQ 0.392 0.76
## members_pc -0.075 0.40
## prop_l1 0.655 0.50
## Mean_Percentage_of_Problems_Solved 1.000 0.60
## GMAT 0.596 1.00
## English_Proficiency 0.356 0.27
## TOEFL 0.500 0.76
## English_Proficiency TOEFL
## IQ 0.31 0.62
## members_pc 0.45 0.50
## prop_l1 0.32 0.44
## Mean_Percentage_of_Problems_Solved 0.36 0.50
## GMAT 0.27 0.76
## English_Proficiency 1.00 0.53
## TOEFL 0.53 1.00
#same correlations, weighted by log_pop (log10 of population), shown as a heatmap
d %>%
  select(IQ, members_pc, prop_l1, Mean_Percentage_of_Problems_Solved, GMAT, English_Proficiency, TOEFL) %>%
  wtd.cors(weight = d$log_pop) %>%
  #FALSE rather than F: F is an ordinary variable that can be reassigned
  GG_heatmap(short_x_labels = FALSE, reorder_vars = FALSE)
#filter to countries with more than 1M population and >= 50 members,
#then show the unweighted correlations as a heatmap
d %>%
  filter(
    population2017 > 1e6,
    Total_Number_of_Members >= 50
  ) %>%
  select(IQ, members_pc, prop_l1, Mean_Percentage_of_Problems_Solved, GMAT, English_Proficiency, TOEFL) %>%
  wtd.cors() %>%
  #FALSE rather than F: F is an ordinary variable that can be reassigned
  GG_heatmap(short_x_labels = FALSE, reorder_vars = FALSE)
#pairs plot of the three Project Euler measures
GGally::ggpairs(
  select(d, members_pc, prop_l1, Mean_Percentage_of_Problems_Solved)
)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method = "pearson", :
## Removed 3 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method = "pearson", :
## Removed 3 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method = "pearson", :
## Removed 3 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
#simplest approach: members per capita against IQ
#the residuals are bad: heteroscedastic as a function of population size
d %>% GG_scatter("IQ", "members_pc", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#are the errors really misbehaving?
#fit members per capita on IQ, store the residuals, and plot them against log population
pc_model = ols(members_pc ~ IQ, data = d)
d[["pc_model_residual"]] = resid(pc_model)
d %>% GG_scatter("log_pop", "pc_model_residual", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#filter to countries with more than 1M population
#(the filter was missing, so this plot was a duplicate of the unfiltered one)
GG_scatter(d %>% filter(population2017 > 1e6), "IQ", "members_pc", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#simple: prop_l1 against IQ, all rows
d %>% GG_scatter("IQ", "prop_l1", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#same plot, only countries with more than 1M population
d %>%
  filter(population2017 > 1e6) %>%
  GG_scatter("IQ", "prop_l1", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#same plot, only countries with at least 50 members
d %>%
  filter(Total_Number_of_Members >= 50) %>%
  GG_scatter("IQ", "prop_l1", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#simple: mean percentage of problems solved against IQ, all rows
d %>% GG_scatter("IQ", "Mean_Percentage_of_Problems_Solved", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#same plot, only countries with more than 1M population
d %>%
  filter(population2017 > 1e6) %>%
  GG_scatter("IQ", "Mean_Percentage_of_Problems_Solved", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#same plot, only countries with at least 50 members
d %>%
  filter(Total_Number_of_Members >= 50) %>%
  GG_scatter("IQ", "Mean_Percentage_of_Problems_Solved", case_names = "country")
## `geom_smooth()` using formula 'y ~ x'
#standardized dataset
#Total_Number_of_Members and log_pop are excluded from the standardization
#FALSE rather than F: F is an ordinary variable that can be reassigned
d_std = df_standardize(d, messages = FALSE, exclude = c("Total_Number_of_Members", "log_pop"))
#at least 50 members: filter first, then standardize within that subset
d_std_50m = d %>%
  filter(Total_Number_of_Members >= 50) %>%
  df_standardize(messages = FALSE, exclude = c("Total_Number_of_Members", "log_pop"))
#per capita
#DO NOT USE THIS BROKEN METHOD!
#members_pc on IQ, then adding English_Proficiency, then UN_macroregion; weights are log_pop
summarize_models(
  list(
    ols(members_pc ~ IQ, data = d_std, weights = d_std$log_pop),
    ols(members_pc ~ IQ + English_Proficiency, data = d_std, weights = d_std$log_pop),
    ols(members_pc ~ IQ + English_Proficiency + UN_macroregion, data = d_std, weights = d_std$log_pop)
  )
)
#log log approach
#log_n_members on log_pop and IQ, then adding English_Proficiency, then UN_macroregion; weights are log_pop
summarize_models(
  list(
    ols(log_n_members ~ log_pop + IQ, data = d_std, weights = d_std$log_pop),
    ols(log_n_members ~ log_pop + IQ + English_Proficiency, data = d_std, weights = d_std$log_pop),
    ols(log_n_members ~ log_pop + IQ + English_Proficiency + UN_macroregion, data = d_std, weights = d_std$log_pop)
  )
)
#level >=1 %
#prop_l1 on IQ, then adding English_Proficiency, then UN_macroregion
#fitted on the subset with at least 50 members; weights are log_pop
summarize_models(
  list(
    ols(prop_l1 ~ IQ, data = d_std_50m, weights = d_std_50m$log_pop),
    ols(prop_l1 ~ IQ + English_Proficiency, data = d_std_50m, weights = d_std_50m$log_pop),
    ols(prop_l1 ~ IQ + English_Proficiency + UN_macroregion, data = d_std_50m, weights = d_std_50m$log_pop)
  )
)
#problems solved %
#Mean_Percentage_of_Problems_Solved on IQ, then adding English_Proficiency, then UN_macroregion
#fitted on the subset with at least 50 members; weights are log_pop
summarize_models(
  list(
    ols(Mean_Percentage_of_Problems_Solved ~ IQ, data = d_std_50m, weights = d_std_50m$log_pop),
    ols(Mean_Percentage_of_Problems_Solved ~ IQ + English_Proficiency, data = d_std_50m, weights = d_std_50m$log_pop),
    ols(Mean_Percentage_of_Problems_Solved ~ IQ + English_Proficiency + UN_macroregion, data = d_std_50m, weights = d_std_50m$log_pop)
  )
)