behaviours <- read.csv("~/Downloads/okcupid_profiles.csv",
stringsAsFactors = FALSE)
instances <- read.csv(
"~/Downloads/archive (4)/lovoo_v3_users_instances.csv",
stringsAsFactors = FALSE)
api <- read.csv(
"~/Downloads/archive (4)/lovoo_v3_users_api-results.csv",
stringsAsFactors = FALSE)
The Lovoo dataset is filtered to female users only. One row per user
is produced by averaging repeated snapshots, then the instances and API
tables are joined. A conversion_rate column (kisses ÷
profile visits) is added for Beat 5.
instances_clean <- instances %>%
filter(gender == FALSE) %>%
group_by(userId) %>%
summarise(
age = first(age),
profileVisits = mean(counts_profileVisits, na.rm = TRUE),
kisses = mean(counts_kisses, na.rm = TRUE),
.groups = "drop"
)
api_clean <- api %>%
group_by(userId) %>%
summarise(
fans = mean(counts_fans, na.rm = TRUE),
profileVisits = mean(counts_profileVisits, na.rm = TRUE),
kisses = mean(counts_kisses, na.rm = TRUE),
.groups = "drop"
)
data <- inner_join(instances_clean, api_clean, by = "userId") %>%
rename(
profileVisits = profileVisits.y,
kisses = kisses.y
) %>%
filter(
!is.na(profileVisits), profileVisits > 0,
!is.na(kisses), kisses >= 0,
!is.na(fans), fans >= 0,
!is.na(age), age >= 18, age <= 70
) %>%
mutate(
conversion_rate = kisses / profileVisits,
attention_bin = ntile(profileVisits, 10)
)
cat("Rows after cleaning:", nrow(data), "\n")
## Rows after cleaning: 2813
power_model <- lm(log(kisses + 1) ~ log(profileVisits), data = data)
tidy_model <- tidy(power_model)
beta <- round(coef(power_model)[["log(profileVisits)"]], 3)
r2 <- round(summary(power_model)$r.squared, 3)
ggplot(data, aes(x = profileVisits, y = kisses + 1)) +
geom_point(alpha = 0.12, size = 0.7, colour = accent_col) +
geom_smooth(method = "lm", se = TRUE,
colour = "#993C1D", fill = accent_col,
alpha = 0.12, linewidth = 1.1) +
scale_x_log10(labels = label_number(big.mark = ",")) +
scale_y_log10(labels = label_number(big.mark = ",")) +
labs(
title = "Attention follows a power law",
subtitle = paste0("β = ", beta, " · R² = ", r2,
" · β > 1 confirms superlinear returns"),
x = "Profile visits (log scale)",
y = "Kisses received + 1 (log scale)",
caption = paste0(
"OLS on log-log scale. Each point = one user. n = ",
format(nrow(data), big.mark = ","), "."
)
) +
theme_essay()
Log-log scatter of profile visits vs kisses. Slope β is the power law exponent.
tidy_model %>%
mutate(term = recode(term,
"(Intercept)" = "Intercept",
"log(profileVisits)" = "log(profile visits) [β]"
)) %>%
select(Term = term, Estimate = estimate,
`Std. Error` = std.error, Statistic = statistic,
`p-value` = p.value) %>%
mutate(across(where(is.numeric), ~round(.x, 4))) %>%
knitr::kable(caption = "OLS model — log(kisses+1) ~ log(profileVisits)")
| Term | Estimate | Std. Error | Statistic | p-value |
|---|---|---|---|---|
| Intercept | -2.7025 | 0.0586 | -46.1026 | 0 |
| log(profile visits) [β] | 0.9131 | 0.0080 | 114.7267 | 0 |
Script line to quote: “For every 1% more attention a profile receives, it gains 0.913% more kisses — well above the 1% you’d expect if the relationship were simply proportional.”
##s 2 — Attention inequality
“Human systems, when left to scale, rarely stay equal.”
attention_vec <- data$profileVisits
gini_val <- round(ineq(attention_vec, type = "Gini"), 3)
kisses_gini <- round(ineq(data$kisses + 1, type = "Gini"), 3)
# Top 10% share
n_top10 <- ceiling(nrow(data) * 0.1)
top10_share <- round(
sum(sort(attention_vec, decreasing = TRUE)[seq_len(n_top10)]) /
sum(attention_vec) * 100, 1
)
cat("Gini (profile visits):", gini_val, "\n")
## Gini (profile visits): 0.677
cat("Gini (kisses) :", kisses_gini, "\n")
## Gini (kisses) : 0.723
cat("Top 10% attention share:", top10_share, "%\n")
## Top 10% attention share: 51.2 %
cat("US wealth Gini (ref) : ~0.49\n")
## US wealth Gini (ref) : ~0.49
lorenz_df <- tibble(
cum_pop = c(0, seq_len(nrow(data)) / nrow(data)),
cum_att = c(0, cumsum(sort(attention_vec)) / sum(attention_vec))
)
ggplot(lorenz_df, aes(x = cum_pop, y = cum_att)) +
geom_abline(slope = 1, intercept = 0,
linetype = "dashed", colour = "grey65", linewidth = 0.8) +
geom_area(fill = "#7F77DD", alpha = 0.18) +
geom_line(colour = "#534AB7", linewidth = 1.2) +
annotate("text", x = 0.22, y = 0.78,
label = paste0("Gini = ", gini_val),
colour = "#534AB7", size = 5, fontface = "bold") +
annotate("text", x = 0.22, y = 0.71,
label = paste0("Top 10% capture ", top10_share, "% of all attention"),
colour = "#534AB7", size = 3.8) +
annotate("text", x = 0.22, y = 0.64,
label = "US wealth Gini ≈ 0.49 (for reference)",
colour = "grey55", size = 3.4, fontface = "italic") +
scale_x_continuous(labels = percent_format(accuracy = 1)) +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
labs(
title = "The Lorenz curve of romantic attention",
subtitle = paste0(
"Gini = ", gini_val,
" — attention is more concentrated than most national wealth distributions"
),
x = "Cumulative share of users (ranked by profile visits)",
y = "Cumulative share of total attention",
caption = "Dashed line = perfect equality."
) +
theme_essay()
Lorenz curve for profile visits. The shaded area between the curve and the equality line represents the Gini coefficient.
ggplot(data, aes(x = profileVisits)) +
geom_histogram(bins = 60, fill = "#7F77DD", alpha = 0.85, colour = "white",
linewidth = 0.2) +
scale_x_log10(labels = label_number(big.mark = ",")) +
labs(
title = "Attention distribution across the platform",
subtitle = paste0(
"Most users receive very few visits; a small minority dominates — ",
"Gini = ", gini_val
),
x = "Profile visits (log scale)",
y = "Number of users",
caption = paste0("n = ", format(nrow(data), big.mark = ","), " users.")
) +
theme_essay()
Histogram of profile visits on log scale. The right skew is the visual signature of a power law.
ggplot(data, aes(x = age, y = profileVisits)) +
geom_smooth(method = "loess", span = 0.45, se = TRUE,
colour = accent_col, fill = accent_col,
linewidth = 1.2, alpha = 0.12) +
scale_y_continuous(labels = label_number(big.mark = ",")) +
labs(
title = "Attention across the lifespan",
subtitle = "When does algorithmic visibility peak — and when does it decline?",
x = "Age",
y = "Average profile visits",
caption = "LOESS smoother · ribbon = 95% CI."
) +
theme_essay()
LOESS smoother of profile visits against age. The peak and decline reveal how algorithmic exposure tracks age.
ggplot(data, aes(x = age, y = kisses)) +
geom_smooth(method = "loess", span = 0.45, se = TRUE,
colour = accent_col, fill = accent_col,
linewidth = 1.2, alpha = 0.12) +
scale_y_continuous(labels = label_number(big.mark = ",")) +
labs(
title = "Kisses across the lifespan",
subtitle = "Does romantic engagement follow the same age curve as raw attention?",
x = "Age",
y = "Average kisses received",
caption = "LOESS smoother · ribbon = 95% CI."
) +
theme_essay()
Kisses by age — the engagement version of the same story.
“You’re not just choosing. You’re also strategising.”
Does the top decile convert visits into kisses at a higher rate, or do they simply receive more traffic? This separates algorithmic amplification (scale effect only) from genuine appeal (people who visit actually engage more).
conversion_summary <- data %>%
group_by(attention_bin) %>%
summarise(
avg_conversion = round(mean(conversion_rate, na.rm = TRUE), 4),
med_conversion = round(median(conversion_rate, na.rm = TRUE), 4),
avg_kisses = round(mean(kisses, na.rm = TRUE), 1),
avg_visits = round(mean(profileVisits, na.rm = TRUE), 0),
n = n(),
.groups = "drop"
)
knitr::kable(conversion_summary,
col.names = c("Decile", "Avg conversion", "Median conversion",
"Avg kisses", "Avg visits", "n"),
caption = "Kisses ÷ profile visits by attention decile")
| Decile | Avg conversion | Median conversion | Avg kisses | Avg visits | n |
|---|---|---|---|---|---|
| 1 | 0.0464 | 0.0240 | 3.1 | 70 | 282 |
| 2 | 0.0371 | 0.0288 | 9.6 | 251 | 282 |
| 3 | 0.0411 | 0.0335 | 19.1 | 454 | 282 |
| 4 | 0.0417 | 0.0334 | 31.1 | 745 | 281 |
| 5 | 0.0418 | 0.0347 | 48.3 | 1154 | 281 |
| 6 | 0.0412 | 0.0349 | 72.5 | 1753 | 281 |
| 7 | 0.0388 | 0.0345 | 108.8 | 2830 | 281 |
| 8 | 0.0389 | 0.0323 | 177.9 | 4593 | 281 |
| 9 | 0.0376 | 0.0319 | 291.7 | 7739 | 281 |
| 10 | 0.0434 | 0.0393 | 932.0 | 20473 | 281 |
ggplot(conversion_summary, aes(x = attention_bin, y = avg_conversion)) +
geom_col(fill = "#1D9E75", alpha = 0.85, width = 0.7) +
geom_text(aes(label = sprintf("%.3f", avg_conversion)),
vjust = -0.45, size = 3.2, colour = "grey40") +
scale_x_continuous(breaks = 1:10) +
scale_y_continuous(labels = label_number(accuracy = 0.001),
expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Does popularity reflect quality — or just scale?",
subtitle = "Average kisses per profile visit by attention decile",
x = "Attention decile (1 = least visited, 10 = most)",
y = "Avg kisses per profile visit",
caption = paste0(
"Flat trend → algorithmic scale effect.\n",
"Rising trend → top profiles are genuinely more compelling."
)
) +
theme_essay()
If conversion rises with decile, popular profiles are genuinely more compelling. If flat, the algorithm is manufacturing popularity.
OkCupid is a Western platform (San Francisco-heavy sample) that skews older and more text-driven than Lovoo. Using it to cross-check demographic patterns and self-presentation behaviour strengthens the generalisation claim. Filtered to female profiles to match the Lovoo sample.
okcupid <- behaviours %>%
filter(
!is.na(age), age >= 18, age <= 70,
sex == "f"
) %>%
mutate(
essay_length = nchar(replace_na(essay0, "")) +
nchar(replace_na(essay1, "")) +
nchar(replace_na(essay2, "")) +
nchar(replace_na(essay3, "")) +
nchar(replace_na(essay4, ""))
)
cat("OkCupid rows after cleaning:", nrow(okcupid), "\n")
## OkCupid rows after cleaning: 24116
ggplot(okcupid, aes(x = age)) +
geom_histogram(binwidth = 2, fill = accent_col, alpha = 0.80,
colour = "white", linewidth = 0.15) +
labs(
title = "Who is on dating apps? (OkCupid, San Francisco — female profiles)",
subtitle = "Cross-validates Lovoo demographic patterns on a western audience",
x = "Age",
y = "Number of profiles",
caption = paste0("n = ", format(nrow(okcupid), big.mark = ","), " profiles.")
) +
theme_essay()
OkCupid age distribution — female profiles. Western, text-first app; different user-base, same structural question.
ggplot(okcupid %>% filter(essay_length > 0),
aes(x = age, y = essay_length)) +
geom_smooth(method = "loess", span = 0.5, se = TRUE,
colour = accent_col, fill = accent_col,
linewidth = 1.2, alpha = 0.12) +
scale_y_continuous(labels = label_number(big.mark = ",")) +
labs(
title = "Profile depth across the lifespan",
subtitle = "Essay length (total characters across 5 essays) by age",
x = "Age",
y = "Total essay length (characters)",
caption = paste0(
"Profiles with zero essay content excluded.\n",
"Declining trend supports: 'profiles become curated, messages become calculated'."
)
) +
theme_essay()
Essay length declines with age. This contradicts ‘more experience = more depth’ and supports the optimisation hypothesis.
category_completion <- okcupid %>%
summarise(
across(
c(body_type, diet, drinks, drugs, education,
job, offspring, pets, religion, smokes),
~ mean(!is.na(.) & . != "", na.rm = TRUE),
.names = "pct_{.col}"
)
) %>%
pivot_longer(everything(),
names_to = "field",
values_to = "pct_filled") %>%
mutate(
field = str_remove(field, "pct_") %>%
str_replace_all("_", " ") %>%
str_to_title(),
pct_filled = round(pct_filled * 100, 1)
) %>%
arrange(desc(pct_filled))
ggplot(category_completion,
aes(x = reorder(field, pct_filled), y = pct_filled)) +
geom_col(fill = "#7F77DD", alpha = 0.85, width = 0.7) +
geom_text(aes(label = paste0(pct_filled, "%")),
hjust = -0.15, size = 3.2, colour = "grey40") +
coord_flip() +
scale_y_continuous(limits = c(0, 115),
labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, 0))) +
labs(
title = "What do people choose to reveal?",
subtitle = "% of OkCupid profiles that filled each optional field",
x = NULL,
y = "% of profiles completed",
caption = "Fields left blank are a form of strategic omission."
) +
theme_essay()
What people choose to reveal — and what they strategically omit.
Data sources: Lovoo v3 (Kaggle · utkarshx27), OkCupid profiles (Kaggle · andrewmvd)