Data loading

behaviours <- read.csv("~/Downloads/okcupid_profiles.csv",
                       stringsAsFactors = FALSE)

instances  <- read.csv(
  "~/Downloads/archive (4)/lovoo_v3_users_instances.csv",
  stringsAsFactors = FALSE)

api        <- read.csv(
  "~/Downloads/archive (4)/lovoo_v3_users_api-results.csv",
  stringsAsFactors = FALSE)

Data preparation

The Lovoo dataset is filtered to female users only. One row per user is produced by averaging repeated snapshots, then the instances and API tables are joined. A conversion_rate column (kisses ÷ profile visits) is added for Beat 5.

instances_clean <- instances %>%
  filter(gender == FALSE) %>%      
  group_by(userId) %>%
  summarise(
    age           = first(age),
    profileVisits = mean(counts_profileVisits, na.rm = TRUE),
    kisses        = mean(counts_kisses,        na.rm = TRUE),
    .groups = "drop"
  )

api_clean <- api %>%
  group_by(userId) %>%
  summarise(
    fans          = mean(counts_fans,          na.rm = TRUE),
    profileVisits = mean(counts_profileVisits, na.rm = TRUE),
    kisses        = mean(counts_kisses,        na.rm = TRUE),
    .groups = "drop"
  )

data <- inner_join(instances_clean, api_clean, by = "userId") %>%
  rename(
    profileVisits = profileVisits.y,
    kisses        = kisses.y
  ) %>%
  filter(
    !is.na(profileVisits), profileVisits > 0,
    !is.na(kisses),        kisses >= 0,
    !is.na(fans),          fans   >= 0,
    !is.na(age),           age >= 18, age <= 70
  ) %>%
  mutate(
    conversion_rate = kisses / profileVisits,
    attention_bin   = ntile(profileVisits, 10)
  )
cat("Rows after cleaning:", nrow(data), "\n")
## Rows after cleaning: 2813

Beat 1 — The power law

power_model <- lm(log(kisses + 1) ~ log(profileVisits), data = data)
tidy_model  <- tidy(power_model)

beta <- round(coef(power_model)[["log(profileVisits)"]], 3)
r2   <- round(summary(power_model)$r.squared, 3)

ggplot(data, aes(x = profileVisits, y = kisses + 1)) +
  geom_point(alpha = 0.12, size = 0.7, colour = accent_col) +
  geom_smooth(method = "lm", se = TRUE,
              colour = "#993C1D", fill = accent_col,
              alpha = 0.12, linewidth = 1.1) +
  scale_x_log10(labels = label_number(big.mark = ",")) +
  scale_y_log10(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Attention follows a power law",
    subtitle = paste0("β = ", beta, "  ·  R² = ", r2,
                      "  ·  β > 1 confirms superlinear returns"),
    x        = "Profile visits (log scale)",
    y        = "Kisses received + 1 (log scale)",
    caption  = paste0(
      "OLS on log-log scale. Each point = one user.  n = ",
      format(nrow(data), big.mark = ","), "."
    )
  ) +
  theme_essay()
Log-log scatter of profile visits vs kisses. Slope β is the power law exponent.

Log-log scatter of profile visits vs kisses. Slope β is the power law exponent.

tidy_model %>%
  mutate(term = recode(term,
    "(Intercept)"        = "Intercept",
    "log(profileVisits)" = "log(profile visits)  [β]"
  )) %>%
  select(Term = term, Estimate = estimate,
         `Std. Error` = std.error, Statistic = statistic,
         `p-value` = p.value) %>%
  mutate(across(where(is.numeric), ~round(.x, 4))) %>%
  knitr::kable(caption = "OLS model — log(kisses+1) ~ log(profileVisits)")
OLS model — log(kisses+1) ~ log(profileVisits)
Term Estimate Std. Error Statistic p-value
Intercept -2.7025 0.0586 -46.1026 0
log(profile visits) [β] 0.9131 0.0080 114.7267 0

Script line to quote: “For every 1% more attention a profile receives, it gains 0.913% more kisses — well above the 1% you’d expect if the relationship were simply proportional.”


##s 2 — Attention inequality

“Human systems, when left to scale, rarely stay equal.”

attention_vec <- data$profileVisits
gini_val      <- round(ineq(attention_vec, type = "Gini"), 3)
kisses_gini   <- round(ineq(data$kisses + 1, type = "Gini"), 3)

# Top 10% share
n_top10     <- ceiling(nrow(data) * 0.1)
top10_share <- round(
  sum(sort(attention_vec, decreasing = TRUE)[seq_len(n_top10)]) /
    sum(attention_vec) * 100, 1
)

cat("Gini (profile visits):", gini_val,    "\n")
## Gini (profile visits): 0.677
cat("Gini (kisses)        :", kisses_gini, "\n")
## Gini (kisses)        : 0.723
cat("Top 10% attention share:", top10_share, "%\n")
## Top 10% attention share: 51.2 %
cat("US wealth Gini (ref) : ~0.49\n")
## US wealth Gini (ref) : ~0.49
lorenz_df <- tibble(
  cum_pop = c(0, seq_len(nrow(data)) / nrow(data)),
  cum_att = c(0, cumsum(sort(attention_vec)) / sum(attention_vec))
)

ggplot(lorenz_df, aes(x = cum_pop, y = cum_att)) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "grey65", linewidth = 0.8) +
  geom_area(fill = "#7F77DD", alpha = 0.18) +
  geom_line(colour = "#534AB7", linewidth = 1.2) +
  annotate("text", x = 0.22, y = 0.78,
           label  = paste0("Gini = ", gini_val),
           colour = "#534AB7", size = 5, fontface = "bold") +
  annotate("text", x = 0.22, y = 0.71,
           label  = paste0("Top 10% capture ", top10_share, "% of all attention"),
           colour = "#534AB7", size = 3.8) +
  annotate("text", x = 0.22, y = 0.64,
           label  = "US wealth Gini ≈ 0.49 (for reference)",
           colour = "grey55", size = 3.4, fontface = "italic") +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title    = "The Lorenz curve of romantic attention",
    subtitle = paste0(
      "Gini = ", gini_val,
      " — attention is more concentrated than most national wealth distributions"
    ),
    x       = "Cumulative share of users (ranked by profile visits)",
    y       = "Cumulative share of total attention",
    caption = "Dashed line = perfect equality."
  ) +
  theme_essay()
Lorenz curve for profile visits. The shaded area between the curve and the equality line represents the Gini coefficient.

Lorenz curve for profile visits. The shaded area between the curve and the equality line represents the Gini coefficient.

ggplot(data, aes(x = profileVisits)) +
  geom_histogram(bins = 60, fill = "#7F77DD", alpha = 0.85, colour = "white",
                 linewidth = 0.2) +
  scale_x_log10(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Attention distribution across the platform",
    subtitle = paste0(
      "Most users receive very few visits; a small minority dominates — ",
      "Gini = ", gini_val
    ),
    x       = "Profile visits (log scale)",
    y       = "Number of users",
    caption = paste0("n = ", format(nrow(data), big.mark = ","), " users.")
  ) +
  theme_essay()
Histogram of profile visits on log scale. The right skew is the visual signature of a power law.

Histogram of profile visits on log scale. The right skew is the visual signature of a power law.


3 — Age and the arc of desirability

ggplot(data, aes(x = age, y = profileVisits)) +
  geom_smooth(method = "loess", span = 0.45, se = TRUE,
              colour = accent_col, fill = accent_col,
              linewidth = 1.2, alpha = 0.12) +
  scale_y_continuous(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Attention across the lifespan",
    subtitle = "When does algorithmic visibility peak — and when does it decline?",
    x        = "Age",
    y        = "Average profile visits",
    caption  = "LOESS smoother · ribbon = 95% CI."
  ) +
  theme_essay()
LOESS smoother of profile visits against age. The peak and decline reveal how algorithmic exposure tracks age.

LOESS smoother of profile visits against age. The peak and decline reveal how algorithmic exposure tracks age.

ggplot(data, aes(x = age, y = kisses)) +
  geom_smooth(method = "loess", span = 0.45, se = TRUE,
              colour = accent_col, fill = accent_col,
              linewidth = 1.2, alpha = 0.12) +
  scale_y_continuous(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Kisses across the lifespan",
    subtitle = "Does romantic engagement follow the same age curve as raw attention?",
    x        = "Age",
    y        = "Average kisses received",
    caption  = "LOESS smoother · ribbon = 95% CI."
  ) +
  theme_essay()
Kisses by age — the engagement version of the same story.

Kisses by age — the engagement version of the same story.


4 — Conversion rate: traffic vs genuine appeal

“You’re not just choosing. You’re also strategising.”

Does the top decile convert visits into kisses at a higher rate, or do they simply receive more traffic? This separates algorithmic amplification (scale effect only) from genuine appeal (people who visit actually engage more).

conversion_summary <- data %>%
  group_by(attention_bin) %>%
  summarise(
    avg_conversion = round(mean(conversion_rate, na.rm = TRUE), 4),
    med_conversion = round(median(conversion_rate, na.rm = TRUE), 4),
    avg_kisses     = round(mean(kisses,           na.rm = TRUE), 1),
    avg_visits     = round(mean(profileVisits,    na.rm = TRUE), 0),
    n              = n(),
    .groups = "drop"
  )

knitr::kable(conversion_summary,
             col.names = c("Decile", "Avg conversion", "Median conversion",
                           "Avg kisses", "Avg visits", "n"),
             caption   = "Kisses ÷ profile visits by attention decile")
Kisses ÷ profile visits by attention decile
Decile Avg conversion Median conversion Avg kisses Avg visits n
1 0.0464 0.0240 3.1 70 282
2 0.0371 0.0288 9.6 251 282
3 0.0411 0.0335 19.1 454 282
4 0.0417 0.0334 31.1 745 281
5 0.0418 0.0347 48.3 1154 281
6 0.0412 0.0349 72.5 1753 281
7 0.0388 0.0345 108.8 2830 281
8 0.0389 0.0323 177.9 4593 281
9 0.0376 0.0319 291.7 7739 281
10 0.0434 0.0393 932.0 20473 281
ggplot(conversion_summary, aes(x = attention_bin, y = avg_conversion)) +
  geom_col(fill = "#1D9E75", alpha = 0.85, width = 0.7) +
  geom_text(aes(label = sprintf("%.3f", avg_conversion)),
            vjust = -0.45, size = 3.2, colour = "grey40") +
  scale_x_continuous(breaks = 1:10) +
  scale_y_continuous(labels = label_number(accuracy = 0.001),
                     expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Does popularity reflect quality — or just scale?",
    subtitle = "Average kisses per profile visit by attention decile",
    x        = "Attention decile (1 = least visited, 10 = most)",
    y        = "Avg kisses per profile visit",
    caption  = paste0(
      "Flat trend → algorithmic scale effect.\n",
      "Rising trend → top profiles are genuinely more compelling."
    )
  ) +
  theme_essay()
If conversion rises with decile, popular profiles are genuinely more compelling. If flat, the algorithm is manufacturing popularity.

If conversion rises with decile, popular profiles are genuinely more compelling. If flat, the algorithm is manufacturing popularity.


6 OkCupid: cross-platform validation

OkCupid is a Western platform (San Francisco-heavy sample) that skews older and more text-driven than Lovoo. Using it to cross-check demographic patterns and self-presentation behaviour strengthens the generalisation claim. Filtered to female profiles to match the Lovoo sample.

okcupid <- behaviours %>%
  filter(
    !is.na(age), age >= 18, age <= 70,
    sex == "f"
  ) %>%
  mutate(
    essay_length = nchar(replace_na(essay0, "")) +
                   nchar(replace_na(essay1, "")) +
                   nchar(replace_na(essay2, "")) +
                   nchar(replace_na(essay3, "")) +
                   nchar(replace_na(essay4, ""))
  )

cat("OkCupid rows after cleaning:", nrow(okcupid), "\n")
## OkCupid rows after cleaning: 24116
ggplot(okcupid, aes(x = age)) +
  geom_histogram(binwidth = 2, fill = accent_col, alpha = 0.80,
                 colour = "white", linewidth = 0.15) +
  labs(
    title    = "Who is on dating apps? (OkCupid, San Francisco — female profiles)",
    subtitle = "Cross-validates Lovoo demographic patterns on a western audience",
    x        = "Age",
    y        = "Number of profiles",
    caption  = paste0("n = ", format(nrow(okcupid), big.mark = ","), " profiles.")
  ) +
  theme_essay()
OkCupid age distribution — female profiles. Western, text-first app; different user-base, same structural question.

OkCupid age distribution — female profiles. Western, text-first app; different user-base, same structural question.

ggplot(okcupid %>% filter(essay_length > 0),
       aes(x = age, y = essay_length)) +
  geom_smooth(method = "loess", span = 0.5, se = TRUE,
              colour = accent_col, fill = accent_col,
              linewidth = 1.2, alpha = 0.12) +
  scale_y_continuous(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Profile depth across the lifespan",
    subtitle = "Essay length (total characters across 5 essays) by age",
    x        = "Age",
    y        = "Total essay length (characters)",
    caption  = paste0(
      "Profiles with zero essay content excluded.\n",
      "Declining trend supports: 'profiles become curated, messages become calculated'."
    )
  ) +
  theme_essay()
Essay length declines with age. This contradicts 'more experience = more depth' and supports the optimisation hypothesis.

Essay length declines with age. This contradicts ‘more experience = more depth’ and supports the optimisation hypothesis.

category_completion <- okcupid %>%
  summarise(
    across(
      c(body_type, diet, drinks, drugs, education,
        job, offspring, pets, religion, smokes),
      ~ mean(!is.na(.) & . != "", na.rm = TRUE),
      .names = "pct_{.col}"
    )
  ) %>%
  pivot_longer(everything(),
               names_to  = "field",
               values_to = "pct_filled") %>%
  mutate(
    field      = str_remove(field, "pct_") %>%
                   str_replace_all("_", " ") %>%
                   str_to_title(),
    pct_filled = round(pct_filled * 100, 1)
  ) %>%
  arrange(desc(pct_filled))

ggplot(category_completion,
       aes(x = reorder(field, pct_filled), y = pct_filled)) +
  geom_col(fill = "#7F77DD", alpha = 0.85, width = 0.7) +
  geom_text(aes(label = paste0(pct_filled, "%")),
            hjust = -0.15, size = 3.2, colour = "grey40") +
  coord_flip() +
  scale_y_continuous(limits  = c(0, 115),
                     labels  = function(x) paste0(x, "%"),
                     expand  = expansion(mult = c(0, 0))) +
  labs(
    title    = "What do people choose to reveal?",
    subtitle = "% of OkCupid profiles that filled each optional field",
    x        = NULL,
    y        = "% of profiles completed",
    caption  = "Fields left blank are a form of strategic omission."
  ) +
  theme_essay()
What people choose to reveal — and what they strategically omit.

What people choose to reveal — and what they strategically omit.

Data sources: Lovoo v3 (Kaggle · utkarshx27), OkCupid profiles (Kaggle · andrewmvd)