The Mathematics of Human Behaviour

Data loading

behaviours <- read.csv("~/Downloads/okcupid_profiles.csv",
                       stringsAsFactors = FALSE)

instances  <- read.csv(
  "~/Downloads/archive (4)/lovoo_v3_users_instances.csv",
  stringsAsFactors = FALSE)

api        <- read.csv(
  "~/Downloads/archive (4)/lovoo_v3_users_api-results.csv",
  stringsAsFactors = FALSE)

Data preparation

The Lovoo dataset is filtered to female users only. One row per user is produced by averaging repeated snapshots, then the instances and API tables are joined. A conversion_rate column (kisses ÷ profile visits) is added for Beat 5.

instances_clean <- instances %>%
  filter(gender == FALSE) %>%      
  group_by(userId) %>%
  summarise(
    age           = first(age),
    profileVisits = mean(counts_profileVisits, na.rm = TRUE),
    kisses        = mean(counts_kisses,        na.rm = TRUE),
    .groups = "drop"
  )

api_clean <- api %>%
  group_by(userId) %>%
  summarise(
    fans          = mean(counts_fans,          na.rm = TRUE),
    profileVisits = mean(counts_profileVisits, na.rm = TRUE),
    kisses        = mean(counts_kisses,        na.rm = TRUE),
    .groups = "drop"
  )

data <- inner_join(instances_clean, api_clean, by = "userId") %>%
  rename(
    profileVisits = profileVisits.y,
    kisses        = kisses.y
  ) %>%
  filter(
    !is.na(profileVisits), profileVisits > 0,
    !is.na(kisses),        kisses >= 0,
    !is.na(fans),          fans   >= 0,
    !is.na(age),           age >= 18, age <= 70
  ) %>%
  mutate(
    conversion_rate = kisses / profileVisits,
    attention_bin   = ntile(profileVisits, 10)
  )
cat("Rows after cleaning:", nrow(data), "\n")

## Rows after cleaning: 2813

Beat 1 — The power law

power_model <- lm(log(kisses + 1) ~ log(profileVisits), data = data)
tidy_model  <- tidy(power_model)

beta <- round(coef(power_model)[["log(profileVisits)"]], 3)
r2   <- round(summary(power_model)$r.squared, 3)

ggplot(data, aes(x = profileVisits, y = kisses + 1)) +
  geom_point(alpha = 0.12, size = 0.7, colour = accent_col) +
  geom_smooth(method = "lm", se = TRUE,
              colour = "#993C1D", fill = accent_col,
              alpha = 0.12, linewidth = 1.1) +
  scale_x_log10(labels = label_number(big.mark = ",")) +
  scale_y_log10(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Attention follows a power law",
    subtitle = paste0("β = ", beta, "  ·  R² = ", r2,
                      "  ·  β > 1 confirms superlinear returns"),
    x        = "Profile visits (log scale)",
    y        = "Kisses received + 1 (log scale)",
    caption  = paste0(
      "OLS on log-log scale. Each point = one user.  n = ",
      format(nrow(data), big.mark = ","), "."
    )
  ) +
  theme_essay()

Log-log scatter of profile visits vs kisses. Slope β is the power law exponent.

tidy_model %>%
  mutate(term = recode(term,
    "(Intercept)"        = "Intercept",
    "log(profileVisits)" = "log(profile visits)  [β]"
  )) %>%
  select(Term = term, Estimate = estimate,
         `Std. Error` = std.error, Statistic = statistic,
         `p-value` = p.value) %>%
  mutate(across(where(is.numeric), ~round(.x, 4))) %>%
  knitr::kable(caption = "OLS model — log(kisses+1) ~ log(profileVisits)")

OLS model — log(kisses+1) ~ log(profileVisits)
Term	Estimate	Std. Error	Statistic	p-value
Intercept	-2.7025	0.0586	-46.1026	0
log(profile visits) [β]	0.9131	0.0080	114.7267	0

Script line to quote: “For every 1% more attention a profile receives, it gains 0.913% more kisses — well above the 1% you’d expect if the relationship were simply proportional.”

##s 2 — Attention inequality

“Human systems, when left to scale, rarely stay equal.”

attention_vec <- data$profileVisits
gini_val      <- round(ineq(attention_vec, type = "Gini"), 3)
kisses_gini   <- round(ineq(data$kisses + 1, type = "Gini"), 3)

# Top 10% share
n_top10     <- ceiling(nrow(data) * 0.1)
top10_share <- round(
  sum(sort(attention_vec, decreasing = TRUE)[seq_len(n_top10)]) /
    sum(attention_vec) * 100, 1
)

cat("Gini (profile visits):", gini_val,    "\n")

## Gini (profile visits): 0.677

cat("Gini (kisses)        :", kisses_gini, "\n")

## Gini (kisses)        : 0.723

cat("Top 10% attention share:", top10_share, "%\n")

## Top 10% attention share: 51.2 %

cat("US wealth Gini (ref) : ~0.49\n")

## US wealth Gini (ref) : ~0.49

lorenz_df <- tibble(
  cum_pop = c(0, seq_len(nrow(data)) / nrow(data)),
  cum_att = c(0, cumsum(sort(attention_vec)) / sum(attention_vec))
)

ggplot(lorenz_df, aes(x = cum_pop, y = cum_att)) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "grey65", linewidth = 0.8) +
  geom_area(fill = "#7F77DD", alpha = 0.18) +
  geom_line(colour = "#534AB7", linewidth = 1.2) +
  annotate("text", x = 0.22, y = 0.78,
           label  = paste0("Gini = ", gini_val),
           colour = "#534AB7", size = 5, fontface = "bold") +
  annotate("text", x = 0.22, y = 0.71,
           label  = paste0("Top 10% capture ", top10_share, "% of all attention"),
           colour = "#534AB7", size = 3.8) +
  annotate("text", x = 0.22, y = 0.64,
           label  = "US wealth Gini ≈ 0.49 (for reference)",
           colour = "grey55", size = 3.4, fontface = "italic") +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title    = "The Lorenz curve of romantic attention",
    subtitle = paste0(
      "Gini = ", gini_val,
      " — attention is more concentrated than most national wealth distributions"
    ),
    x       = "Cumulative share of users (ranked by profile visits)",
    y       = "Cumulative share of total attention",
    caption = "Dashed line = perfect equality."
  ) +
  theme_essay()

Lorenz curve for profile visits. The shaded area between the curve and the equality line represents the Gini coefficient.

ggplot(data, aes(x = profileVisits)) +
  geom_histogram(bins = 60, fill = "#7F77DD", alpha = 0.85, colour = "white",
                 linewidth = 0.2) +
  scale_x_log10(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Attention distribution across the platform",
    subtitle = paste0(
      "Most users receive very few visits; a small minority dominates — ",
      "Gini = ", gini_val
    ),
    x       = "Profile visits (log scale)",
    y       = "Number of users",
    caption = paste0("n = ", format(nrow(data), big.mark = ","), " users.")
  ) +
  theme_essay()

Histogram of profile visits on log scale. The right skew is the visual signature of a power law.

3 — Age and the arc of desirability

ggplot(data, aes(x = age, y = profileVisits)) +
  geom_smooth(method = "loess", span = 0.45, se = TRUE,
              colour = accent_col, fill = accent_col,
              linewidth = 1.2, alpha = 0.12) +
  scale_y_continuous(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Attention across the lifespan",
    subtitle = "When does algorithmic visibility peak — and when does it decline?",
    x        = "Age",
    y        = "Average profile visits",
    caption  = "LOESS smoother · ribbon = 95% CI."
  ) +
  theme_essay()

LOESS smoother of profile visits against age. The peak and decline reveal how algorithmic exposure tracks age.

ggplot(data, aes(x = age, y = kisses)) +
  geom_smooth(method = "loess", span = 0.45, se = TRUE,
              colour = accent_col, fill = accent_col,
              linewidth = 1.2, alpha = 0.12) +
  scale_y_continuous(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Kisses across the lifespan",
    subtitle = "Does romantic engagement follow the same age curve as raw attention?",
    x        = "Age",
    y        = "Average kisses received",
    caption  = "LOESS smoother · ribbon = 95% CI."
  ) +
  theme_essay()

Kisses by age — the engagement version of the same story.

4 — Conversion rate: traffic vs genuine appeal

“You’re not just choosing. You’re also strategising.”

Does the top decile convert visits into kisses at a higher rate, or do they simply receive more traffic? This separates algorithmic amplification (scale effect only) from genuine appeal (people who visit actually engage more).

conversion_summary <- data %>%
  group_by(attention_bin) %>%
  summarise(
    avg_conversion = round(mean(conversion_rate, na.rm = TRUE), 4),
    med_conversion = round(median(conversion_rate, na.rm = TRUE), 4),
    avg_kisses     = round(mean(kisses,           na.rm = TRUE), 1),
    avg_visits     = round(mean(profileVisits,    na.rm = TRUE), 0),
    n              = n(),
    .groups = "drop"
  )

knitr::kable(conversion_summary,
             col.names = c("Decile", "Avg conversion", "Median conversion",
                           "Avg kisses", "Avg visits", "n"),
             caption   = "Kisses ÷ profile visits by attention decile")

Kisses ÷ profile visits by attention decile
Decile	Avg conversion	Median conversion	Avg kisses	Avg visits	n
1	0.0464	0.0240	3.1	70	282
2	0.0371	0.0288	9.6	251	282
3	0.0411	0.0335	19.1	454	282
4	0.0417	0.0334	31.1	745	281
5	0.0418	0.0347	48.3	1154	281
6	0.0412	0.0349	72.5	1753	281
7	0.0388	0.0345	108.8	2830	281
8	0.0389	0.0323	177.9	4593	281
9	0.0376	0.0319	291.7	7739	281
10	0.0434	0.0393	932.0	20473	281

ggplot(conversion_summary, aes(x = attention_bin, y = avg_conversion)) +
  geom_col(fill = "#1D9E75", alpha = 0.85, width = 0.7) +
  geom_text(aes(label = sprintf("%.3f", avg_conversion)),
            vjust = -0.45, size = 3.2, colour = "grey40") +
  scale_x_continuous(breaks = 1:10) +
  scale_y_continuous(labels = label_number(accuracy = 0.001),
                     expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Does popularity reflect quality — or just scale?",
    subtitle = "Average kisses per profile visit by attention decile",
    x        = "Attention decile (1 = least visited, 10 = most)",
    y        = "Avg kisses per profile visit",
    caption  = paste0(
      "Flat trend → algorithmic scale effect.\n",
      "Rising trend → top profiles are genuinely more compelling."
    )
  ) +
  theme_essay()

If conversion rises with decile, popular profiles are genuinely more compelling. If flat, the algorithm is manufacturing popularity.

6 OkCupid: cross-platform validation

OkCupid is a Western platform (San Francisco-heavy sample) that skews older and more text-driven than Lovoo. Using it to cross-check demographic patterns and self-presentation behaviour strengthens the generalisation claim. Filtered to female profiles to match the Lovoo sample.

okcupid <- behaviours %>%
  filter(
    !is.na(age), age >= 18, age <= 70,
    sex == "f"
  ) %>%
  mutate(
    essay_length = nchar(replace_na(essay0, "")) +
                   nchar(replace_na(essay1, "")) +
                   nchar(replace_na(essay2, "")) +
                   nchar(replace_na(essay3, "")) +
                   nchar(replace_na(essay4, ""))
  )

cat("OkCupid rows after cleaning:", nrow(okcupid), "\n")

## OkCupid rows after cleaning: 24116

ggplot(okcupid, aes(x = age)) +
  geom_histogram(binwidth = 2, fill = accent_col, alpha = 0.80,
                 colour = "white", linewidth = 0.15) +
  labs(
    title    = "Who is on dating apps? (OkCupid, San Francisco — female profiles)",
    subtitle = "Cross-validates Lovoo demographic patterns on a western audience",
    x        = "Age",
    y        = "Number of profiles",
    caption  = paste0("n = ", format(nrow(okcupid), big.mark = ","), " profiles.")
  ) +
  theme_essay()

OkCupid age distribution — female profiles. Western, text-first app; different user-base, same structural question.

ggplot(okcupid %>% filter(essay_length > 0),
       aes(x = age, y = essay_length)) +
  geom_smooth(method = "loess", span = 0.5, se = TRUE,
              colour = accent_col, fill = accent_col,
              linewidth = 1.2, alpha = 0.12) +
  scale_y_continuous(labels = label_number(big.mark = ",")) +
  labs(
    title    = "Profile depth across the lifespan",
    subtitle = "Essay length (total characters across 5 essays) by age",
    x        = "Age",
    y        = "Total essay length (characters)",
    caption  = paste0(
      "Profiles with zero essay content excluded.\n",
      "Declining trend supports: 'profiles become curated, messages become calculated'."
    )
  ) +
  theme_essay()

Essay length declines with age. This contradicts ‘more experience = more depth’ and supports the optimisation hypothesis.

category_completion <- okcupid %>%
  summarise(
    across(
      c(body_type, diet, drinks, drugs, education,
        job, offspring, pets, religion, smokes),
      ~ mean(!is.na(.) & . != "", na.rm = TRUE),
      .names = "pct_{.col}"
    )
  ) %>%
  pivot_longer(everything(),
               names_to  = "field",
               values_to = "pct_filled") %>%
  mutate(
    field      = str_remove(field, "pct_") %>%
                   str_replace_all("_", " ") %>%
                   str_to_title(),
    pct_filled = round(pct_filled * 100, 1)
  ) %>%
  arrange(desc(pct_filled))

ggplot(category_completion,
       aes(x = reorder(field, pct_filled), y = pct_filled)) +
  geom_col(fill = "#7F77DD", alpha = 0.85, width = 0.7) +
  geom_text(aes(label = paste0(pct_filled, "%")),
            hjust = -0.15, size = 3.2, colour = "grey40") +
  coord_flip() +
  scale_y_continuous(limits  = c(0, 115),
                     labels  = function(x) paste0(x, "%"),
                     expand  = expansion(mult = c(0, 0))) +
  labs(
    title    = "What do people choose to reveal?",
    subtitle = "% of OkCupid profiles that filled each optional field",
    x        = NULL,
    y        = "% of profiles completed",
    caption  = "Fields left blank are a form of strategic omission."
  ) +
  theme_essay()

What people choose to reveal — and what they strategically omit.

Data sources: Lovoo v3 (Kaggle · utkarshx27), OkCupid profiles (Kaggle · andrewmvd)