library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_csv("data.csv")
## Rows: 604 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): User ID, Organization, Potential Organization, Account Created At,...
## dbl (2): Days Active, Emotional Health
## lgl (3): Dorm, Dont Live In Dorm, Has Active Subscription
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- df %>%
rename(
user_id = `User ID`,
organization = `Organization`,
days_active_total = `Days Active`,
potential_organization = `Potential Organization`,
account_created_at = `Account Created At`,
position = `Affiliation`,
dorm = `Dorm`,
not_in_dorm = `Dont Live In Dorm`,
raw_memory_profile = `Bio`,
raw_coping_strategy = `Coping Strategies`,
dietary_constraints = `Dietary Constraints`,
emotional_health_score = `Emotional Health`,
raw_favorite_things = `Favorites`,
raw_goals = `Goals`,
hobbies = `Hobbies`,
mental_health_challenges = `Mental Health Challenges`,
physical_condition = `Physical Condition`,
preferred_language = `Preferred Language`,
quirks = `Quirks`,
relationship_with_sunnie = `Relationship With Sunnie`,
requests_to_sunnie = `Request To Sunnie`,
raw_routine = `Routine`,
values = `Values`,
is_active_subscriber = `Has Active Subscription`
)
df <- df %>%
mutate(
organization = na_if(organization, ""),
potential_organization = na_if(potential_organization, ""),
position = na_if(position, ""),
preferred_language = na_if(preferred_language, "")
)
df %>%
group_by(position) %>%
summarise(
n = n(),
subscribers = sum(is_active_subscriber),
rate = mean(is_active_subscriber)
) %>%
arrange(desc(rate))
## # A tibble: 8 × 4
## position n subscribers rate
## <chr> <int> <int> <dbl>
## 1 Alumni 6 3 0.5
## 2 <NA> 393 118 0.300
## 3 Staff 4 1 0.25
## 4 Undergraduate student 197 5 0.0254
## 5 Faculty 1 0 0
## 6 Master’s student 1 0 0
## 7 Postdoc 1 0 0
## 8 Student (full-time) 1 0 0
df %>%
mutate(
position_clean = ifelse(is.na(position), "Unknown", position)
) %>%
group_by(position_clean, is_active_subscriber) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(position_clean) %>%
mutate(total = sum(n)) %>%
ungroup() %>%
ggplot(aes(x = reorder(position_clean, total), y = n, fill = is_active_subscriber)) +
geom_col() +
coord_flip() +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
name = "Subscription Status",
labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
) +
labs(
title = "Active vs Cancelled Users by Position",
x = "Position",
y = "User Count"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "top",
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5),
axis.title.y = element_text(margin = margin(r = 10)),
axis.title.x = element_text(margin = margin(t = 10))
)
Insight: paid users are not from universities
org_summary <- df %>%
group_by(organization) %>%
summarise(
n = n(),
subscribers = sum(is_active_subscriber),
rate = mean(is_active_subscriber)
) %>%
arrange(desc(subscribers))
org_summary
## # A tibble: 16 × 4
## organization n subscribers rate
## <chr> <int> <int> <dbl>
## 1 <NA> 387 118 0.305
## 2 Stanford University 5 3 0.6
## 3 University of Manitoba 4 2 0.5
## 4 Chapman University 54 1 0.0185
## 5 De Anza College 2 1 0.5
## 6 University of Canterbury 2 1 0.5
## 7 University of Hawaiʻi at Mānoa 53 1 0.0189
## 8 Arizona State University 2 0 0
## 9 Brown University 1 0 0
## 10 Kansas State University 8 0 0
## 11 Skyline College 1 0 0
## 12 University of Michigan 1 0 0
## 13 University of Victoria 40 0 0
## 14 University of Washington 1 0 0
## 15 University of Wisconsin–Madison 41 0 0
## 16 Villanova University 2 0 0
df %>%
mutate(
org_clean = ifelse(is.na(organization), "Unknown", organization)
) %>%
group_by(org_clean, is_active_subscriber) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(org_clean) %>%
mutate(total = sum(n)) %>%
ungroup() %>%
ggplot(aes(x = reorder(org_clean, total), y = n, fill = is_active_subscriber)) +
geom_col() +
coord_flip() +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
name = "Subscription Status",
labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
) +
labs(
title = "Active vs Cancelled Users by Organization",
x = "Organization",
y = "User Count"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "top",
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5),
axis.title.y = element_text(margin = margin(r = 10)),
axis.title.x = element_text(margin = margin(t = 10))
)
## Preferred language
Insight: unsurprisingly, most users are just English users.
lang_summary <- df %>%
group_by(preferred_language) %>%
summarise(
n = n(),
subscribers = sum(is_active_subscriber),
rate = mean(is_active_subscriber)
) %>%
arrange(desc(rate))
lang_summary
## # A tibble: 5 × 4
## preferred_language n subscribers rate
## <chr> <int> <int> <dbl>
## 1 English (practices written French) 1 1 1
## 2 Swedish, English 1 1 1
## 3 Chinese 2 1 0.5
## 4 English 375 100 0.267
## 5 <NA> 225 24 0.107
df %>%
mutate(
lang_clean = ifelse(is.na(preferred_language), "Unknown", preferred_language)
) %>%
group_by(lang_clean, is_active_subscriber) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(lang_clean) %>%
mutate(total = sum(n)) %>%
ungroup() %>%
ggplot(aes(x = reorder(lang_clean, total), y = n, fill = is_active_subscriber)) +
geom_col() +
coord_flip() +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
name = "Subscription Status",
labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
) +
labs(
title = "Active vs Cancelled Users by Preferred Language",
x = "Preferred Language",
y = "User Count"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "top",
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5),
axis.title.y = element_text(margin = margin(r = 10)),
axis.title.x = element_text(margin = margin(t = 10))
)
Insight: Users who end up paying have SLIGHTLY lower emotional health score than those who cancel, but not sig (p = .12). Both groups have average score of ~6 (moderate well-being)
df %>%
group_by(is_active_subscriber) %>%
summarise(
n = n(),
mean_emotional_health = mean(emotional_health_score, na.rm = TRUE),
median_emotional_health = median(emotional_health_score, na.rm = TRUE),
sd_emotional_health = sd(emotional_health_score, na.rm = TRUE)
)
## # A tibble: 2 × 5
## is_active_subscriber n mean_emotional_health median_emotional_health
## <lgl> <int> <dbl> <dbl>
## 1 FALSE 477 5.84 6
## 2 TRUE 127 5.57 6
## # ℹ 1 more variable: sd_emotional_health <dbl>
df %>%
ggplot(aes(x = emotional_health_score, fill = is_active_subscriber)) +
geom_density(alpha = 0.4, adjust = 1.8) +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
name = "Subscription Status",
labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
) +
labs(
title = "Distribution of Emotional Health by Subscription Status",
x = "Emotional Health Score (0–9)",
y = "Density"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "top",
plot.title = element_text(face = "bold", hjust = 0.5)
)
## Warning: Removed 220 rows containing non-finite outside the scale range
## (`stat_density()`).
df %>%
ggplot(aes(x = is_active_subscriber, y = emotional_health_score, fill = is_active_subscriber)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
guide = "none"
) +
labs(
title = "Emotional Health by Subscription Status",
x = "Subscription Status",
y = "Emotional Health Score (0–9)"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
## Warning: Removed 220 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
t.test(emotional_health_score ~ is_active_subscriber, data = df)
##
## Welch Two Sample t-test
##
## data: emotional_health_score by is_active_subscriber
## t = 1.5514, df = 217.97, p-value = 0.1222
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -0.07123824 0.59820504
## sample estimates:
## mean in group FALSE mean in group TRUE
## 5.836299 5.572816
Insight: Active (paying) subscribers have ~4× higher mean usage than non-subscribers (p < .001)! (30.4 days vs. 8.5 days)
df %>%
group_by(is_active_subscriber) %>%
summarise(
n = n(),
mean_days = mean(days_active_total, na.rm = TRUE),
median_days = median(days_active_total, na.rm = TRUE),
sd_days = sd(days_active_total, na.rm = TRUE)
)
## # A tibble: 2 × 5
## is_active_subscriber n mean_days median_days sd_days
## <lgl> <int> <dbl> <dbl> <dbl>
## 1 FALSE 477 8.51 5 12.6
## 2 TRUE 127 30.4 14 46.7
df %>%
ggplot(aes(x = is_active_subscriber, y = days_active_total, fill = is_active_subscriber)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("FALSE"="#d3d3d3", "TRUE"="#1b9e77")) +
labs(
title = "Engagement (Days Active) by Subscription Status",
x = "Subscription Status",
y = "Total Days Active"
) +
theme_minimal()
t.test(days_active_total ~ is_active_subscriber, data = df)
##
## Welch Two Sample t-test
##
## data: days_active_total by is_active_subscriber
## t = -5.2413, df = 130.9, p-value = 6.216e-07
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -30.19329 -13.64662
## sample estimates:
## mean in group FALSE mean in group TRUE
## 8.505241 30.425197
Insight: Paying users tend to be longer-term users, not new users compared to non-paying.
df <- df %>%
mutate(
account_created_at = as.Date(account_created_at, format = "%B %d, %Y"),
account_age_days = as.numeric(Sys.Date() - account_created_at)
)
df %>%
group_by(is_active_subscriber) %>%
summarise(
mean_account_age = mean(account_age_days, na.rm = TRUE),
median_account_age = median(account_age_days, na.rm = TRUE)
)
## # A tibble: 2 × 3
## is_active_subscriber mean_account_age median_account_age
## <lgl> <dbl> <dbl>
## 1 FALSE 205. 193
## 2 TRUE 239. 199
df %>%
filter(!is.na(account_age_days)) %>%
ggplot(aes(x = account_age_days, fill = is_active_subscriber)) +
geom_density(alpha = 0.4) +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
labels = c("FALSE" = "Cancelled", "TRUE" = "Active"),
name = "Subscription Status"
) +
labs(
title = "Distribution of Account Age by Subscription Status",
x = "Account Age (Days)",
y = "Density"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
legend.position = "top"
)
df %>%
filter(!is.na(account_age_days)) %>%
ggplot(aes(x = is_active_subscriber, y = account_age_days, fill = is_active_subscriber)) +
geom_boxplot(alpha = 0.5, outlier.alpha = 0.6) +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
labels = c("FALSE" = "Cancelled", "TRUE" = "Active"),
name = "Subscription Status"
) +
labs(
title = "Account Age by Subscription Status",
x = "Subscription Status",
y = "Account Age (Days)"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
legend.position = "none"
)
Insight: Velocity of use (days active per account age) is higher for paying users. Paying users don’t just use more because they’ve been around longer—they are consistently more active.
df <- df %>%
mutate(
engagement_velocity = days_active_total / account_age_days
)
df_velocity <- df %>%
filter(!is.na(engagement_velocity), engagement_velocity < Inf)
df_velocity %>%
group_by(is_active_subscriber) %>%
summarise(
n = n(),
mean_velocity = mean(engagement_velocity, na.rm = TRUE),
median_velocity = median(engagement_velocity, na.rm = TRUE),
sd_velocity = sd(engagement_velocity, na.rm = TRUE)
)
## # A tibble: 2 × 5
## is_active_subscriber n mean_velocity median_velocity sd_velocity
## <lgl> <int> <dbl> <dbl> <dbl>
## 1 FALSE 477 0.0388 0.0270 0.0397
## 2 TRUE 127 0.105 0.0706 0.107
df_velocity %>%
ggplot(aes(x = engagement_velocity, fill = is_active_subscriber)) +
geom_density(alpha = 0.4) +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
) +
labs(
title = "Velocity Distribution by Subscription Status",
x = "Engagement Velocity (Active Days / Account Days)",
y = "Density"
) +
theme_minimal(base_size = 13)
df_velocity %>%
ggplot(aes(x = is_active_subscriber, y = engagement_velocity, fill = is_active_subscriber)) +
geom_boxplot(alpha = 0.5, outlier.alpha = 0.4) +
geom_jitter(width = 0.15, alpha = 0.3) +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
labels = c("FALSE" = "Cancelled", "TRUE" = "Active"),
name = "Subscription Status"
) +
labs(
title = "Engagement Velocity by Subscription Status",
x = "Subscription Status",
y = "Days Active per Account Day"
) +
theme_minimal(base_size = 13) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5)
)
t.test(engagement_velocity ~ is_active_subscriber, data = df_velocity)
##
## Welch Two Sample t-test
##
## data: engagement_velocity by is_active_subscriber
## t = -6.8058, df = 135.39, p-value = 2.969e-10
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -0.08486878 -0.04665150
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.03876395 0.10452409
NEXT: would be helpful to know: - when they signed up for the subscription/free trial - days active in first 7 days - activities completed in first 7 days - chats in first 7 days
mood check-ins in first 7 days
glimpse(df)
## Rows: 604
## Columns: 26
## $ user_id <chr> "65c42475412b7170735426ca", "6626dcb6ab6b4c8c…
## $ organization <chr> NA, NA, NA, NA, NA, "Villanova University", N…
## $ days_active_total <dbl> 9, 12, 168, 3, 21, 12, 12, 4, 37, 58, 62, 18,…
## $ potential_organization <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Worl…
## $ account_created_at <date> 2024-02-07, 2024-04-22, 2024-04-29, 2024-07-…
## $ position <chr> NA, NA, NA, NA, NA, "Staff", NA, NA, "Postdoc…
## $ dorm <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ not_in_dorm <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ raw_memory_profile <chr> "[PERSON] is a dedicated writer and blogger a…
## $ raw_coping_strategy <chr> "[{\"situation\":\"Feeling self-conscious abo…
## $ dietary_constraints <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "\"No for…
## $ emotional_health_score <dbl> 6, 4, 6, 6, 6, 9, 8, 8, 4, 4, 6, 4, 8, 4, 2, …
## $ raw_favorite_things <chr> "{\"foods\":[],\"books\":[],\"movies\":[],\"m…
## $ raw_goals <chr> "{\"shortTerm\":\"[existing] Ensure consisten…
## $ hobbies <chr> "[\"Writing\",\"Blogging\",\"Working out\",\"…
## $ mental_health_challenges <chr> "\"Struggles with self-consciousness and worr…
## $ physical_condition <chr> "{\"past\":[],\"ongoing\":[\"Brain fog (cogni…
## $ preferred_language <chr> "English", "English", "Chinese", "English", "…
## $ quirks <chr> "[\"Constantly checks views and likes after p…
## $ relationship_with_sunnie <chr> "Casual, supportive interaction; uses [PERSON…
## $ requests_to_sunnie <chr> "Provide gentle prompts, encouragement, brief…
## $ raw_routine <chr> "{\"morning\":\"\",\"windDown\":\"\",\"other\…
## $ values <chr> "[\"Courage\",\"Growth\",\"[PERSON]-worth\",\…
## $ is_active_subscriber <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ account_age_days <dbl> 812, 737, 730, 659, 612, 566, 545, 503, 439, …
## $ engagement_velocity <dbl> 0.011083744, 0.016282225, 0.230136986, 0.0045…
summary(df)
## user_id organization days_active_total potential_organization
## Length:604 Length:604 Min. : 0.00 Length:604
## Class :character Class :character 1st Qu.: 2.00 Class :character
## Mode :character Mode :character Median : 6.00 Mode :character
## Mean : 13.11
## 3rd Qu.: 13.00
## Max. :285.00
##
## account_created_at position dorm not_in_dorm
## Min. :2024-02-07 Length:604 Mode:logical Mode:logical
## 1st Qu.:2025-09-30 Class :character NA's:604 NA's:604
## Median :2025-10-16 Mode :character
## Mean :2025-09-29
## 3rd Qu.:2025-11-03
## Max. :2025-11-22
##
## raw_memory_profile raw_coping_strategy dietary_constraints
## Length:604 Length:604 Length:604
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## emotional_health_score raw_favorite_things raw_goals
## Min. :0.000 Length:604 Length:604
## 1st Qu.:5.000 Class :character Class :character
## Median :6.000 Mode :character Mode :character
## Mean :5.766
## 3rd Qu.:7.000
## Max. :9.000
## NA's :220
## hobbies mental_health_challenges physical_condition
## Length:604 Length:604 Length:604
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## preferred_language quirks relationship_with_sunnie
## Length:604 Length:604 Length:604
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## requests_to_sunnie raw_routine values is_active_subscriber
## Length:604 Length:604 Length:604 Mode :logical
## Class :character Class :character Class :character FALSE:477
## Mode :character Mode :character Mode :character TRUE :127
##
##
##
##
## account_age_days engagement_velocity
## Min. :158.0 Min. :0.00000
## 1st Qu.:177.0 1st Qu.:0.01217
## Median :195.0 Median :0.03077
## Mean :211.7 Mean :0.05259
## 3rd Qu.:211.0 3rd Qu.:0.06838
## Max. :812.0 Max. :0.48387
##
sapply(df, function(x) sum(is.na(x)))
## user_id organization days_active_total
## 0 387 0
## potential_organization account_created_at position
## 536 0 393
## dorm not_in_dorm raw_memory_profile
## 604 604 221
## raw_coping_strategy dietary_constraints emotional_health_score
## 220 555 220
## raw_favorite_things raw_goals hobbies
## 220 220 220
## mental_health_challenges physical_condition preferred_language
## 261 220 225
## quirks relationship_with_sunnie requests_to_sunnie
## 220 222 240
## raw_routine values is_active_subscriber
## 220 220 0
## account_age_days engagement_velocity
## 0 0