true

Load libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load data

df <- read_csv("data.csv")
## Rows: 604 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): User ID, Organization, Potential Organization, Account Created At,...
## dbl  (2): Days Active, Emotional Health
## lgl  (3): Dorm, Dont Live In Dorm, Has Active Subscription
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean

Rename the columns

df <- df %>%
  rename(
    user_id = `User ID`,
    organization = `Organization`,
    days_active_total = `Days Active`,
    potential_organization = `Potential Organization`,
    account_created_at = `Account Created At`,
    position = `Affiliation`,
    dorm = `Dorm`,
    not_in_dorm = `Dont Live In Dorm`,
    raw_memory_profile = `Bio`,
    raw_coping_strategy = `Coping Strategies`,
    dietary_constraints = `Dietary Constraints`,
    emotional_health_score = `Emotional Health`,
    raw_favorite_things = `Favorites`,
    raw_goals = `Goals`,
    hobbies = `Hobbies`,
    mental_health_challenges = `Mental Health Challenges`,
    physical_condition = `Physical Condition`,
    preferred_language = `Preferred Language`,
    quirks = `Quirks`,
    relationship_with_sunnie = `Relationship With Sunnie`,
    requests_to_sunnie = `Request To Sunnie`,
    raw_routine = `Routine`,
    values = `Values`,
    is_active_subscriber = `Has Active Subscription`
  )

Fill in NAs for missing cells

df <- df %>%
  mutate(
    organization = na_if(organization, ""),
    potential_organization = na_if(potential_organization, ""),
    position = na_if(position, ""),
    preferred_language = na_if(preferred_language, "")
  )

Demographics of Paid Users

By position

df %>%
  group_by(position) %>%
  summarise(
    n = n(),
    subscribers = sum(is_active_subscriber),
    rate = mean(is_active_subscriber)
  ) %>%
  arrange(desc(rate))
## # A tibble: 8 × 4
##   position                  n subscribers   rate
##   <chr>                 <int>       <int>  <dbl>
## 1 Alumni                    6           3 0.5   
## 2 <NA>                    393         118 0.300 
## 3 Staff                     4           1 0.25  
## 4 Undergraduate student   197           5 0.0254
## 5 Faculty                   1           0 0     
## 6 Master’s student          1           0 0     
## 7 Postdoc                   1           0 0     
## 8 Student (full-time)       1           0 0
df %>%
  mutate(
    position_clean = ifelse(is.na(position), "Unknown", position)
  ) %>%
  group_by(position_clean, is_active_subscriber) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(position_clean) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(position_clean, total), y = n, fill = is_active_subscriber)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    name = "Subscription Status",
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
  ) +
  labs(
    title = "Active vs Cancelled Users by Position",
    x = "Position",
    y = "User Count"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.y = element_text(margin = margin(r = 10)),
    axis.title.x = element_text(margin = margin(t = 10))
  )

By organization

Insight: paid users are not from universities

org_summary <- df %>%
  group_by(organization) %>%
  summarise(
    n = n(),
    subscribers = sum(is_active_subscriber),
    rate = mean(is_active_subscriber)
  ) %>%
  arrange(desc(subscribers))

org_summary
## # A tibble: 16 × 4
##    organization                        n subscribers   rate
##    <chr>                           <int>       <int>  <dbl>
##  1 <NA>                              387         118 0.305 
##  2 Stanford University                 5           3 0.6   
##  3 University of Manitoba              4           2 0.5   
##  4 Chapman University                 54           1 0.0185
##  5 De Anza College                     2           1 0.5   
##  6 University of Canterbury            2           1 0.5   
##  7 University of Hawaiʻi at Mānoa     53           1 0.0189
##  8 Arizona State University            2           0 0     
##  9 Brown University                    1           0 0     
## 10 Kansas State University             8           0 0     
## 11 Skyline College                     1           0 0     
## 12 University of Michigan              1           0 0     
## 13 University of Victoria             40           0 0     
## 14 University of Washington            1           0 0     
## 15 University of Wisconsin–Madison    41           0 0     
## 16 Villanova University                2           0 0
df %>%
  mutate(
    org_clean = ifelse(is.na(organization), "Unknown", organization)
  ) %>%
  group_by(org_clean, is_active_subscriber) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(org_clean) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(org_clean, total), y = n, fill = is_active_subscriber)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    name = "Subscription Status",
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
  ) +
  labs(
    title = "Active  vs Cancelled Users by Organization",
    x = "Organization",
    y = "User Count"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.y = element_text(margin = margin(r = 10)),
    axis.title.x = element_text(margin = margin(t = 10))
  )

## Preferred language

Insight: unsurprisingly, most users are just English users.

lang_summary <- df %>%
  group_by(preferred_language) %>%
  summarise(
    n = n(),
    subscribers = sum(is_active_subscriber),
    rate = mean(is_active_subscriber)
  ) %>%
  arrange(desc(rate))

lang_summary
## # A tibble: 5 × 4
##   preferred_language                     n subscribers  rate
##   <chr>                              <int>       <int> <dbl>
## 1 English (practices written French)     1           1 1    
## 2 Swedish, English                       1           1 1    
## 3 Chinese                                2           1 0.5  
## 4 English                              375         100 0.267
## 5 <NA>                                 225          24 0.107
df %>%
  mutate(
    lang_clean = ifelse(is.na(preferred_language), "Unknown", preferred_language)
  ) %>%
  group_by(lang_clean, is_active_subscriber) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(lang_clean) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(lang_clean, total), y = n, fill = is_active_subscriber)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    name = "Subscription Status",
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
  ) +
  labs(
    title = "Active vs Cancelled Users by Preferred Language",
    x = "Preferred Language",
    y = "User Count"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.y = element_text(margin = margin(r = 10)),
    axis.title.x = element_text(margin = margin(t = 10))
  )

Emotional Health

Insight: Users who end up paying have SLIGHTLY lower emotional health score than those who cancel, but not sig (p = .12). Both groups have average score of ~6 (moderate well-being)

df %>%
  group_by(is_active_subscriber) %>%
  summarise(
    n = n(),
    mean_emotional_health = mean(emotional_health_score, na.rm = TRUE),
    median_emotional_health = median(emotional_health_score, na.rm = TRUE),
    sd_emotional_health = sd(emotional_health_score, na.rm = TRUE)
  )
## # A tibble: 2 × 5
##   is_active_subscriber     n mean_emotional_health median_emotional_health
##   <lgl>                <int>                 <dbl>                   <dbl>
## 1 FALSE                  477                  5.84                       6
## 2 TRUE                   127                  5.57                       6
## # ℹ 1 more variable: sd_emotional_health <dbl>
df %>%
  ggplot(aes(x = emotional_health_score, fill = is_active_subscriber)) +
  geom_density(alpha = 0.4, adjust = 1.8) +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    name = "Subscription Status",
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
  ) +
  labs(
    title = "Distribution of Emotional Health by Subscription Status",
    x = "Emotional Health Score (0–9)",
    y = "Density"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
## Warning: Removed 220 rows containing non-finite outside the scale range
## (`stat_density()`).

df %>%
  ggplot(aes(x = is_active_subscriber, y = emotional_health_score, fill = is_active_subscriber)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    guide = "none"
  ) +
  labs(
    title = "Emotional Health by Subscription Status",
    x = "Subscription Status",
    y = "Emotional Health Score (0–9)"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## Warning: Removed 220 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

t.test(emotional_health_score ~ is_active_subscriber, data = df)
## 
##  Welch Two Sample t-test
## 
## data:  emotional_health_score by is_active_subscriber
## t = 1.5514, df = 217.97, p-value = 0.1222
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -0.07123824  0.59820504
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            5.836299            5.572816

Days active

Insight: Active (paying) subscribers have ~4× higher mean usage than non-subscribers (p < .001)! (30.4 days vs. 8.5 days)

df %>%
  group_by(is_active_subscriber) %>%
  summarise(
    n = n(),
    mean_days = mean(days_active_total, na.rm = TRUE),
    median_days = median(days_active_total, na.rm = TRUE),
    sd_days = sd(days_active_total, na.rm = TRUE)
  )
## # A tibble: 2 × 5
##   is_active_subscriber     n mean_days median_days sd_days
##   <lgl>                <int>     <dbl>       <dbl>   <dbl>
## 1 FALSE                  477      8.51           5    12.6
## 2 TRUE                   127     30.4           14    46.7
df %>%
  ggplot(aes(x = is_active_subscriber, y = days_active_total, fill = is_active_subscriber)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("FALSE"="#d3d3d3", "TRUE"="#1b9e77")) +
  labs(
    title = "Engagement (Days Active) by Subscription Status",
    x = "Subscription Status",
    y = "Total Days Active"
  ) +
  theme_minimal()

t.test(days_active_total ~ is_active_subscriber, data = df)
## 
##  Welch Two Sample t-test
## 
## data:  days_active_total by is_active_subscriber
## t = -5.2413, df = 130.9, p-value = 6.216e-07
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -30.19329 -13.64662
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            8.505241           30.425197

Account age

Insight: Paying users tend to be longer-term users, not new users compared to non-paying.

df <- df %>%
  mutate(
    account_created_at = as.Date(account_created_at, format = "%B %d, %Y"),
    account_age_days = as.numeric(Sys.Date() - account_created_at)
  )

df %>%
  group_by(is_active_subscriber) %>%
  summarise(
    mean_account_age = mean(account_age_days, na.rm = TRUE),
    median_account_age = median(account_age_days, na.rm = TRUE)
  )
## # A tibble: 2 × 3
##   is_active_subscriber mean_account_age median_account_age
##   <lgl>                           <dbl>              <dbl>
## 1 FALSE                            205.                193
## 2 TRUE                             239.                199
df %>%
  filter(!is.na(account_age_days)) %>%
  ggplot(aes(x = account_age_days, fill = is_active_subscriber)) +
  geom_density(alpha = 0.4) +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active"),
    name = "Subscription Status"
  ) +
  labs(
    title = "Distribution of Account Age by Subscription Status",
    x = "Account Age (Days)",
    y = "Density"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )

df %>%
  filter(!is.na(account_age_days)) %>%
  ggplot(aes(x = is_active_subscriber, y = account_age_days, fill = is_active_subscriber)) +
  geom_boxplot(alpha = 0.5, outlier.alpha = 0.6) +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active"),
    name = "Subscription Status"
  ) +
  labs(
    title = "Account Age by Subscription Status",
    x = "Subscription Status",
    y = "Account Age (Days)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "none"
  )

Engagement Velocity (days active/account age)

Insight: Velocity of use (days active per account age) is higher for paying users. Paying users don’t just use more because they’ve been around longer—they are consistently more active.

df <- df %>%
  mutate(
    engagement_velocity = days_active_total / account_age_days
  )

df_velocity <- df %>% 
  filter(!is.na(engagement_velocity), engagement_velocity < Inf)

df_velocity %>%
  group_by(is_active_subscriber) %>%
  summarise(
    n = n(),
    mean_velocity = mean(engagement_velocity, na.rm = TRUE),
    median_velocity = median(engagement_velocity, na.rm = TRUE),
    sd_velocity = sd(engagement_velocity, na.rm = TRUE)
  )
## # A tibble: 2 × 5
##   is_active_subscriber     n mean_velocity median_velocity sd_velocity
##   <lgl>                <int>         <dbl>           <dbl>       <dbl>
## 1 FALSE                  477        0.0388          0.0270      0.0397
## 2 TRUE                   127        0.105           0.0706      0.107
df_velocity %>%
  ggplot(aes(x = engagement_velocity, fill = is_active_subscriber)) +
  geom_density(alpha = 0.4) +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active")
  ) +
  labs(
    title = "Velocity Distribution by Subscription Status",
    x = "Engagement Velocity (Active Days / Account Days)",
    y = "Density"
  ) +
  theme_minimal(base_size = 13)

df_velocity %>%
  ggplot(aes(x = is_active_subscriber, y = engagement_velocity, fill = is_active_subscriber)) +
  geom_boxplot(alpha = 0.5, outlier.alpha = 0.4) +
  geom_jitter(width = 0.15, alpha = 0.3) +
  scale_fill_manual(
    values = c("FALSE" = "#d3d3d3", "TRUE" = "#1b9e77"),
    labels = c("FALSE" = "Cancelled", "TRUE" = "Active"),
    name = "Subscription Status"
  ) +
  labs(
    title = "Engagement Velocity by Subscription Status",
    x = "Subscription Status",
    y = "Days Active per Account Day"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

t.test(engagement_velocity ~ is_active_subscriber, data = df_velocity)
## 
##  Welch Two Sample t-test
## 
## data:  engagement_velocity by is_active_subscriber
## t = -6.8058, df = 135.39, p-value = 2.969e-10
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -0.08486878 -0.04665150
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##          0.03876395          0.10452409

NEXT: would be helpful to know: - when they signed up for the subscription/free trial - days active in first 7 days - activities completed in first 7 days - chats in first 7 days

mood check-ins in first 7 days

Qualitative data

glimpse(df)
## Rows: 604
## Columns: 26
## $ user_id                  <chr> "65c42475412b7170735426ca", "6626dcb6ab6b4c8c…
## $ organization             <chr> NA, NA, NA, NA, NA, "Villanova University", N…
## $ days_active_total        <dbl> 9, 12, 168, 3, 21, 12, 12, 4, 37, 58, 62, 18,…
## $ potential_organization   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Worl…
## $ account_created_at       <date> 2024-02-07, 2024-04-22, 2024-04-29, 2024-07-…
## $ position                 <chr> NA, NA, NA, NA, NA, "Staff", NA, NA, "Postdoc…
## $ dorm                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ not_in_dorm              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ raw_memory_profile       <chr> "[PERSON] is a dedicated writer and blogger a…
## $ raw_coping_strategy      <chr> "[{\"situation\":\"Feeling self-conscious abo…
## $ dietary_constraints      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "\"No for…
## $ emotional_health_score   <dbl> 6, 4, 6, 6, 6, 9, 8, 8, 4, 4, 6, 4, 8, 4, 2, …
## $ raw_favorite_things      <chr> "{\"foods\":[],\"books\":[],\"movies\":[],\"m…
## $ raw_goals                <chr> "{\"shortTerm\":\"[existing] Ensure consisten…
## $ hobbies                  <chr> "[\"Writing\",\"Blogging\",\"Working out\",\"…
## $ mental_health_challenges <chr> "\"Struggles with self-consciousness and worr…
## $ physical_condition       <chr> "{\"past\":[],\"ongoing\":[\"Brain fog (cogni…
## $ preferred_language       <chr> "English", "English", "Chinese", "English", "…
## $ quirks                   <chr> "[\"Constantly checks views and likes after p…
## $ relationship_with_sunnie <chr> "Casual, supportive interaction; uses [PERSON…
## $ requests_to_sunnie       <chr> "Provide gentle prompts, encouragement, brief…
## $ raw_routine              <chr> "{\"morning\":\"\",\"windDown\":\"\",\"other\…
## $ values                   <chr> "[\"Courage\",\"Growth\",\"[PERSON]-worth\",\…
## $ is_active_subscriber     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ account_age_days         <dbl> 812, 737, 730, 659, 612, 566, 545, 503, 439, …
## $ engagement_velocity      <dbl> 0.011083744, 0.016282225, 0.230136986, 0.0045…
summary(df)
##    user_id          organization       days_active_total potential_organization
##  Length:604         Length:604         Min.   :  0.00    Length:604            
##  Class :character   Class :character   1st Qu.:  2.00    Class :character      
##  Mode  :character   Mode  :character   Median :  6.00    Mode  :character      
##                                        Mean   : 13.11                          
##                                        3rd Qu.: 13.00                          
##                                        Max.   :285.00                          
##                                                                                
##  account_created_at     position           dorm         not_in_dorm   
##  Min.   :2024-02-07   Length:604         Mode:logical   Mode:logical  
##  1st Qu.:2025-09-30   Class :character   NA's:604       NA's:604      
##  Median :2025-10-16   Mode  :character                                
##  Mean   :2025-09-29                                                   
##  3rd Qu.:2025-11-03                                                   
##  Max.   :2025-11-22                                                   
##                                                                       
##  raw_memory_profile raw_coping_strategy dietary_constraints
##  Length:604         Length:604          Length:604         
##  Class :character   Class :character    Class :character   
##  Mode  :character   Mode  :character    Mode  :character   
##                                                            
##                                                            
##                                                            
##                                                            
##  emotional_health_score raw_favorite_things  raw_goals        
##  Min.   :0.000          Length:604          Length:604        
##  1st Qu.:5.000          Class :character    Class :character  
##  Median :6.000          Mode  :character    Mode  :character  
##  Mean   :5.766                                                
##  3rd Qu.:7.000                                                
##  Max.   :9.000                                                
##  NA's   :220                                                  
##    hobbies          mental_health_challenges physical_condition
##  Length:604         Length:604               Length:604        
##  Class :character   Class :character         Class :character  
##  Mode  :character   Mode  :character         Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##  preferred_language    quirks          relationship_with_sunnie
##  Length:604         Length:604         Length:604              
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##                                                                
##  requests_to_sunnie raw_routine           values          is_active_subscriber
##  Length:604         Length:604         Length:604         Mode :logical       
##  Class :character   Class :character   Class :character   FALSE:477           
##  Mode  :character   Mode  :character   Mode  :character   TRUE :127           
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  account_age_days engagement_velocity
##  Min.   :158.0    Min.   :0.00000    
##  1st Qu.:177.0    1st Qu.:0.01217    
##  Median :195.0    Median :0.03077    
##  Mean   :211.7    Mean   :0.05259    
##  3rd Qu.:211.0    3rd Qu.:0.06838    
##  Max.   :812.0    Max.   :0.48387    
## 
sapply(df, function(x) sum(is.na(x)))
##                  user_id             organization        days_active_total 
##                        0                      387                        0 
##   potential_organization       account_created_at                 position 
##                      536                        0                      393 
##                     dorm              not_in_dorm       raw_memory_profile 
##                      604                      604                      221 
##      raw_coping_strategy      dietary_constraints   emotional_health_score 
##                      220                      555                      220 
##      raw_favorite_things                raw_goals                  hobbies 
##                      220                      220                      220 
## mental_health_challenges       physical_condition       preferred_language 
##                      261                      220                      225 
##                   quirks relationship_with_sunnie       requests_to_sunnie 
##                      220                      222                      240 
##              raw_routine                   values     is_active_subscriber 
##                      220                      220                        0 
##         account_age_days      engagement_velocity 
##                        0                        0