Motiv8AI - Home Assignment

Author

Noam Keshet

Published

September 30, 2025

library(tidyverse)
library(readr)
library(readxl)
library(openxlsx)
library(psych)
library(GPArotation)
library(multiway)
library(here)
library(explore)
library(rstatix)
library(janitor)
library(gt)
library(tidylog)

1. Data Import

#read files
user_apps <- read_csv("/Users/noam.keshet/Downloads/user_apps.csv")
app_data <- read_csv("/Users/noam.keshet/Downloads/app_data.csv")

#merge the file using the key of packageName
merge <- left_join(user_apps, app_data, by = "packageName")

#show glimpse of the new df
merge %>% head() %>% gt() %>% tab_options(table.font.size = 10)
userId packageName app_name summary short_desc category
0003fd49-0a21-423b-8c5c-18555baeb475 br.com.correios.preatendimento Correios Official application of the Post Office. This is the official Post Office App. The application offers simple, reliable and intuitive self-service solutions to make your daily life easier. N/A
0003fd49-0a21-423b-8c5c-18555baeb475 br.com.doctoralia Doctoralia: agende seu médico Private consultations and health plans with the best doctors and clinics We welcome you to Doctoralia, the world's leading healthcare platform. Schedule your appointments online and in person with the best specialists. N/A
0003fd49-0a21-423b-8c5c-18555baeb475 br.com.light.ava Light Clientes Service channel of Light, power company of RJ The Light Clientes app has a new look and new digital services: •Proof of payment • Adhesion to the digital invoice • Reconnection after cut • Installment of debts •Contract termination And still counting on the services of power outage, bar code, registration update and consumption history. N/A
0003fd49-0a21-423b-8c5c-18555baeb475 br.com.lojasrenner Lojas Renner - Comprar Roupas Black Friday Renner: Clothes, Shoes and Beauty! Download the App and Get 15% OFF* Are you ready for the biggest promotion of the year? Black Friday has arrived at Renner! Enjoy Black Friday at Renner with women's, men's and children's clothing on sale. Download the app, use the coupon VOUDEAPP and get 15% OFF on your first purchase!* What to expect from Black Friday at Renner: shopping
0003fd49-0a21-423b-8c5c-18555baeb475 br.com.serasaexperian.consumidor Serasa: Consulta CPF e Score CPF and Score consultation, card/loan request and digital wallet. Hi, you're at Serasa! And this is much better than you can imagine. 😉 In our app, checking your CPF is free, and there are several services that help you get credit. N/A
0003fd49-0a21-423b-8c5c-18555baeb475 br.com.webdiet.webdiet WebDiet - App para pacientes How about watch your diet in your palm when you need ?! Let's forget the prey diet in the fridge !? There is no more reason for you to forget your diet at home or else carry a paperwork confusing and relevant things. If your nutritionist prescribes diet by WebDiet platform, download this app and have everything in the palm. N/A

2. Explore Main Variables

# Apps categories

merge %>% 
  freq_table(category) %>%
  arrange(-n) %>% 
  gt() %>% opt_interactive()
# Apps Names
merge %>% 
  freq_table(app_name,na.rm=F) %>%
  arrange(-n) %>% 
  filter(n>=35) %>% 
  gt() %>% opt_interactive()
filter: removed 2,042 rows (97%), 59 rows remaining

3. Tidy the Categories

#step 1 - export empty categorized apps name (only n>10)
merge %>%
  filter(category == "N/A") %>%
  freq_table(app_name) %>%
  arrange(-n) %>%
  filter(n > 10) %>%
  openxlsx::write.xlsx(file = "category_tidy.xlsx")
filter: removed 2,591 rows (30%), 6,150 rows remaining
filter: removed 1,435 rows (93%), 101 rows remaining
# step 2 - after tidy using chatGPT - import tidy categories
cat_tidy <- read.csv("/Users/noam.keshet/Downloads/cat_tidy.csv")

# replace new tidy apps categories in cases when no category what placed
tidy_merge <- merge %>%
  left_join(cat_tidy, by = "app_name") %>%
  mutate(category = case_when(category == "N/A" ~ category_tidy, TRUE ~ category))
left_join: added one column (category_tidy)
           > rows only in x         4,977
           > rows only in cat_tidy (    1)
           > matched rows           3,764
           >                       =======
           > rows total             8,741
mutate: changed 6,150 values (70%) of 'category' (2,386 new NAs)
# After the tidy the percentage of N/A reduce from 70% to 27%
tidy_merge %>% 
  freq_table(category,na.rm =F) %>%
  arrange(-n) %>% 
  gt()  %>% opt_interactive()

4. Category-Level Analysis

# To process the data at the userId level, a pivot of the categories was implemented.
category <- tidy_merge %>%
  group_by(userId, category) %>%
  summarise(num_apps = n_distinct(app_name), .groups = "drop") %>%
  pivot_wider(
    id_cols = userId,
    names_from = category,
    values_from = num_apps,
    values_fill = 0
  ) %>%
  
# Adding a new variable, sum_apps, that counts the number of apps per user in all categories, excluding apps that are not categorized.  
  mutate(sum_apps = rowSums(select(., -c(userId, `NA`)), na.rm = TRUE)) %>% 
   relocate(sum_apps, .after = "userId") %>%
  relocate(`NA`, .after = "sum_apps")
group_by: 2 grouping variables (userId, category)
summarise: now 1,894 rows and 3 columns, ungrouped
pivot_wider: reorganized (category, num_apps) into (bank, casual_game, communication_messaging, credit_card, gadget, …) [was 1894x3, now 136x20]
select: dropped 2 variables (userId, NA)
mutate: new variable 'sum_apps' (double) with 60 unique values and 0% NA
relocate: columns reordered (userId, sum_apps, bank, casual_game, communication_messaging, …)
relocate: columns reordered (userId, sum_apps, NA, bank, casual_game, …)
category %>% gt() %>% opt_interactive() %>% tab_options(table.font.size = 10)

5. Creating the Features

add_parent_category <- category %>%
  mutate(
    # --- Create Parent Categories (combine related app types) ---
    finance_apps    = bank + credit_card + investments,  # financial-related apps
    social_apps     = communication_messaging + social_media + photography,  # social interaction
    info_apps       = education + news_and_weather,  # information-seeking
    consumer_apps   = shopping + food_delivery + health_and_fitness,  # consumption-oriented
    entertiment_apps = casual_game + gambling + tv_and_video_streaming,  # leisure/entertainment
    utilities_apps  = public_transportation + gadget,  # practical utilities

    # --- Diversity of apps ---
    # Measures variability across all app categories per user
    # High SD = the user has a mix of very different categories
    diversity_apps = apply(select(., bank:gambling), 1, sd, na.rm = TRUE),

    # --- Ratios: proportion of each domain relative to total apps ---
    ratio_finance   = finance_apps / sum_apps,
    ratio_social    = social_apps / sum_apps,
    ratio_info      = info_apps / sum_apps,
    ratio_consumer  = consumer_apps / sum_apps,
    ratio_shopping  = shopping / sum_apps,
    ratio_gambling  = gambling / sum_apps,
    ratio_work      = work / sum_apps,

    # --- Leisure vs Work balance ---
    # If user has work apps, compare leisure apps (consumer + entertainment) to work
    ratio_leisure_work = ifelse(work > 0,
                                (consumer_apps + entertiment_apps) / work,
                                NA_real_),

    # --- Risk vs Security group classification ---
    # Compare apps that reflect "risk" (gambling, investments, credit cards)
    # vs apps that reflect "security" (bank, religion, health/fitness)
    ratio_risk_security_group = case_when(
      (gambling + investments + credit_card) > (bank + religion + health_and_fitness) ~ "risk > security",
      (gambling + investments + credit_card) < (bank + religion + health_and_fitness) ~ "security > risk",
      TRUE ~ "security = risk"
    ),

    # --- Risk-Security numerical difference ---
    # Positive = more risk-oriented apps
    # 0 = balance
    # Negative = more security-oriented apps
    ratio_risk_security = (gambling + investments + credit_card) -
                          (bank + religion + health_and_fitness)
  )
select: dropped 3 variables (userId, sum_apps, NA)
mutate: new variable 'finance_apps' (integer) with 19 unique values and 0% NA
        new variable 'social_apps' (integer) with 20 unique values and 0% NA
        new variable 'info_apps' (integer) with 9 unique values and 0% NA
        new variable 'consumer_apps' (integer) with 21 unique values and 0% NA
        new variable 'entertiment_apps' (integer) with 17 unique values and 0% NA
        new variable 'utilities_apps' (integer) with 21 unique values and 0% NA
        new variable 'diversity_apps' (double) with 128 unique values and 0% NA
        new variable 'ratio_finance' (double) with 99 unique values and 0% NA
        new variable 'ratio_social' (double) with 100 unique values and 0% NA
        new variable 'ratio_info' (double) with 80 unique values and 0% NA
        new variable 'ratio_consumer' (double) with 96 unique values and 0% NA
        new variable 'ratio_shopping' (double) with 89 unique values and 0% NA
        new variable 'ratio_gambling' (double) with 19 unique values and 0% NA
        new variable 'ratio_work' (double) with 101 unique values and 0% NA
        new variable 'ratio_leisure_work' (double) with 71 unique values and 1% NA
        new variable 'ratio_risk_security_group' (character) with 3 unique values and 0% NA
        new variable 'ratio_risk_security' (integer) with 19 unique values and 0% NA
add_parent_category %>% 
  select(userId:sum_apps, diversity_apps:ratio_risk_security) %>% 
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>% 
  gt() %>% opt_interactive() %>% tab_options(table.font.size = 10)
select: dropped 25 variables (NA, bank, casual_game, communication_messaging, credit_card, …)
mutate: changed 135 values (99%) of 'diversity_apps' (0 new NAs)
        changed 126 values (93%) of 'ratio_finance' (0 new NAs)
        changed 124 values (91%) of 'ratio_social' (0 new NAs)
        changed 109 values (80%) of 'ratio_info' (0 new NAs)
        changed 125 values (92%) of 'ratio_consumer' (0 new NAs)
        changed 123 values (90%) of 'ratio_shopping' (0 new NAs)
        changed 22 values (16%) of 'ratio_gambling' (0 new NAs)
        changed 127 values (93%) of 'ratio_work' (0 new NAs)
        changed 49 values (36%) of 'ratio_leisure_work' (0 new NAs)
        converted 'ratio_risk_security' from integer to double (0 new NA)

6. Summary of Features

add_parent_category %>% 
  select(userId:sum_apps, diversity_apps, ends_with("_apps"),gambling,work,religion) %>% 
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>% 
   get_summary_stats() %>%  
    gt() %>% opt_interactive() %>% tab_options(table.font.size = 12)
select: dropped 26 variables (NA, bank, casual_game, communication_messaging, credit_card, …)
mutate: changed 135 values (99%) of 'diversity_apps' (0 new NAs)
        converted 'finance_apps' from integer to double (0 new NA)
        converted 'social_apps' from integer to double (0 new NA)
        converted 'info_apps' from integer to double (0 new NA)
        converted 'consumer_apps' from integer to double (0 new NA)
        converted 'entertiment_apps' from integer to double (0 new NA)
        converted 'utilities_apps' from integer to double (0 new NA)
        converted 'gambling' from integer to double (0 new NA)
        converted 'work' from integer to double (0 new NA)
        converted 'religion' from integer to double (0 new NA)
add_parent_category %>%
  select(starts_with("ratio_")) %>%
  get_summary_stats() %>%  
    gt() %>% opt_interactive() %>% tab_options(table.font.size = 12)
select: dropped 28 variables (userId, sum_apps, NA, bank, casual_game, …)
# show correlation of ration apps
add_parent_category %>%
  select(contains("ratio")) %>%
  select_if(is.numeric) %>%
  select(-ratio_leisure_work) %>% 
  cor() %>%
  round(2) %>%
  knitr::kable()  
select: dropped 28 variables (userId, sum_apps, NA, bank, casual_game, …)
select_if: dropped one variable (ratio_risk_security_group)
select: dropped one variable (ratio_leisure_work)
ratio_finance ratio_social ratio_info ratio_consumer ratio_shopping ratio_gambling ratio_work ratio_risk_security
ratio_finance 1.00 -0.28 -0.18 -0.20 -0.04 0.05 -0.07 0.12
ratio_social -0.28 1.00 -0.08 -0.28 -0.22 -0.10 -0.07 -0.03
ratio_info -0.18 -0.08 1.00 0.03 -0.15 -0.06 -0.05 -0.03
ratio_consumer -0.20 -0.28 0.03 1.00 0.81 -0.07 -0.35 -0.20
ratio_shopping -0.04 -0.22 -0.15 0.81 1.00 -0.05 -0.33 0.08
ratio_gambling 0.05 -0.10 -0.06 -0.07 -0.05 1.00 0.15 0.20
ratio_work -0.07 -0.07 -0.05 -0.35 -0.33 0.15 1.00 -0.04
ratio_risk_security 0.12 -0.03 -0.03 -0.20 0.08 0.20 -0.04 1.00