Dating - App Reviews Dataset

Data Set

The data covers 2017-2022. I acquired it from the Google Play Store using googleplayscraper. The raw export contained more columns than the ones shown here; the extra columns were unnecessary and were dropped.
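
As a rough, hypothetical sketch of that pruning step (the raw file name and raw column names below are assumptions, not the actual scrape), the export could be reduced to just the columns used in this analysis:

# Hypothetical sketch: keep only the columns this analysis needs.
# "raw_reviews.csv" and the exact raw column names are assumptions.
library(readr)
library(dplyr)

raw_reviews <- read_csv("raw_reviews.csv")

reviews <- raw_reviews %>%
  select(ID, Review, Name, Rating, ThumbsUp, DateTime, App)

write_csv(reviews, "Assignment2.csv")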

knitr::opts_chunk$set(message = FALSE)

Loading required packages

library(readtext)
library(tidyverse)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(textrank)
library(udpipe)
library(topicmodels)
library(tidytext)
library(topicdoc)
library(LDAvis)
library(broom)
library(ldatuning)
library(stm)
library(seededlda)
library(keyATM)
library(kableExtra)

Describe data

Dating_df <- readtext::readtext("Assignment2.csv", text_field = "Review", docid_field = "ID")
glimpse(Dating_df)
## Rows: 1,000
## Columns: 7
## $ doc_id     <chr> "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11~
## $ text       <chr> "On this app i cant find a partner", "Tinder would be so mu~
## $ Name       <chr> "linah sibanda", "Norman Johnson", "David Hume", "Last 1 St~
## $ Rating     <int> 5, 3, 1, 2, 5, 5, 5, 1, 5, 3, 2, 5, 1, 1, 1, 5, 1, 5, 1, 5,~
## $ X.ThumbsUp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Date.Time  <chr> "18-02-2022 01:19", "18-02-2022 01:16", "18-02-2022 01:11",~
## $ App        <chr> "Tinder", "Tinder", "Tinder", "Tinder", "Tinder", "Tinder",~

Create corpus

Dating_corp <- corpus(Dating_df)
summary(Dating_corp, n = 5)
## Corpus consisting of 1000 documents, showing 5 documents:
## 
##  Text Types Tokens Sentences             Name Rating X.ThumbsUp
##     0     8      8         1    linah sibanda      5          0
##     1    22     23         1   Norman Johnson      3          0
##     2    61     72         7       David Hume      1          0
##     3    39     45         1  Last 1 Standing      2          0
##     4     3      3         1 Arthur Magamedov      5          0
##         Date.Time    App
##  18-02-2022 01:19 Tinder
##  18-02-2022 01:16 Tinder
##  18-02-2022 01:11 Tinder
##  18-02-2022 01:11 Tinder
##  18-02-2022 01:06 Tinder

Tokens and DFM

Dating_toks <- tokens(
  Dating_corp,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_url = TRUE,
  split_hyphens = FALSE)

myStopWords = c("Where", "when", "shall", "include", "including", "by",
                "includes", "included", "may", "uses", "using", "used", "may",
                "also", "can", "whether", "so", "however", "rather", "â", "s",
                "said", "one", "two", "three", "k")

Dating_toks2 <- tokens_remove(
  Dating_toks, pattern = c(stopwords("en"), myStopWords))

Feature Matrix

Dating_dfm <- dfm(Dating_toks2)
Dating_dfm
## Document-feature matrix of: 1,000 documents, 2,765 features (99.65% sparse) and 5 docvars.
##     features
## docs app cant find partner tinder much better specify race looking
##    0   1    1    1       1      0    0      0       0    0       0
##    1   0    0    0       0      1    1      1       1    1       1
##    2   1    0    0       0      0    0      0       0    0       0
##    3   0    0    0       0      1    0      0       0    0       1
##    4   0    0    0       0      0    0      0       0    0       0
##    5   1    0    0       0      0    0      0       0    0       0
## [ reached max_ndoc ... 994 more documents, reached max_nfeat ... 2,755 more features ]
Dating_dfm <- dfm(Dating_toks2, tolower = TRUE) %>%
  dfm_trim(min_termfreq = 3, min_docfreq = 10)

Dating_dfm
## Document-feature matrix of: 1,000 documents, 180 features (97.39% sparse) and 5 docvars.
##     features
## docs app find tinder much better looking swiping still matches messages
##    0   1    1      0    0      0       0       0     0       0        0
##    1   0    0      1    1      1       1       1     0       0        0
##    2   1    0      0    0      0       0       0     1       1        1
##    3   0    0      1    0      0       1       0     0       0        0
##    4   0    0      0    0      0       0       0     0       0        0
##    5   1    0      0    0      0       0       0     0       0        0
## [ reached max_ndoc ... 994 more documents, reached max_nfeat ... 170 more features ]

Top Features

ndoc(Dating_dfm)
## [1] 1000
nfeat(Dating_dfm)
## [1] 180
topfeatures(Dating_dfm, 30)
##          app       tinder       banned       people         just          get 
##          385          148          136          136          114          113 
##         good      account        money           ðÿ         time         like 
##          105          101           95           93           89           88 
##         even          pay      matches          got       reason         fake 
##           80           67           66           63           62           56 
##        match          now        never          use         many        waste 
##           54           53           51           49           49           48 
##      profile         want        likes subscription          see     profiles 
##           43           42           42           42           41           40

3 Relevant Keywords

keyword1 <- kwic(Dating_toks2, pattern = phrase("learn*"), window = 2)
head(keyword1, 5)
## Keyword-in-context with 1 match.                                             
##  [723, 6] ladies man | learned | lot socially
keyword2 <- kwic(Dating_toks2, pattern = phrase("content*"), window = 2)
head(keyword2, 5)
## Keyword-in-context with 3 matches.                                                         
##  [484, 13] post inappropriate | content | Stupid app     
##  [779, 10]     buying premium | content | Waste time     
##  [792, 44]  profile offensive | content | amicable convos
keyword3 <- kwic(Dating_toks2, pattern = phrase("course*"), window = 2)
head(keyword3, 5)
## Keyword-in-context with 5 matches.                                                           
##  [110, 22]   months total | course | order know            
##  [138, 35]  except really | course | never reveal          
##  [214, 18]      log email | course | lets send             
##  [435, 26]    called scam | course | possible communication
##  [912, 13] business model | course |
head(keyword1, 5) %>%
  kbl() %>%
  kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
docname  from  to  pre         keyword  post          pattern
723         6   6  ladies man  learned  lot socially  learn*
head(keyword2, 5) %>%
  kbl() %>%
  kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
docname  from  to  pre                 keyword  post             pattern
484        13  13  post inappropriate  content  Stupid app       content*
779        10  10  buying premium      content  Waste time       content*
792        44  44  profile offensive   content  amicable convos  content*
head(keyword3, 5) %>%
  kbl() %>%
  kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
docname  from  to  pre             keyword  post                    pattern
110        22  22  months total    course   order know              course*
138        35  35  except really   course   never reveal            course*
214        18  18  log email       course   lets send               course*
435        26  26  called scam     course   possible communication  course*
912        13  13  business model  course                           course*
Dating_dfm_small <- dfm_trim(Dating_dfm, min_termfreq = 100)
nfeat(Dating_dfm_small)
## [1] 8
Dating_fcm <- fcm(Dating_dfm_small)

feat <- names(topfeatures(Dating_fcm, 30))
fcmat_select <- fcm_select(Dating_fcm, pattern = feat, selection = "keep")

Plot

size <- log(colSums(dfm_select(Dating_fcm, feat, selection = "keep")))

set.seed(123)


textplot_network(fcmat_select, min_freq = 0.5, edge_size = 2, edge_color = "red",
                 vertex_size = size/max(size)*3)

LDA

Dating_dtmat = quanteda::convert(Dating_dfm, to="topicmodels")
Dating_lda5 <- LDA(Dating_dtmat, k = 5, control = list(seed = 123))

Top 5 terms per topic

Dating_lda5_betas <- broom::tidy(Dating_lda5)


top_terms_in_topics <- Dating_lda5_betas %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms_in_topics
## # A tibble: 25 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 app     0.0794
##  2     1 time    0.0530
##  3     1 get     0.0412
##  4     1 just    0.0318
##  5     1 account 0.0258
##  6     2 just    0.0391
##  7     2 like    0.0369
##  8     2 people  0.0367
##  9     2 app     0.0320
## 10     2 get     0.0252
## # ... with 15 more rows
top_terms_in_topics %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

test <- subset(Dating_df)  # copy of the full data set (no filtering condition)
nrow(test)
## [1] 1000

Best number of topics

train_Dating_dtmat <- corpus_subset(Dating_corp)[1:600,]%>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(c(stopwords("en"), myStopWords)) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to="topicmodels")

test_Dating_dtmat <- corpus_subset(Dating_corp)[601:1000,] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(c(stopwords("en"), myStopWords)) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to="topicmodels")

train_Dating_lda5 <- LDA(train_Dating_dtmat, k = 5, control = list(seed = 123))
perplexity(train_Dating_lda5, test_Dating_dtmat)
## [1] 42.69726
n_topics_vec = 2:5
perplexity_vec = map_dbl(n_topics_vec, function(kk) {
  message(kk)
  train_Dating_ldaK <- LDA(train_Dating_dtmat, k = kk, control = list(seed = 123))
  perp = perplexity(train_Dating_ldaK, test_Dating_dtmat)
})
lda_perplexity_result = tibble(
  n_topics = n_topics_vec, perplexity = perplexity_vec
)
plot(lda_perplexity_result, type="l")

Observation: Based on the perplexity values above, 2 topics would be the best number.

LDA Tuning

library(ldatuning)
lda_ldatuning_result <- FindTopicsNumber(
  Dating_dtmat, topics = n_topics_vec,
  metrics = c("CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "VEM", control = list(seed = 123), mc.cores = 4L, verbose = TRUE
)
## fit models... done.
## calculate metrics:
##   CaoJuan2009... done.
##   Arun2010... done.
##   Deveaud2014... done.
FindTopicsNumber_plot(lda_ldatuning_result)

Observation: Based on these metrics, 5 would be the best number of topics.

Dating_lda3 <- LDA(Dating_dtmat, k = 3, control = list(seed = 123))
topicdoc_result = topic_diagnostics(Dating_lda3, Dating_dtmat)
# Topic-level diagnostics for the fitted 3-topic model:
head(topicdoc_result, 5) %>%
  kbl() %>%
  kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
topic_num  topic_size  mean_token_length  dist_from_corpus  tf_df_dist  doc_prominence  topic_coherence  topic_exclusivity
        1    55.42932                4.3         0.2238619    2.177076             890        -162.1257           8.381046
        2    63.20371                4.6         0.2237125    2.122344             890        -121.3646           7.287485
        3    61.36697                4.4         0.2177195    2.437037             890        -186.7830           8.279880

STM

library(stm)
stm_Datingdfmat <- quanteda::convert(Dating_dfm, to = "stm")
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## 11, 25, 34, 46, 47, 56, 84, 89, 94, 114, 125, 127, 129, 143, 150, 164, 169, 176,
## 181, 187, 203, 206, 211, 220, 225, 241, 253, 255, 258, 271, 274, 276, 281, 300,
## 308, 317, 331, 353, 354, 361, 363, 364, 379, 428, 448, 450, 460, 465, 477, 500,
## 527, 532, 538, 541, 542, 547, 556, 560, 569, 590, 597, 604, 611, 625, 631, 643,
## 645, 658, 675, 696, 698, 701, 709, 711, 717, 720, 724, 749, 754, 763, 789, 794,
## 800, 805, 808, 810, 811, 820, 828, 850, 853, 864, 866, 869, 881, 886, 906, 917,
## 930, 933, 936, 939, 942, 944, 953, 962, 965, 979, 983, 999
out <- prepDocuments( stm_Datingdfmat$documents, 
                      stm_Datingdfmat$vocab, 
                      stm_Datingdfmat$meta)


Dating_tmob_stm <- stm(out$documents, out$vocab,K=5,
                         seed=123,emtol=1e-3, max.em.its=150)
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      .....
##   Recovering initialization...
##      .
## Initialization complete.
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -5.122) 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -4.961, relative change = 3.126e-02) 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -4.893, relative change = 1.371e-02) 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -4.857, relative change = 7.401e-03) 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -4.836, relative change = 4.391e-03) 
## Topic 1: people, like, great, matches, lot 
##  Topic 2: banned, time, got, match, subscription 
##  Topic 3: tinder, get, now, profile, bad 
##  Topic 4: just, ðÿ, money, even, reason 
##  Topic 5: app, good, account, pay, never 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -4.822, relative change = 2.803e-03) 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -4.813, relative change = 1.889e-03) 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -4.807, relative change = 1.326e-03) 
## ...............................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged
toLDAvis(mod=Dating_tmob_stm, docs=out$documents)



plot(Dating_tmob_stm, type="summary", n=5)

Topic Quality

topicQuality(Dating_tmob_stm, out$documents)
## [1] -202.0875 -151.3701 -127.6038 -158.5214 -161.8727
## [1] 9.654707 9.113728 8.991798 8.948663 9.249885

Semantic coherence measures how often a topic's top words actually co-occur in the same documents; more coherent topics tend to be easier to interpret.

Exclusivity measures how distinctive a topic's top words are to that topic, i.e. how little they overlap with other topics' top words.

Topic 3 has the highest semantic coherence, while Topic 1 has the highest exclusivity.
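
As a small optional sketch, the two measures behind topicQuality() can also be tabulated per topic with stm's semanticCoherence() and exclusivity(), applied to the model and documents already created above:

# Sketch: tabulate semantic coherence and exclusivity per topic for the STM fit.
# semanticCoherence() needs the documents; exclusivity() works on the model alone
# (no content covariates were used here).
coherence_k   <- semanticCoherence(Dating_tmob_stm, out$documents)
exclusivity_k <- exclusivity(Dating_tmob_stm)

tibble(topic = seq_along(coherence_k),
       semantic_coherence = coherence_k,
       exclusivity = exclusivity_k)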

4 Sets of Keywords

library(keyATM)
keyATM_docs <- keyATM_read(texts = Dating_dfm)
## Warning in get_doc_index(W_raw, check = TRUE): Number of documents with 0 length: 110
## This may cause invalid covariates or time index.
## Please review the preprocessing steps.
## Document index to check: 12, 26, 35, 47, 48, 57, 85, 90, 95, 115, 126, 128, 130, 144, 151, 165, 170, 177, 182, 188, 204, 207, 212, 221, 226, 242, 254, 256, 259, 272, 275, 277, 282, 301, 309, 318, 332, 354, 355, 362, 364, 365, 380, 429, 449, 451, 461, 466, 478, 501, 528, 533, 539, 542, 543, 548, 557, 561, 570, 591, 598, 605, 612, 626, 632, 644, 646, 659, 676, 697, 699, 702, 710, 712, 718, 721, 725, 750, 755, 764, 790, 795, 801, 806, 809, 811, 812, 821, 829, 851, 854, 865, 867, 870, 882, 887, 907, 918, 931, 934, 937, 940, 943, 945, 954, 963, 966, 980, 984, 1000
summary(keyATM_docs)
## keyATM_docs object of: 1000 documents.
## Length of documents:
##   Avg: 5.211
##   Min: 0
##   Max: 27
##    SD: 5.578
## Number of unique words: 1728
Dating_key_list = list(
  good = c("match", "reason", "amazing", "good", "amazing"),
  bad = c("banned", "fake", "terrible", "sucks", "useless"),
  timestamp = c("hours","times","every","work","subscription"),
  profile =c("tinder","profile","women","experience","men","erotic")
)

Top 5 keywords in each topic

Dating_key_viz <- visualize_keywords(docs = keyATM_docs, keywords = Dating_key_list)
## Warning in check_keywords(unique(unlisted), keywords, prune): A keyword will be
## pruned because it does not appear in documents: erotic
Dating_key_viz

Dating_tmod_keyatm_base <- keyATM(
  docs = keyATM_docs, 
  no_keyword_topics = 3, 
  keywords = Dating_key_list, 
  model = "base", 
  options = list(seed = 123))
## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.
## Warning in check_keywords(info$wd_names, keywords, options$prune): A keyword
## will be pruned because it does not appear in documents: erotic
top_words(Dating_tmod_keyatm_base, 5)
##        1_good      2_bad      3_timestamp  4_profile
## 1    good [✓]        app subscription [✓]        app
## 2  reason [✓] banned [✓]        every [✓] tinder [✓]
## 3         app    account        hours [✓]     people
## 4  banned [2]     people         work [✓]        pay
## 5          ðÿ       just             gold       just
##       Other_1 Other_2  Other_3
## 1       trash     app      app
## 2         app      ðÿ     nice
## 3        find    best    great
## 4  tinder [4]   worst good [1]
## 5        much  dating      use
kable(top_words(Dating_tmod_keyatm_base, 5), caption = "Top 5 keywords")
Top 5 keywords

1_good      2_bad       3_timestamp       4_profile   Other_1     Other_2  Other_3
good [✓]    app         subscription [✓]  app         trash       app      app
reason [✓]  banned [✓]  every [✓]         tinder [✓]  app         ðÿ       nice
app         account     hours [✓]         people      find        best     great
banned [2]  people      work [✓]          pay         tinder [4]  worst    good [1]
ðÿ          just        gold              just        much        dating   use
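
In the top_words() output, a ✓ marks a term that was supplied as a keyword for that topic, and a bracketed number marks a term that is a keyword of another keyword topic (for example, banned [2] under 1_good is a keyword of topic 2_bad).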