Data Set The data covers 2017-2022 and was scraped from the Google Play Store using googleplayscraper. The scraped data contained more columns than are shown here; the extra columns were unnecessary and were dropped.
knitr::opts_chunk$set(message = FALSE)
library(readtext)
library(tidyverse)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(textrank)
library(udpipe)
library(dplyr)
library(topicmodels)
library(tidytext)
library(topicdoc)
library(LDAvis)
library(broom)
library(ldatuning)
library(stm)
library(seededlda)
library(keyATM)
library(kableExtra)
Dating_df <- readtext::readtext("Assignment2.csv",text_field = "Review", docid_field = "ID")
glimpse(Dating_df)
## Rows: 1,000
## Columns: 7
## $ doc_id <chr> "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11~
## $ text <chr> "On this app i cant find a partner", "Tinder would be so mu~
## $ Name <chr> "linah sibanda", "Norman Johnson", "David Hume", "Last 1 St~
## $ Rating <int> 5, 3, 1, 2, 5, 5, 5, 1, 5, 3, 2, 5, 1, 1, 1, 5, 1, 5, 1, 5,~
## $ X.ThumbsUp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Date.Time <chr> "18-02-2022 01:19", "18-02-2022 01:16", "18-02-2022 01:11",~
## $ App <chr> "Tinder", "Tinder", "Tinder", "Tinder", "Tinder", "Tinder",~
Dating_corp <- corpus(Dating_df)
summary(Dating_corp, n = 5)
## Corpus consisting of 1000 documents, showing 5 documents:
##
## Text Types Tokens Sentences Name Rating X.ThumbsUp
## 0 8 8 1 linah sibanda 5 0
## 1 22 23 1 Norman Johnson 3 0
## 2 61 72 7 David Hume 1 0
## 3 39 45 1 Last 1 Standing 2 0
## 4 3 3 1 Arthur Magamedov 5 0
## Date.Time App
## 18-02-2022 01:19 Tinder
## 18-02-2022 01:16 Tinder
## 18-02-2022 01:11 Tinder
## 18-02-2022 01:11 Tinder
## 18-02-2022 01:06 Tinder
Dating_toks <- tokens(
Dating_corp,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
remove_url = TRUE,
split_hyphens = FALSE)
myStopWords = c("Where", "when", "shall", "include", "including", "by",
                "includes", "included", "may", "uses", "using", "used",
                "also", "can", "whether", "so", "however", "rather", "â", "s",
                "said", "one", "two", "three", "k")
Dating_toks2 <- tokens_remove(
Dating_toks, pattern = c(stopwords("en"), myStopWords))
Dating_dfm <- dfm(Dating_toks2)
Dating_dfm
## Document-feature matrix of: 1,000 documents, 2,765 features (99.65% sparse) and 5 docvars.
## features
## docs app cant find partner tinder much better specify race looking
## 0 1 1 1 1 0 0 0 0 0 0
## 1 0 0 0 0 1 1 1 1 1 1
## 2 1 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 994 more documents, reached max_nfeat ... 2,755 more features ]
Dating_dfm <- dfm(Dating_toks2, tolower = TRUE) %>%
dfm_trim(min_termfreq = 3, min_docfreq = 10)
Dating_dfm
## Document-feature matrix of: 1,000 documents, 180 features (97.39% sparse) and 5 docvars.
## features
## docs app find tinder much better looking swiping still matches messages
## 0 1 1 0 0 0 0 0 0 0 0
## 1 0 0 1 1 1 1 1 0 0 0
## 2 1 0 0 0 0 0 0 1 1 1
## 3 0 0 1 0 0 1 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 994 more documents, reached max_nfeat ... 170 more features ]
ndoc(Dating_dfm)
## [1] 1000
nfeat(Dating_dfm)
## [1] 180
topfeatures(Dating_dfm, 30)
## app tinder banned people just get
## 385 148 136 136 114 113
## good account money ðÿ time like
## 105 101 95 93 89 88
## even pay matches got reason fake
## 80 67 66 63 62 56
## match now never use many waste
## 54 53 51 49 49 48
## profile want likes subscription see profiles
## 43 42 42 42 41 40
keyword1 <- kwic(Dating_toks2, pattern = phrase("learn*"), window = 2)
head(keyword1, 5)
## Keyword-in-context with 1 match.
## [723, 6] ladies man | learned | lot socially
keyword2 <- kwic(Dating_toks2, pattern = phrase("content*"), window = 2)
head(keyword2, 5)
## Keyword-in-context with 3 matches.
## [484, 13] post inappropriate | content | Stupid app
## [779, 10] buying premium | content | Waste time
## [792, 44] profile offensive | content | amicable convos
keyword3 <- kwic(Dating_toks2, pattern = phrase("course*"), window = 2)
head(keyword3, 5)
## Keyword-in-context with 5 matches.
## [110, 22] months total | course | order know
## [138, 35] except really | course | never reveal
## [214, 18] log email | course | lets send
## [435, 26] called scam | course | possible communication
## [912, 13] business model | course |
head(keyword1, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| docname | from | to | pre | keyword | post | pattern |
|---|---|---|---|---|---|---|
| 723 | 6 | 6 | ladies man | learned | lot socially | learn* |
head(keyword2, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| docname | from | to | pre | keyword | post | pattern |
|---|---|---|---|---|---|---|
| 484 | 13 | 13 | post inappropriate | content | Stupid app | content* |
| 779 | 10 | 10 | buying premium | content | Waste time | content* |
| 792 | 44 | 44 | profile offensive | content | amicable convos | content* |
head(keyword3, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| docname | from | to | pre | keyword | post | pattern |
|---|---|---|---|---|---|---|
| 110 | 22 | 22 | months total | course | order know | course* |
| 138 | 35 | 35 | except really | course | never reveal | course* |
| 214 | 18 | 18 | log email | course | lets send | course* |
| 435 | 26 | 26 | called scam | course | possible communication | course* |
| 912 | 13 | 13 | business model | course | | course* |
Dating_dfm_small <- dfm_trim(Dating_dfm, min_termfreq = 100)
nfeat(Dating_dfm_small)
## [1] 8
Dating_fcm <- fcm(Dating_dfm_small)
feat <- names(topfeatures(Dating_fcm, 30))
fcmat_select <- fcm_select(Dating_fcm, pattern = feat, selection = "keep")
size <- log(colSums(dfm_select(Dating_fcm, feat, selection = "keep")))
set.seed(123)
textplot_network(fcmat_select, min_freq = 0.5, edge_size = 2, edge_color = "red",
vertex_size = size/max(size)*3)
Dating_dtmat = quanteda::convert(Dating_dfm, to="topicmodels")
Dating_lda5 <- LDA(Dating_dtmat, k = 5, control = list(seed = 123))
Dating_lda5_betas <- broom::tidy(Dating_lda5)
top_terms_in_topics <- Dating_lda5_betas %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_in_topics
## # A tibble: 25 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 app 0.0794
## 2 1 time 0.0530
## 3 1 get 0.0412
## 4 1 just 0.0318
## 5 1 account 0.0258
## 6 2 just 0.0391
## 7 2 like 0.0369
## 8 2 people 0.0367
## 9 2 app 0.0320
## 10 2 get 0.0252
## # ... with 15 more rows
top_terms_in_topics %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
# total number of reviews before the train/test split
nrow(Dating_df)
## [1] 1000
# first 600 reviews form the training set
train_Dating_dtmat <- Dating_corp[1:600] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(c(stopwords("en"), myStopWords)) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to = "topicmodels")
# remaining 400 reviews form the held-out test set
test_Dating_dtmat <- Dating_corp[601:1000] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(c(stopwords("en"), myStopWords)) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to = "topicmodels")
train_Dating_lda5 <- LDA(train_Dating_dtmat, k = 5, control = list(seed = 123))
perplexity(train_Dating_lda5, test_Dating_dtmat)
## [1] 42.69726
n_topics_vec = 2:5
perplexity_vec = map_dbl(n_topics_vec, function(kk) {
  message(kk)
  # fit on the training set, evaluate on the held-out test set
  train_Dating_ldaK <- LDA(train_Dating_dtmat, k = kk, control = list(seed = 123))
  perplexity(train_Dating_ldaK, test_Dating_dtmat)
})
lda_perplexity_result = tibble(
n_topics = n_topics_vec, perplexity = perplexity_vec
)
plot(lda_perplexity_result, type="l")
Observation: based on the held-out perplexity above, 2 topics gives the lowest perplexity and would be the best number by this measure.
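The best k can also be read off the result table directly; a minimal sketch using the lda_perplexity_result tibble built above:
# the k with the lowest held-out perplexity predicts unseen reviews best
best_k <- lda_perplexity_result$n_topics[which.min(lda_perplexity_result$perplexity)]
best_k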
library(ldatuning)
lda_ldatuning_result <- FindTopicsNumber(
Dating_dtmat, topics = n_topics_vec,
metrics = c("CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "VEM", control = list(seed = 123), mc.cores = 4L, verbose = TRUE
)
## fit models... done.
## calculate metrics:
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
FindTopicsNumber_plot(lda_ldatuning_result)
Observation: according to these metrics, 5 would be the best number of topics.
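For reference, the raw metric values behind the plot can also be printed as a table; note that CaoJuan2009 and Arun2010 are to be minimised while Deveaud2014 is to be maximised:
# raw ldatuning metrics per candidate k (minimise CaoJuan2009/Arun2010, maximise Deveaud2014)
lda_ldatuning_result %>%
  arrange(topics)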
Dating_lda3 <- LDA(Dating_dtmat, k = 3, control = list(seed = 123))
topicdoc_result = topic_diagnostics(Dating_lda3, Dating_dtmat)
# Below are the diagnostics for the chosen model (k = 3):
head(topicdoc_result, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| topic_num | topic_size | mean_token_length | dist_from_corpus | tf_df_dist | doc_prominence | topic_coherence | topic_exclusivity |
|---|---|---|---|---|---|---|---|
| 1 | 55.42932 | 4.3 | 0.2238619 | 2.177076 | 890 | -162.1257 | 8.381046 |
| 2 | 63.20371 | 4.6 | 0.2237125 | 2.122344 | 890 | -121.3646 | 7.287485 |
| 3 | 61.36697 | 4.4 | 0.2177195 | 2.437037 | 890 | -186.7830 | 8.279880 |
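As a complement to these diagnostics, the per-document topic proportions (gamma) of the k = 3 model can be tidied in the same way as the betas earlier; this is a sketch only, using objects already defined above:
# per-document topic proportions (gamma) for the k = 3 model
Dating_lda3_gammas <- broom::tidy(Dating_lda3, matrix = "gamma")
# the reviews that load most strongly on each topic
Dating_lda3_gammas %>%
  group_by(topic) %>%
  slice_max(gamma, n = 3) %>%
  ungroup()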
library(stm)
stm_Datingdfmat <- quanteda::convert(Dating_dfm, to = "stm")
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## 11, 25, 34, 46, 47, 56, 84, 89, 94, 114, 125, 127, 129, 143, 150, 164, 169, 176,
## 181, 187, 203, 206, 211, 220, 225, 241, 253, 255, 258, 271, 274, 276, 281, 300,
## 308, 317, 331, 353, 354, 361, 363, 364, 379, 428, 448, 450, 460, 465, 477, 500,
## 527, 532, 538, 541, 542, 547, 556, 560, 569, 590, 597, 604, 611, 625, 631, 643,
## 645, 658, 675, 696, 698, 701, 709, 711, 717, 720, 724, 749, 754, 763, 789, 794,
## 800, 805, 808, 810, 811, 820, 828, 850, 853, 864, 866, 869, 881, 886, 906, 917,
## 930, 933, 936, 939, 942, 944, 953, 962, 965, 979, 983, 999
out <- prepDocuments( stm_Datingdfmat$documents,
stm_Datingdfmat$vocab,
stm_Datingdfmat$meta)
Dating_tmob_stm <- stm(out$documents, out$vocab,K=5,
seed=123,emtol=1e-3, max.em.its=150)
## Beginning Spectral Initialization
## Calculating the gram matrix...
## Finding anchor words...
## .....
## Recovering initialization...
## .
## Initialization complete.
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 1 (approx. per word bound = -5.122)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 2 (approx. per word bound = -4.961, relative change = 3.126e-02)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 3 (approx. per word bound = -4.893, relative change = 1.371e-02)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 4 (approx. per word bound = -4.857, relative change = 7.401e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 5 (approx. per word bound = -4.836, relative change = 4.391e-03)
## Topic 1: people, like, great, matches, lot
## Topic 2: banned, time, got, match, subscription
## Topic 3: tinder, get, now, profile, bad
## Topic 4: just, ðÿ, money, even, reason
## Topic 5: app, good, account, pay, never
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 6 (approx. per word bound = -4.822, relative change = 2.803e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 7 (approx. per word bound = -4.813, relative change = 1.889e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 8 (approx. per word bound = -4.807, relative change = 1.326e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Model Converged
toLDAvis(mod=Dating_tmob_stm, docs=out$documents)
plot(Dating_tmob_stm, type="summary", n=5)
topicQuality(Dating_tmob_stm, out$documents)
## [1] -202.0875 -151.3701 -127.6038 -158.5214 -161.8727
## [1] 9.654707 9.113728 8.991798 8.948663 9.249885
Semantic coherence measures how strongly a topic's top words co-occur in the same documents, which makes the topic easier to understand and interpret.
Exclusivity measures how distinctive a topic's top words are to that topic relative to the other topics.
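The two measures can also be tabulated per topic rather than read off the topicQuality plot; a minimal sketch, assuming stm's semanticCoherence() and exclusivity() helpers:
# per-topic semantic coherence and exclusivity for the 5-topic stm model
coh <- semanticCoherence(Dating_tmob_stm, out$documents)
exc <- exclusivity(Dating_tmob_stm)
tibble(topic = 1:5, semantic_coherence = coh, exclusivity = exc)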
Topic 3 has the highest semantic coherence, while Topic 1 has the highest exclusivity.
Four sets of keywords
library(keyATM)
keyATM_docs <- keyATM_read(texts = Dating_dfm)
## Warning in get_doc_index(W_raw, check = TRUE): Number of documents with 0 length: 110
## This may cause invalid covariates or time index.
## Please review the preprocessing steps.
## Document index to check: 12, 26, 35, 47, 48, 57, 85, 90, 95, 115, 126, 128, 130, 144, 151, 165, 170, 177, 182, 188, 204, 207, 212, 221, 226, 242, 254, 256, 259, 272, 275, 277, 282, 301, 309, 318, 332, 354, 355, 362, 364, 365, 380, 429, 449, 451, 461, 466, 478, 501, 528, 533, 539, 542, 543, 548, 557, 561, 570, 591, 598, 605, 612, 626, 632, 644, 646, 659, 676, 697, 699, 702, 710, 712, 718, 721, 725, 750, 755, 764, 790, 795, 801, 806, 809, 811, 812, 821, 829, 851, 854, 865, 867, 870, 882, 887, 907, 918, 931, 934, 937, 940, 943, 945, 954, 963, 966, 980, 984, 1000
summary(keyATM_docs)
## keyATM_docs object of: 1000 documents.
## Length of documents:
## Avg: 5.211
## Min: 0
## Max: 27
## SD: 5.578
## Number of unique words: 1728
Dating_key_list = list(
  good = c("match", "reason", "amazing", "good"),
  bad = c("banned", "fake", "terrible", "sucks", "useless"),
  timestamp = c("hours", "times", "every", "work", "subscription"),
  profile = c("tinder", "profile", "women", "experience", "men", "erotic")
)
Dating_key_viz <- visualize_keywords(docs = keyATM_docs, keywords = Dating_key_list)
## Warning in check_keywords(unique(unlisted), keywords, prune): A keyword will be
## pruned because it does not appear in documents: erotic
Dating_key_viz
Dating_tmod_keyatm_base <- keyATM(
docs = keyATM_docs,
no_keyword_topics = 3,
keywords = Dating_key_list,
model = "base",
options = list(seed = 123))
## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.
## Warning in check_keywords(info$wd_names, keywords, options$prune): A keyword
## will be pruned because it does not appear in documents: erotic
top_words(Dating_tmod_keyatm_base, 5)
## 1_good 2_bad 3_timestamp 4_profile
## 1 good [✓] app subscription [✓] app
## 2 reason [✓] banned [✓] every [✓] tinder [✓]
## 3 app account hours [✓] people
## 4 banned [2] people work [✓] pay
## 5 ðÿ just gold just
## Other_1 Other_2 Other_3
## 1 trash app app
## 2 app ðÿ nice
## 3 find best great
## 4 tinder [4] worst good [1]
## 5 much dating use
kable(top_words(Dating_tmod_keyatm_base, 5), caption = "Top 5 keywords")
| 1_good | 2_bad | 3_timestamp | 4_profile | Other_1 | Other_2 | Other_3 |
|---|---|---|---|---|---|---|
| good [✓] | app | subscription [✓] | app | trash | app | app |
| reason [✓] | banned [✓] | every [✓] | tinder [✓] | app | ðÿ | nice |
| app | account | hours [✓] | people | find | best | great |
| banned [2] | people | work [✓] | pay | tinder [4] | worst | good [1] |
| ðÿ | just | gold | just | much | dating | use |
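A possible next step, sketched here with keyATM's top_docs() helper, is to check which reviews are most strongly associated with each keyword topic:
# indices of the documents with the highest estimated proportion for each topic
top_docs(Dating_tmod_keyatm_base, n = 3)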