Data Set The data covers 2017-2022 and was scraped from the Google Play Store using googleplayscraper. The scraped data contained more columns than are shown here; the extra columns were unnecessary and were dropped.
knitr::opts_chunk$set(message = FALSE)
library(readtext)
library(tidyverse)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(textrank)
library(udpipe)
library(dplyr)
library(topicmodels)
library(tidytext)
library(topicdoc)
library(LDAvis)
library(broom)
library(ldatuning)
library(stm)
library(seededlda)
library(keyATM)
library(kableExtra)
Dating_df <- readtext::readtext("Assignment2.csv",text_field = "Review", docid_field = "ID")
glimpse(Dating_df)
## Rows: 1,000
## Columns: 7
## $ doc_id <chr> "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11~
## $ text <chr> "On this app i cant find a partner", "Tinder would be so mu~
## $ Name <chr> "linah sibanda", "Norman Johnson", "David Hume", "Last 1 St~
## $ Rating <int> 5, 3, 1, 2, 5, 5, 5, 1, 5, 3, 2, 5, 1, 1, 1, 5, 1, 5, 1, 5,~
## $ X.ThumbsUp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Date.Time <chr> "18-02-2022 01:19", "18-02-2022 01:16", "18-02-2022 01:11",~
## $ App <chr> "Tinder", "Tinder", "Tinder", "Tinder", "Tinder", "Tinder",~
Dating_corp <- corpus(Dating_df)
summary(Dating_corp, n = 5)
## Corpus consisting of 1000 documents, showing 5 documents:
##
## Text Types Tokens Sentences Name Rating X.ThumbsUp
## 0 8 8 1 linah sibanda 5 0
## 1 22 23 1 Norman Johnson 3 0
## 2 61 72 7 David Hume 1 0
## 3 39 45 1 Last 1 Standing 2 0
## 4 3 3 1 Arthur Magamedov 5 0
## Date.Time App
## 18-02-2022 01:19 Tinder
## 18-02-2022 01:16 Tinder
## 18-02-2022 01:11 Tinder
## 18-02-2022 01:11 Tinder
## 18-02-2022 01:06 Tinder
Dating_toks <- tokens(
Dating_corp,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
remove_url = TRUE,
split_hyphens = FALSE)
myStopWords = c("Where", "when", "shall", "include", "including", "by",
                "includes", "included", "may", "uses", "using", "used",
                "also", "can", "whether", "so", "however", "rather", "â", "s",
                "said", "one", "two", "three", "k")
Dating_toks2 <- tokens_remove(
Dating_toks, pattern = c(stopwords("en"), myStopWords))
Dating_dfm <- dfm(Dating_toks2)
Dating_dfm
## Document-feature matrix of: 1,000 documents, 2,765 features (99.65% sparse) and 5 docvars.
## features
## docs app cant find partner tinder much better specify race looking
## 0 1 1 1 1 0 0 0 0 0 0
## 1 0 0 0 0 1 1 1 1 1 1
## 2 1 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 994 more documents, reached max_nfeat ... 2,755 more features ]
Dating_dfm <- dfm(Dating_toks2, tolower = TRUE) %>%
dfm_trim(min_termfreq = 3, min_docfreq = 10)
Dating_dfm
## Document-feature matrix of: 1,000 documents, 180 features (97.39% sparse) and 5 docvars.
## features
## docs app find tinder much better looking swiping still matches messages
## 0 1 1 0 0 0 0 0 0 0 0
## 1 0 0 1 1 1 1 1 0 0 0
## 2 1 0 0 0 0 0 0 1 1 1
## 3 0 0 1 0 0 1 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 994 more documents, reached max_nfeat ... 170 more features ]
ndoc(Dating_dfm)
## [1] 1000
nfeat(Dating_dfm)
## [1] 180
topfeatures(Dating_dfm, 30)
## app tinder banned people just get
## 385 148 136 136 114 113
## good account money ðÿ time like
## 105 101 95 93 89 88
## even pay matches got reason fake
## 80 67 66 63 62 56
## match now never use many waste
## 54 53 51 49 49 48
## profile want likes subscription see profiles
## 43 42 42 42 41 40
keyword1 <- kwic(Dating_toks2, pattern = phrase("learn*"), window = 2)
head(keyword1, 5)
## Keyword-in-context with 1 match.
## [723, 6] ladies man | learned | lot socially
keyword2 <- kwic(Dating_toks2, pattern = phrase("content*"), window = 2)
head(keyword2, 5)
## Keyword-in-context with 3 matches.
## [484, 13] post inappropriate | content | Stupid app
## [779, 10] buying premium | content | Waste time
## [792, 44] profile offensive | content | amicable convos
keyword3 <- kwic(Dating_toks2, pattern = phrase("course*"), window = 2)
head(keyword3, 5)
## Keyword-in-context with 5 matches.
## [110, 22] months total | course | order know
## [138, 35] except really | course | never reveal
## [214, 18] log email | course | lets send
## [435, 26] called scam | course | possible communication
## [912, 13] business model | course |
head(keyword1, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| docname | from | to | pre | keyword | post | pattern |
|---|---|---|---|---|---|---|
| 723 | 6 | 6 | ladies man | learned | lot socially | learn* |
head(keyword2, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| docname | from | to | pre | keyword | post | pattern |
|---|---|---|---|---|---|---|
| 484 | 13 | 13 | post inappropriate | content | Stupid app | content* |
| 779 | 10 | 10 | buying premium | content | Waste time | content* |
| 792 | 44 | 44 | profile offensive | content | amicable convos | content* |
head(keyword3, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| docname | from | to | pre | keyword | post | pattern |
|---|---|---|---|---|---|---|
| 110 | 22 | 22 | months total | course | order know | course* |
| 138 | 35 | 35 | except really | course | never reveal | course* |
| 214 | 18 | 18 | log email | course | lets send | course* |
| 435 | 26 | 26 | called scam | course | possible communication | course* |
| 912 | 13 | 13 | business model | course | | course* |
Dating_dfm_small <- dfm_trim(Dating_dfm, min_termfreq = 100)
nfeat(Dating_dfm_small)
## [1] 8
Dating_fcm <- fcm(Dating_dfm_small)
feat <- names(topfeatures(Dating_fcm, 30))
fcmat_select <- fcm_select(Dating_fcm, pattern = feat, selection = "keep")
size <- log(colSums(dfm_select(Dating_fcm, feat, selection = "keep")))
set.seed(123)
textplot_network(fcmat_select, min_freq = 0.5, edge_size = 2, edge_color = "red",
vertex_size = size/max(size)*3)
Dating_dtmat = quanteda::convert(Dating_dfm, to="topicmodels")
Dating_lda5 <- LDA(Dating_dtmat, k = 5, control = list(seed = 123))
Dating_lda5_betas <- broom::tidy(Dating_lda5)
top_terms_in_topics <- Dating_lda5_betas %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_in_topics
## # A tibble: 25 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 app 0.0794
## 2 1 time 0.0530
## 3 1 get 0.0412
## 4 1 just 0.0318
## 5 1 account 0.0258
## 6 2 just 0.0391
## 7 2 like 0.0369
## 8 2 people 0.0367
## 9 2 app 0.0320
## 10 2 get 0.0252
## # ... with 15 more rows
top_terms_in_topics %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
# total number of reviews before the train/test split
nrow(Dating_df)
## [1] 1000
# first 600 reviews form the training set
train_Dating_dtmat <- Dating_corp[1:600] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(c(stopwords("en"), myStopWords)) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to = "topicmodels")
# remaining 400 reviews form the held-out test set
test_Dating_dtmat <- Dating_corp[601:1000] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(c(stopwords("en"), myStopWords)) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to = "topicmodels")
train_Dating_lda5 <- LDA(train_Dating_dtmat, k = 5, control = list(seed = 123))
perplexity(train_Dating_lda5, test_Dating_dtmat)
## [1] 42.69726
n_topics_vec = 2:5
perplexity_vec = map_dbl(n_topics_vec, function(kk) {
  message(kk)
  # fit on the training set, evaluate on the held-out test set
  train_Dating_ldaK <- LDA(train_Dating_dtmat, k = kk, control = list(seed = 123))
  perplexity(train_Dating_ldaK, test_Dating_dtmat)
})
lda_perplexity_result = tibble(
n_topics = n_topics_vec, perplexity = perplexity_vec
)
plot(lda_perplexity_result, type="l")
Observation: based on the held-out perplexity above, 2 topics gives the lowest perplexity and would be the best number by this measure.
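The best k can also be read off the result table directly; a minimal sketch using the lda_perplexity_result tibble built above:
# the k with the lowest held-out perplexity predicts unseen reviews best
best_k <- lda_perplexity_result$n_topics[which.min(lda_perplexity_result$perplexity)]
best_k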
library(ldatuning)
lda_ldatuning_result <- FindTopicsNumber(
Dating_dtmat, topics = n_topics_vec,
metrics = c("CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "VEM", control = list(seed = 123), mc.cores = 4L, verbose = TRUE
)
## fit models... done.
## calculate metrics:
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
FindTopicsNumber_plot(lda_ldatuning_result)
Observation: according to these metrics, 5 would be the best number of topics.
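For reference, the raw metric values behind the plot can also be printed as a table; note that CaoJuan2009 and Arun2010 are to be minimised while Deveaud2014 is to be maximised:
# raw ldatuning metrics per candidate k (minimise CaoJuan2009/Arun2010, maximise Deveaud2014)
lda_ldatuning_result %>%
  arrange(topics)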
Dating_lda3 <- LDA(Dating_dtmat, k = 3, control = list(seed = 123))
topicdoc_result = topic_diagnostics(Dating_lda3, Dating_dtmat)
# Below are the diagnostics for the chosen model (k = 3):
head(topicdoc_result, 5) %>%
kbl() %>%
kable_classic(bootstrap_options = "striped", full_width = F, position = "left")
| topic_num | topic_size | mean_token_length | dist_from_corpus | tf_df_dist | doc_prominence | topic_coherence | topic_exclusivity |
|---|---|---|---|---|---|---|---|
| 1 | 55.42932 | 4.3 | 0.2238619 | 2.177076 | 890 | -162.1257 | 8.381046 |
| 2 | 63.20371 | 4.6 | 0.2237125 | 2.122344 | 890 | -121.3646 | 7.287485 |
| 3 | 61.36697 | 4.4 | 0.2177195 | 2.437037 | 890 | -186.7830 | 8.279880 |
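As a complement to these diagnostics, the per-document topic proportions (gamma) of the k = 3 model can be tidied in the same way as the betas earlier; this is a sketch only, using objects already defined above:
# per-document topic proportions (gamma) for the k = 3 model
Dating_lda3_gammas <- broom::tidy(Dating_lda3, matrix = "gamma")
# the reviews that load most strongly on each topic
Dating_lda3_gammas %>%
  group_by(topic) %>%
  slice_max(gamma, n = 3) %>%
  ungroup()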
library(stm)
stm_Datingdfmat <- quanteda::convert(Dating_dfm, to = "stm")
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## 11, 25, 34, 46, 47, 56, 84, 89, 94, 114, 125, 127, 129, 143, 150, 164, 169, 176,
## 181, 187, 203, 206, 211, 220, 225, 241, 253, 255, 258, 271, 274, 276, 281, 300,
## 308, 317, 331, 353, 354, 361, 363, 364, 379, 428, 448, 450, 460, 465, 477, 500,
## 527, 532, 538, 541, 542, 547, 556, 560, 569, 590, 597, 604, 611, 625, 631, 643,
## 645, 658, 675, 696, 698, 701, 709, 711, 717, 720, 724, 749, 754, 763, 789, 794,
## 800, 805, 808, 810, 811, 820, 828, 850, 853, 864, 866, 869, 881, 886, 906, 917,
## 930, 933, 936, 939, 942, 944, 953, 962, 965, 979, 983, 999
out <- prepDocuments( stm_Datingdfmat$documents,
stm_Datingdfmat$vocab,
stm_Datingdfmat$meta)
Dating_tmob_stm <- stm(out$documents, out$vocab,K=5,
seed=123,emtol=1e-3, max.em.its=150)
## Beginning Spectral Initialization
## Calculating the gram matrix...
## Finding anchor words...
## .....
## Recovering initialization...
## .
## Initialization complete.
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 1 (approx. per word bound = -5.122)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 2 (approx. per word bound = -4.961, relative change = 3.126e-02)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 3 (approx. per word bound = -4.893, relative change = 1.371e-02)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 4 (approx. per word bound = -4.857, relative change = 7.401e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 5 (approx. per word bound = -4.836, relative change = 4.391e-03)
## Topic 1: people, like, great, matches, lot
## Topic 2: banned, time, got, match, subscription
## Topic 3: tinder, get, now, profile, bad
## Topic 4: just, ðÿ, money, even, reason
## Topic 5: app, good, account, pay, never
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 6 (approx. per word bound = -4.822, relative change = 2.803e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 7 (approx. per word bound = -4.813, relative change = 1.889e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 8 (approx. per word bound = -4.807, relative change = 1.326e-03)
## ...............................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Model Converged
toLDAvis(mod=Dating_tmob_stm, docs=out$documents)
plot(Dating_tmob_stm, type="summary", n=5)
topicQuality(Dating_tmob_stm, out$documents)
## [1] -202.0875 -151.3701 -127.6038 -158.5214 -161.8727
## [1] 9.654707 9.113728 8.991798 8.948663 9.249885
Semantic coherence measures how strongly a topic's top words co-occur in the same documents, which makes the topic easier to understand and interpret.
Exclusivity measures how distinctive a topic's top words are to that topic relative to the other topics.
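The two measures can also be tabulated per topic rather than read off the topicQuality plot; a minimal sketch, assuming stm's semanticCoherence() and exclusivity() helpers:
# per-topic semantic coherence and exclusivity for the 5-topic stm model
coh <- semanticCoherence(Dating_tmob_stm, out$documents)
exc <- exclusivity(Dating_tmob_stm)
tibble(topic = 1:5, semantic_coherence = coh, exclusivity = exc)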
Topic 3 has the highest semantic coherence, while Topic 1 has the highest exclusivity.
Four sets of keywords
library(keyATM)
keyATM_docs <- keyATM_read(texts = Dating_dfm)
## Warning in get_doc_index(W_raw, check = TRUE): Number of documents with 0 length: 110
## This may cause invalid covariates or time index.
## Please review the preprocessing steps.
## Document index to check: 12, 26, 35, 47, 48, 57, 85, 90, 95, 115, 126, 128, 130, 144, 151, 165, 170, 177, 182, 188, 204, 207, 212, 221, 226, 242, 254, 256, 259, 272, 275, 277, 282, 301, 309, 318, 332, 354, 355, 362, 364, 365, 380, 429, 449, 451, 461, 466, 478, 501, 528, 533, 539, 542, 543, 548, 557, 561, 570, 591, 598, 605, 612, 626, 632, 644, 646, 659, 676, 697, 699, 702, 710, 712, 718, 721, 725, 750, 755, 764, 790, 795, 801, 806, 809, 811, 812, 821, 829, 851, 854, 865, 867, 870, 882, 887, 907, 918, 931, 934, 937, 940, 943, 945, 954, 963, 966, 980, 984, 1000
summary(keyATM_docs)
## keyATM_docs object of: 1000 documents.
## Length of documents:
## Avg: 5.211
## Min: 0
## Max: 27
## SD: 5.578
## Number of unique words: 1728
Dating_key_list = list(
  good = c("match", "reason", "amazing", "good"),
  bad = c("banned", "fake", "terrible", "sucks", "useless"),
  timestamp = c("hours", "times", "every", "work", "subscription"),
  profile = c("tinder", "profile", "women", "experience", "men", "erotic")
)
Dating_key_viz <- visualize_keywords(docs = keyATM_docs, keywords = Dating_key_list)
## Warning in check_keywords(unique(unlisted), keywords, prune): A keyword will be
## pruned because it does not appear in documents: erotic
Dating_key_viz
Dating_tmod_keyatm_base <- keyATM(
docs = keyATM_docs,
no_keyword_topics = 3,
keywords = Dating_key_list,
model = "base",
options = list(seed = 123))
## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.
## Warning in check_keywords(info$wd_names, keywords, options$prune): A keyword
## will be pruned because it does not appear in documents: erotic
top_words(Dating_tmod_keyatm_base, 5)
## 1_good 2_bad 3_timestamp 4_profile
## 1 good [✓] app subscription [✓] app
## 2 reason [✓] banned [✓] every [✓] tinder [✓]
## 3 app account hours [✓] people
## 4 banned [2] people work [✓] pay
## 5 ðÿ just gold just
## Other_1 Other_2 Other_3
## 1 trash app app
## 2 app ðÿ nice
## 3 find best great
## 4 tinder [4] worst good [1]
## 5 much dating use
kable(top_words(Dating_tmod_keyatm_base, 5), caption = "Top 5 keywords")
| 1_good | 2_bad | 3_timestamp | 4_profile | Other_1 | Other_2 | Other_3 |
|---|---|---|---|---|---|---|
| good [✓] | app | subscription [✓] | app | trash | app | app |
| reason [✓] | banned [✓] | every [✓] | tinder [✓] | app | ðÿ | nice |
| app | account | hours [✓] | people | find | best | great |
| banned [2] | people | work [✓] | pay | tinder [4] | worst | good [1] |
| ðÿ | just | gold | just | much | dating | use |
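A possible next step, sketched here with keyATM's top_docs() helper, is to check which reviews are most strongly associated with each keyword topic:
# indices of the documents with the highest estimated proportion for each topic
top_docs(Dating_tmod_keyatm_base, n = 3)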