library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.5.2
## Package version: 1.4.3
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(readtext)
library(topicmodels)
library(ggplot2)
library(stm)
## stm v1.3.3 (2018-1-26) successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
library(ldatuning)
library(tidytext)
library(FactoMineR)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
dat_typisk <- readtext(paste0("typisknorsk.xlsx"), text_field = "r13polkom1")
corp_typisknorsk <- corpus(dat_typisk)
summary(corp_typisknorsk, 5)
## Corpus consisting of 1184 documents, showing 5 documents:
##
## Text Types Tokens Sentences responseid
## typisknorsk.xlsx.1 67 97 2 1014483
## typisknorsk.xlsx.2 40 54 1 1010981
## typisknorsk.xlsx.3 28 36 1 1006679
## typisknorsk.xlsx.4 3 4 1 1016981
## typisknorsk.xlsx.5 79 112 1 1000605
##
## Source: /Users/janfredrikhovden/Dropbox/DBXPAGAANDEARBEID/Statistikk/Rworkdir/Quanteda19/* on x86_64 by janfredrikhovden
## Created: Tue Apr 30 08:24:59 2019
## Notes:
«I politikken og i nyhetene har det vært skrevet og diskutert mye om norske verdier. Vi er interessert i dine meninger om hva norske verdier er. Hvilke verdier mener du er mest sentrale for norsk samfunn og kultur? Skriv stikkord»
quanteda_options("language_stemmer"="no")
# make tokens
toks <- tokens_remove(tokens(corp_typisknorsk, remove_punct = TRUE), stopwords("norwegian"))
toks <-tokens_tolower(toks)
toks <-tokens_wordstem(toks)
#check typical "double words"
head(textstat_collocations(toks, size=2, min_count=4),n=20)
## collocation count count_nested length lambda z
## 1 norsk verdi 64 0 2 3.788059 20.45894
## 2 ta var 35 0 2 7.375134 17.74182
## 3 demokrati ytringsfri 51 0 2 2.707475 14.93331
## 4 kristn verdi 24 0 2 4.235154 14.13438
## 5 små forskjell 16 0 2 6.705462 13.78525
## 6 stor grad 11 0 2 6.145125 12.78323
## 7 fatt rik 11 0 2 7.363672 12.66494
## 8 respekt andr 18 0 2 3.682377 12.57205
## 9 lik lov 18 0 2 3.632914 11.97806
## 10 tar var 10 0 2 5.280011 11.60510
## 11 norsk tradisjon 21 0 2 3.172710 11.46366
## 12 forskjell fatt 8 0 2 5.489899 11.23956
## 13 skol utdanning 10 0 2 4.386773 11.13432
## 14 krist kulturarv 7 0 2 6.065102 11.11691
## 15 andr mennesk 12 0 2 3.932722 11.10739
## 16 ytringsfri religionsfri 15 0 2 3.661843 10.84266
## 17 ytringsfri likestilling 27 0 2 2.416129 10.76193
## 18 lik mul 14 0 2 3.631714 10.66219
## 19 lik rett 17 0 2 3.080306 10.65899
## 20 demokrati likestilling 38 0 2 2.022624 10.59652
# compound typical "double words"
toks <- tokens_compound(toks, list(c("17","mai"), c("10", "bud"), c("norsk","verdi"), c("respekt", "andr"), c("kristn", "verdi"), c("små", "forskjell"), c("norsk", "tradisjon"), c("lik", "rett"), c("lik","mul"), c("norsk", "kultur"), c("ta","var"), c("krist", "kulturarv"), c("lik","lov"), c("tar","vare"), c("andr","mennesk"), c("gjør","plikt")))
# merge similar tokens
toks <- tokens_replace(toks, "ta_var", "tar_vare")
toks <- tokens_replace(toks, "små_forskjell", "lit forskjell")
# make dfm
dfm_typisk <- dfm(toks)
# remove additional stopwords (if any - just an example below)
#dfm_typisk <- dfm_remove(pattern=c("public", "pay"))
# vanlegaste ord
topfeatures(dfm_typisk, 50)
## demokrati likestilling ytringsfri likeverd åpen
## 366 230 201 141 119
## lik norsk frih tillit natur
## 115 112 106 101 94
## samfunn god respekt rettferd kultur
## 94 88 80 78 67
## norsk_verdi verdi solidarit land tolerans
## 64 63 62 61 57
## andr rett fellesskap trygg dugnad
## 56 55 53 50 50
## utdanning dugnadsånd språk ærl samhold
## 49 48 48 48 47
## folk tradisjon norg demokratisk skol
## 46 46 43 43 41
## religionsfri hverandr sosial inkludering inkluder
## 39 39 38 38 36
## tar_vare vårt mennesk må lov
## 35 34 34 34 33
## religion mer stor arbeid lit
## 32 31 31 30 30
# plot most common words
typisk_freqplot <- dfm_typisk %>%
textstat_frequency(n = 50) %>%
ggplot(aes(x = reorder(feature, frequency),
y = frequency)) +
geom_point() + ggtitle("Hvilke verdier mener du er mest sentrale for norsk samfunn og kultur?") +
coord_flip() + theme_minimal() +
labs(x = NULL, y = "Frequency")
typisk_freqplot
# wordcloud
set.seed(97)
textplot_wordcloud(dfm_typisk, min_count = 10, random_order = FALSE,
rotation = .25,
color = RColorBrewer::brewer.pal(8,"Dark2"))
# tokens in context
kw_demokra <- kwic(toks, pattern = 'demokra*')
kw_likestil <- kwic(toks, pattern = 'likestil*')
kw_vaare <- kwic(toks, pattern = 'våre*')
head(kw_demokra, n=10)
##
## [typisknorsk.xlsx.1, 2] liberal |
## [typisknorsk.xlsx.2, 2] funger |
## [typisknorsk.xlsx.7, 4] solidarit familieliv friluftsliv |
## [typisknorsk.xlsx.10, 2] velfunger |
## [typisknorsk.xlsx.11, 5] kultur identit selvbestemmelsesrett vårt |
## [typisknorsk.xlsx.16, 1] |
## [typisknorsk.xlsx.17, 1] |
## [typisknorsk.xlsx.18, 1] |
## [typisknorsk.xlsx.19, 1] |
## [typisknorsk.xlsx.20, 2] solidarit |
##
## demokrati | menneskerett uavheng frih størst mul
## demokrati | høy grad tillit innbyggern tolerans
## demokratisk | politikk økonomisk frih
## demokrati | likestilling sosialt ansv ytringsfri rett
## demokrati | hev sperregrens organiser parti 2.7
## demokrati | lik velferdstjenest uansett økonomi solidarit
## demokrati | religionsfri ytringsfri allemannsrett samhold
## demokrati | åpen respekt lojalit fellesskap felleskapsgod
## demokrati | behold vår tradisjon rund jul
## demokrati | åpen rettferd
head(kw_likestil, n=10)
##
## [typisknorsk.xlsx.10, 3] velfunger demokrati |
## [typisknorsk.xlsx.13, 2] ytringsfri |
## [typisknorsk.xlsx.16, 8] velferdstjenest uansett økonomi solidarit lik |
## [typisknorsk.xlsx.31, 8] hvert fall stikkord forbind norg |
## [typisknorsk.xlsx.34, 18] samfunn stor grad yttringsfri 4 |
## [typisknorsk.xlsx.42, 10] hudfarg funksjonsnivå mm lik fellesskap |
## [typisknorsk.xlsx.46, 2] rettferd |
## [typisknorsk.xlsx.48, 10] utdann hjelp lån stipend kvinn |
## [typisknorsk.xlsx.51, 9] arbeid dugnad tar_vare natur del |
## [typisknorsk.xlsx.54, 2] arbeid |
##
## likestilling | sosialt ansv ytringsfri rett plikt
## likestilling | mangfold trygg omsorg
## likestilling | ytringsfri åpen virk elsk norsk
## likestilling | lit klassebaser tolerans velmen individualism
## likestilling | kjønn 3 natur friluftsliv
## likestilling | unngå stor forskjell fordeling land
## likestilling | demokrati ytringsfri
## likestilling | hjelp uføretrygd fri legehjelp skatteplikt
## likestilling | tillitsbas f.eks selvbetjent betaling turisthytt
## likestilling | demokrati
#textplot_xray(head((kw_demokra),n=10))
#textplot_xray(
# kw_demokra,kw_likestil) +
# ggtitle("Lexical dispersion")
# trim dfm by minimum number of terms
dfm_typisk2 <- dfm_trim(dfm_typisk, min_termfreq = 4)
# drop all documents with only zeroes
docvars(dfm_typisk2, "ntoken") <- ntoken(dfm_typisk2)
ndoc(dfm_typisk2)
## [1] 1184
dfm_typisk3 <- dfm_typisk2 %>%
dfm_subset(ntoken > 0)
ndoc(dfm_typisk3)
## [1] 1164
# idf (not ready)
dfm_typiskidf <- dfm_tfidf(dfm_typisk3) # tf-idf vekting
# plot idf
# compare fit statistics for different latent classes
result <- FindTopicsNumber(
dfm_typisk3,
topics = seq(from = 2, to = 15, by = 1),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
result
## topics Griffiths2004 CaoJuan2009 Arun2010 Deveaud2014
## 1 2 -37610.62 0.03034108 433.7852 2.391077
## 2 3 -35600.93 0.04738686 388.1206 2.257055
## 3 4 -34707.23 0.05604280 359.3747 2.194538
## 4 5 -33846.60 0.05290794 335.4508 2.180222
## 5 6 -33409.46 0.08432974 321.8092 2.027435
## 6 7 -32782.72 0.07128635 301.8783 2.063909
## 7 8 -32764.08 0.08316701 290.4824 2.018753
## 8 9 -32281.49 0.06340169 275.3770 2.053337
## 9 10 -32413.11 0.08754141 268.9272 1.951503
## 10 11 -32189.22 0.07713013 259.9165 1.984814
## 11 12 -31946.27 0.08393813 252.4684 1.900797
## 12 13 -32358.58 0.09108933 246.8695 1.885455
## 13 14 -32262.31 0.09678871 239.7978 1.861070
## 14 15 -32194.28 0.09205773 235.8782 1.811995
FindTopicsNumber_plot(result)
# model 5
set.seed(100)
my_lda_fit <- LDA(convert(dfm_typisk3, to = "topicmodels"), k = 5)
get_terms(my_lda_fit, 30)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "demokrati" "ytringsfri" "lik" "likestilling"
## [2,] "god" "demokrati" "likeverd" "ytringsfri"
## [3,] "likestilling" "likestilling" "frih" "demokrati"
## [4,] "frih" "likeverd" "samfunn" "natur"
## [5,] "respekt" "andr" "norsk" "likeverd"
## [6,] "åpen" "tillit" "demokrati" "tillit"
## [7,] "solidarit" "stor" "tillit" "åpen"
## [8,] "tolerans" "samfunn" "rettferd" "kultur"
## [9,] "natur" "norsk_verdi" "åpen" "tradisjon"
## [10,] "raus" "verdi" "verdi" "samhold"
## [11,] "medmennesk" "respekt" "språk" "rett"
## [12,] "likeverd" "åpen" "hjelp" "solidarit"
## [13,] "utdanning" "frih" "land" "god"
## [14,] "trygg" "tolerans" "sosial" "trygg"
## [15,] "dugnad" "del" "mennesk" "respekt"
## [16,] "skol" "utdanning" "respekt" "menneskeverd"
## [17,] "religion" "grad" "kultur" "demokratisk"
## [18,] "rettferd" "fellesskap" "omsorg" "inkludering"
## [19,] "menneskerett" "lik" "norg" "språk"
## [20,] "barn" "land" "ærl" "lik"
## [21,] "velferd" "dugnadsånd" "samhold" "inkluder"
## [22,] "tar_vare" "tar_vare" "religionsfri" "religionsfri"
## [23,] "korrupsjon" "norg" "dugnadsånd" "nestekjær"
## [24,] "samfunn" "men" "offent" "ærl"
## [25,] "kvinn" "dugnad" "få" "lik_lov"
## [26,] "land" "ønsk" "økonomisk" "dugnadsånd"
## [27,] "rett" "ærl" "hverandr" "samfunn"
## [28,] "regl" "bidr" "famili" "ansv"
## [29,] "ærl" "samhold" "skol" "pressefri"
## [30,] "mangfold" "folk" "kjønn" "kristn_verdi"
## Topic 5
## [1,] "norsk"
## [2,] "lik"
## [3,] "norsk_verdi"
## [4,] "mer"
## [5,] "rettferd"
## [6,] "arbeid"
## [7,] "god"
## [8,] "fellesskap"
## [9,] "folk"
## [10,] "må"
## [11,] "hverandr"
## [12,] "dugnad"
## [13,] "land"
## [14,] "vikt"
## [15,] "vårt"
## [16,] "åpen"
## [17,] "frih"
## [18,] "likestilling"
## [19,] "identit"
## [20,] "rett"
## [21,] "demokrati"
## [22,] "osv"
## [23,] "verd"
## [24,] "mul"
## [25,] "mye"
## [26,] "samfunn"
## [27,] "kultur"
## [28,] "mat"
## [29,] "nordmenn"
## [30,] "fell"
lda_inf <-posterior(my_lda_fit)
# lda_inf
ap_topics <- tidy(my_lda_fit, matrix = "beta")
# ap_topics
# plot of most characteristic words by topic
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(30, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) + ggtitle("Mest karakteristiske ord i kvar topic")+
facet_wrap(~ topic, scales = "free") +
coord_flip()
#paragonic texts for the topic?
# diverging topics
beta_spread <- ap_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 298 x 7
## term topic1 topic2 topic3 topic4 topic5 log_ratio
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 17_mai 0.00298 0.00249 0.00295 0.000493 0.000785 -0.260
## 2 ærl 0.00592 0.00741 0.00876 0.00782 0.00334 0.325
## 3 aksept 0.000433 0.00227 0.00204 0.00195 0.000251 2.39
## 4 aksepter 0.000217 0.00115 0.000340 0.000681 0.00305 2.41
## 5 aktiv 0.000389 0.00321 0.000526 0.000138 0.00125 3.04
## 6 allemannsrett 0.00338 0.00232 0.00391 0.0000343 0.000749 -0.539
## 7 alltid 0.00162 0.000767 0.000108 0.000262 0.00137 -1.08
## 8 alt 0.00147 0.00118 0.00222 0.000650 0.00338 -0.322
## 9 andr 0.00312 0.0251 0.00351 0.00135 0.00564 3.01
## 10 ann 0.00105 0.000113 0.000792 0.0000875 0.000712 -3.22
## # … with 288 more rows
# probability of being in a class
mean(lda_inf$topics[1])
## [1] 0.190926
mean(lda_inf$topics[2])
## [1] 0.193584
mean(lda_inf$topics[3])
## [1] 0.2048709
mean(lda_inf$topics[4])
## [1] 0.1993197
mean(lda_inf$topics[5])
## [1] 0.1861875
# model 7
set.seed(100)
my_lda_fit <- LDA(convert(dfm_typisk3, to = "topicmodels"), k = 7)
get_terms(my_lda_fit, 30)
## Topic 1 Topic 2 Topic 3
## [1,] "god" "likeverd" "lik"
## [2,] "solidarit" "åpen" "frih"
## [3,] "trygg" "tillit" "rettferd"
## [4,] "utdanning" "respekt" "kultur"
## [5,] "skol" "tolerans" "ærl"
## [6,] "raus" "dugnadsånd" "språk"
## [7,] "religion" "fellesskap" "demokratisk"
## [8,] "barn" "samhold" "sosial"
## [9,] "medmennesk" "lit" "omsorg"
## [10,] "menneskerett" "forskjell" "kristn_verdi"
## [11,] "velferd" "grad" "velferdsstat"
## [12,] "kjønn" "rik" "økonomisk"
## [13,] "fri" "fatt" "offent"
## [14,] "hels" "stor" "kulturarv"
## [15,] "helseves" "lit forskjell" "histori"
## [16,] "korrupsjon" "frivil" "kristendom"
## [17,] "godt" "trosfri" "norsk_kultur"
## [18,] "eldr" "gjensid" "allemannsrett"
## [19,] "gratis" "svak" "mening"
## [20,] "eldreomsorg" "aksept" "etikk"
## [21,] "fred" "nøysom" "moral"
## [22,] "lik_mul" "innbygger" "respekt_andr"
## [23,] "mangfold" "utdann" "trygt"
## [24,] "etc" "ordning" "hjelpsom"
## [25,] "press" "klasseskill" "nasjonal"
## [26,] "velferdssamfunn" "omtank" "modell"
## [27,] "legning" "tillitt" "nordisk"
## [28,] "funger" "aktiv" "selvstend"
## [29,] "kristent" "avstand" "sikker"
## [30,] "bistand" "relativ" "arbeidsmoral"
## Topic 4 Topic 5 Topic 6 Topic 7
## [1,] "demokrati" "natur" "samfunn" "andr"
## [2,] "likestilling" "norsk" "norsk_verdi" "land"
## [3,] "ytringsfri" "dugnad" "verdi" "folk"
## [4,] "rett" "hverandr" "norg" "mennesk"
## [5,] "religionsfri" "inkluder" "vårt" "arbeid"
## [6,] "inkludering" "lov" "må" "hjelp"
## [7,] "menneskeverd" "mul" "mer" "bidr"
## [8,] "kvinn" "nestekjær" "vikt" "ansv"
## [9,] "lik_lov" "tar_vare" "vår" "ønsk"
## [10,] "pressefri" "friluftsliv" "norsk" "fell"
## [11,] "lik_rett" "uavheng" "identit" "skatt"
## [12,] "tradisjon" "tar" "del" "tro"
## [13,] "samarbeid" "famili" "kristn" "føl"
## [14,] "menn" "norsk_tradisjon" "men" "tradisjon"
## [15,] "respekt_andr_mennesk" "mat" "stolt" "nordmenn"
## [16,] "valgfri" "fordeling" "verd" "ulik"
## [17,] "homofil" "osv" "bygg" "17_mai"
## [18,] "velferdsordning" "var" "politisk" "få"
## [19,] "grupp" "miljø" "land" "flest"
## [20,] "plikt" "alt" "mest" "mye"
## [21,] "krev" "tradisjon" "forhold" "regl"
## [22,] "yttringsfri" "tenk" "ta" "komm"
## [23,] "lovverk" "person" "gjør" "liv"
## [24,] "vintersport" "jant" "åpent" "godt"
## [25,] "gjør_plikt" "bruk" "jobb" "bedr"
## [26,] "gjestfri" "stol" "tar_vare" "spesielt"
## [27,] "ferdsel" "sett" "vern" "stor"
## [28,] "institusjon" "tilgang" "finn" "best"
## [29,] "viktigst" "venn" "bør" "følg"
## [30,] "helsehjelp" "arv" "mye" "gi"
lda_inf <-posterior(my_lda_fit)
# lda_inf
ap_topics <- tidy(my_lda_fit, matrix = "beta")
# ap_topics
# plot of most characteristic words by topic
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) + ggtitle("Mest karakteristiske ord i kvar topic")+
facet_wrap(~ topic, scales = "free") +
coord_flip()
## diverging topics
beta_spread <- ap_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 123 x 9
## term topic1 topic2 topic3 topic4 topic5 topic6 topic7
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 akse… 1.73e-14 9.88e- 3 1.51e-10 2.25e-11 3.42e-16 3.15e-16 1.09e-16
## 2 aktiv 2.78e- 7 7.91e- 3 8.14e-13 2.72e-14 6.96e- 9 3.82e-12 7.46e- 9
## 3 åpen 6.18e- 6 1.17e- 1 6.59e- 4 5.00e- 6 2.07e-10 8.08e-17 1.06e-13
## 4 arbe… 4.21e- 3 4.14e-33 1.70e-13 4.02e-33 4.22e- 6 8.29e-29 1.19e-10
## 5 avst… 1.13e-18 7.91e- 3 7.25e-18 1.26e-21 5.70e- 7 3.05e-16 7.25e-11
## 6 bære… 4.22e- 3 1.35e- 8 5.85e-12 6.47e-17 2.62e-21 1.19e-18 8.87e-23
## 7 bakg… 7.38e- 3 3.65e-15 2.66e-19 5.29e-13 3.63e-15 1.89e-24 2.57e-13
## 8 barn 3.06e- 2 1.70e-14 1.11e-17 5.97e-14 6.79e-14 3.01e-17 9.43e-10
## 9 best… 4.22e- 3 3.28e-19 7.87e-13 1.72e- 7 6.55e-11 7.48e-23 4.63e-19
## 10 bist… 8.50e- 3 1.07e- 9 2.30e- 7 5.11e- 8 4.15e-10 7.58e- 4 5.60e- 5
## # … with 113 more rows, and 1 more variable: log_ratio <dbl>
## probability of being in a class
mean(lda_inf$topics[1])
## [1] 0.09137702
mean(lda_inf$topics[2])
## [1] 0.09778256
mean(lda_inf$topics[3])
## [1] 0.5625453
mean(lda_inf$topics[4])
## [1] 0.1250525
mean(lda_inf$topics[5])
## [1] 0.02228537
mean(lda_inf$topics[6])
## [1] 0.1001008
mean(lda_inf$topics[7])
## [1] 0.1542776
# model 9
set.seed(100)
my_lda_fit <- LDA(convert(dfm_typisk3, to = "topicmodels"), k = 9)
get_terms(my_lda_fit, 30)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "god" "samfunn" "likestilling" "demokrati"
## [2,] "rett" "land" "frih" "ytringsfri"
## [3,] "skol" "norsk_verdi" "rettferd" "likeverd"
## [4,] "religion" "dugnad" "religionsfri" "tolerans"
## [5,] "barn" "hverandr" "nestekjær" "medmennesk"
## [6,] "omsorg" "norg" "fri" "menneskeverd"
## [7,] "bidr" "mer" "velferdsstat" "trosfri"
## [8,] "offent" "del" "pressefri" "legning"
## [9,] "hels" "stolt" "lik_lov" "respekt_andr_mennesk"
## [10,] "korrupsjon" "verd" "allemannsrett" "krist_kulturarv"
## [11,] "kulturarv" "politisk" "gratis" "medbestemm"
## [12,] "lik_rett" "jant" "lik_mul" "valgfri"
## [13,] "eldr" "frivil" "press" "selvstend"
## [14,] "histori" "gjør" "kristent" "homofil"
## [15,] "eldreomsorg" "behold" "bistand" "ytring"
## [16,] "fred" "vårt" "funger" "vintersport"
## [17,] "godt" "finn" "nasjonal" "kunnskap"
## [18,] "etikk" "dag" "valg" "religionsfri"
## [19,] "ta" "individ" "skolegang" "respekt"
## [20,] "moral" "vet" "rettsstat" "beskytt"
## [21,] "jobb" "opptatt" "ytringsfridom" "kjønn"
## [22,] "innbygger" "stol" "red" "likestilling"
## [23,] "ordning" "innvandring" "humanism" "forstå"
## [24,] "trygt" "leng" "etc" "miljøbevisst"
## [25,] "bygd" "hell" "brunost" "naivit"
## [26,] "økonomi" "fler" "bestemm" "lovverk"
## [27,] "ivaretak" "mindr" "kontroll" "omsorg"
## [28,] "ung" "vis" "tank" "naturvern"
## [29,] "arbeidsliv" "men" "gjennomsikt" "uansett"
## [30,] "helsetjenest" "enkelt" "pålit" "rettsikker"
## Topic 5 Topic 6 Topic 7 Topic 8
## [1,] "andr" "åpen" "norsk" "natur"
## [2,] "folk" "tillit" "kultur" "dugnadsånd"
## [3,] "stor" "respekt" "verdi" "tradisjon"
## [4,] "arbeid" "solidarit" "språk" "sosial"
## [5,] "hjelp" "trygg" "må" "tar_vare"
## [6,] "ansv" "ærl" "vikt" "lit"
## [7,] "ønsk" "inkludering" "vårt" "raus"
## [8,] "grad" "velferd" "norsk_tradisjon" "menneskerett"
## [9,] "tar" "kvinn" "identit" "vår"
## [10,] "fell" "fordeling" "tro" "økonomisk"
## [11,] "skatt" "lit forskjell" "mat" "kristn_verdi"
## [12,] "komm" "kristendom" "norsk_kultur" "forskjell"
## [13,] "godt" "mening" "bygg" "friluftsliv"
## [14,] "føl" "mangfold" "alt" "kristn"
## [15,] "ulik" "samarbeid" "regl" "fatt"
## [16,] "nordmenn" "gjensid" "mest" "rik"
## [17,] "få" "aksept" "men" "var"
## [18,] "flest" "velferdssamfunn" "person" "miljø"
## [19,] "mye" "respekt_andr" "tenk" "17_mai"
## [20,] "liv" "menn" "flagg" "svak"
## [21,] "spesielt" "hjelpsom" "vern" "nøysom"
## [22,] "bedr" "åpent" "bør" "jul"
## [23,] "best" "tillitt" "typisk" "arv"
## [24,] "vansk" "relativ" "grunn" "fokus"
## [25,] "stat" "rettssikker" "fjell" "avstand"
## [26,] "treng" "arbeidsmoral" "kirk" "klasseskill"
## [27,] "litt" "mynd" "holdning" "aktiv"
## [28,] "får" "grupp" "egn" "ivaret"
## [29,] "høy" "egalit" "positiv" "bruk"
## [30,] "respekter" "verdisyn" "fjord" "humanistisk"
## Topic 9
## [1,] "lik"
## [2,] "fellesskap"
## [3,] "samhold"
## [4,] "utdanning"
## [5,] "demokratisk"
## [6,] "inkluder"
## [7,] "mennesk"
## [8,] "lov"
## [9,] "mul"
## [10,] "kjønn"
## [11,] "famili"
## [12,] "uavheng"
## [13,] "helseves"
## [14,] "lev"
## [15,] "osv"
## [16,] "forhold"
## [17,] "tilgang"
## [18,] "sett"
## [19,] "venn"
## [20,] "utdann"
## [21,] "velg"
## [22,] "omtank"
## [23,] "fritt"
## [24,] "lønn"
## [25,] "egen"
## [26,] "modell"
## [27,] "likestilt"
## [28,] "stort"
## [29,] "kjær"
## [30,] "nordisk"
lda_inf <-posterior(my_lda_fit)
# lda_inf
ap_topics <- tidy(my_lda_fit, matrix = "beta")
# ap_topics
# plot of most characteristic words by topic
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) + ggtitle("Mest karakteristiske ord i kvar topic")+
facet_wrap(~ topic, scales = "free") +
coord_flip()
#paragonic texts for the topic?
# diverging topics
beta_spread <- ap_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 169 x 11
## term topic1 topic2 topic3 topic4 topic5 topic6 topic7
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 akse… 2.26e-21 7.40e- 3 2.88e-19 5.85e-20 2.11e-14 4.72e-17 1.63e- 3
## 2 allt… 1.49e-23 6.76e- 3 1.12e-22 3.72e-29 8.65e-24 5.81e-22 9.77e- 9
## 3 ann 1.74e- 3 6.90e-14 1.49e-23 1.50e-38 3.82e-19 8.41e-29 3.02e- 3
## 4 åpent 4.50e-13 2.88e- 3 8.83e-15 2.40e-17 2.03e- 9 1.10e- 2 2.23e-15
## 5 åpn 2.05e-34 4.50e- 3 3.27e-17 2.15e-25 1.69e-30 4.41e-33 1.25e-36
## 6 år 2.09e-42 3.48e- 3 3.49e-54 3.66e-57 8.04e-43 6.94e-44 2.18e- 3
## 7 arbe… 9.58e- 3 7.43e-19 3.68e-16 5.97e-19 8.19e-12 1.59e-11 1.42e-22
## 8 arbe… 3.24e-13 6.76e- 3 3.57e- 7 3.01e-14 1.99e-20 5.02e-10 1.19e-24
## 9 arbe… 5.10e- 3 9.83e-50 2.19e-48 3.72e-51 2.50e-30 3.75e-38 1.14e-39
## 10 bakg… 8.92e- 3 7.95e-27 1.87e-23 1.66e-20 1.83e-21 5.70e-20 3.61e-34
## # … with 159 more rows, and 3 more variables: topic8 <dbl>, topic9 <dbl>,
## # log_ratio <dbl>
# probability of being in a class
mean(lda_inf$topics[1])
## [1] 0.05654479
mean(lda_inf$topics[2])
## [1] 0.0600119
mean(lda_inf$topics[3])
## [1] 0.566706
mean(lda_inf$topics[4])
## [1] 0.09955673
mean(lda_inf$topics[5])
## [1] 0.02165925
mean(lda_inf$topics[6])
## [1] 0.08241594
mean(lda_inf$topics[7])
## [1] 0.06550012
mean(lda_inf$topics[8])
## [1] 0.08241594
mean(lda_inf$topics[9])
## [1] 0.07588348
#paragonic texts for the topic?
# diverging topics
beta_spread <- ap_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 169 x 11
## term topic1 topic2 topic3 topic4 topic5 topic6 topic7
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 akse… 2.26e-21 7.40e- 3 2.88e-19 5.85e-20 2.11e-14 4.72e-17 1.63e- 3
## 2 allt… 1.49e-23 6.76e- 3 1.12e-22 3.72e-29 8.65e-24 5.81e-22 9.77e- 9
## 3 ann 1.74e- 3 6.90e-14 1.49e-23 1.50e-38 3.82e-19 8.41e-29 3.02e- 3
## 4 åpent 4.50e-13 2.88e- 3 8.83e-15 2.40e-17 2.03e- 9 1.10e- 2 2.23e-15
## 5 åpn 2.05e-34 4.50e- 3 3.27e-17 2.15e-25 1.69e-30 4.41e-33 1.25e-36
## 6 år 2.09e-42 3.48e- 3 3.49e-54 3.66e-57 8.04e-43 6.94e-44 2.18e- 3
## 7 arbe… 9.58e- 3 7.43e-19 3.68e-16 5.97e-19 8.19e-12 1.59e-11 1.42e-22
## 8 arbe… 3.24e-13 6.76e- 3 3.57e- 7 3.01e-14 1.99e-20 5.02e-10 1.19e-24
## 9 arbe… 5.10e- 3 9.83e-50 2.19e-48 3.72e-51 2.50e-30 3.75e-38 1.14e-39
## 10 bakg… 8.92e- 3 7.95e-27 1.87e-23 1.66e-20 1.83e-21 5.70e-20 3.61e-34
## # … with 159 more rows, and 3 more variables: topic8 <dbl>, topic9 <dbl>,
## # log_ratio <dbl>
# probability of being in a class
mean(lda_inf$topics[1])
## [1] 0.05654479
mean(lda_inf$topics[2])
## [1] 0.0600119
mean(lda_inf$topics[3])
## [1] 0.566706
mean(lda_inf$topics[4])
## [1] 0.09955673
mean(lda_inf$topics[5])
## [1] 0.02165925
# trim dfm min words
dfm_typisk4 <- dfm_trim(dfm_typisk, min_termfreq = 4)
# drop all documents with only zeroes
docvars(dfm_typisk4, "ntoken") <- ntoken(dfm_typisk4)
ndoc(dfm_typisk4)
## [1] 1184
dfm_typisk5 <- dfm_typisk4 %>%
dfm_subset(ntoken > 0)
ndoc(dfm_typisk5)
## [1] 1164
dfm_typisk5_mca <- dfm_weight(dfm_typisk5, scheme="boolean") # gjer til indikatormatrise
# convert to dataframe
typiskdata <- convert(dfm_typisk5, to = "data.frame")
typiskdata_mca <- convert(dfm_typisk5, to = "data.frame")
#typiskdata<-typiskdata[-c(344,1099),] # drop outliers
#typiskdata_mca<-typiskdata_mca[-c(344,1099),] # drop outliers
# add metadata to data frame, if needed
#typiskedata2 <-docvars(dfm_typisk5)
#typiskedata3 <-cbind(typiskdata,typiskedata2) # if need metadata
# PCA
res<-PCA(typiskdata_mca[2:length(typiskdata_mca)])
fviz_screeplot(res)
p1<-fviz_contrib(res, axes = 1, top=20, fill="black", choice="var")
p2<-fviz_contrib(res, axes = 2, top=20, fill="black", choice="var")
p3<-fviz_contrib(res, axes = 3, top=20, fill="black", choice="var")
p4<-fviz_contrib(res, axes = 4, top=20, fill="black", choice="var")
grid.arrange(p1,p2,p3,p4, nrow=2)
fviz_pca_var(res)
fviz_pca_var(res, repel=TRUE, geom="text", labelsize=4, select.var=list(contrib=80))
fviz_pca_var(res, repel=TRUE, geom="text", labelsize=4, axes=c(3,2), select.var=list(contrib=80))
fviz_pca_var(res, repel=TRUE, geom="text", labelsize=4, select.var=list(cos2=0.1))
fviz_pca_var(res, repel=TRUE, geom="text", labelsize=4, axes=c(3,2), select.var=list(cos2=0.05))
# Clustering on principal components
res.hcpc<-HCPC(res, nb.clust=2,consol=TRUE,min=2,max=12,graph=TRUE)
fviz_cluster(res.hcpc, data = x, axes=c(2,1), ellipse.type = "convex", repel=FALSE, labelsize=3)+
labs(title = "Clusters") +
theme(axis.text=element_text(size=12), axis.title=element_text(size=12,face="bold"))
head(res.hcpc$desc.var$quanti$`1`, n=20)
## v.test Mean in category Overall mean sd in category
## norsk_verdi -2.258271 0.054216867 0.054982818 0.2776598
## verdi -2.354046 0.053356282 0.054123711 0.2667648
## land -2.418537 0.051635112 0.052405498 0.2605797
## norsk -2.906928 0.094664372 0.096219931 0.4366857
## utdanning -3.096953 0.041308090 0.042096220 0.2074707
## stor -3.362608 0.025817556 0.026632302 0.1972825
## godt -4.242338 0.023235800 0.024054983 0.1562596
## tillit -4.317667 0.085197935 0.086769759 0.2941855
## kjønn -4.481596 0.022375215 0.023195876 0.1479005
## tro -4.591213 0.018072289 0.018900344 0.1455611
## fell -4.709460 0.013769363 0.014604811 0.1430540
## famili -4.781926 0.018072289 0.018900344 0.1395237
## identit -4.781926 0.018072289 0.018900344 0.1395237
## del -4.998566 0.018072289 0.018900344 0.1332129
## fatt -5.256663 0.016351119 0.017182131 0.1268218
## gratis -5.410697 0.013769363 0.014604811 0.1236970
## få -5.735445 0.012048193 0.012886598 0.1167228
## bør -5.932335 0.009466437 0.010309278 0.1132217
## ulik -6.333404 0.011187608 0.012027491 0.1051782
## leng -6.374229 0.006024096 0.006872852 0.1055998
## Overall sd p.value
## norsk_verdi 0.2788062 2.392875e-02
## verdi 0.2679793 1.857031e-02
## land 0.2618387 1.558307e-02
## norsk 0.4398760 3.649969e-03
## utdanning 0.2091902 1.955209e-03
## stor 0.1991701 7.720984e-04
## godt 0.1587279 2.212035e-05
## tillit 0.2992491 1.576869e-05
## kjønn 0.1505252 7.408705e-06
## tro 0.1482550 4.406766e-06
## fell 0.1458231 2.483745e-06
## famili 0.1423423 1.736239e-06
## identit 0.1423423 1.736239e-06
## del 0.1361731 5.775825e-07
## fatt 0.1299496 1.466931e-07
## gratis 0.1269241 6.277991e-08
## få 0.1201613 9.725663e-09
## bør 0.1167879 2.986566e-09
## ulik 0.1090084 2.398106e-10
## leng 0.1094544 1.838856e-10
head(res.hcpc$desc.var$quanti$`2`, n=20)
## v.test Mean in category Overall mean sd in category
## nok 20.85665 1.5 0.008591065 0.5
## skjer 19.66100 1.0 0.003436426 1.0
## rest 19.66100 1.0 0.003436426 1.0
## bosetting 19.66100 1.0 0.003436426 1.0
## svært 19.66100 1.0 0.003436426 1.0
## individ 18.64186 1.5 0.007731959 1.5
## befolkning 18.19395 1.0 0.004295533 1.0
## fortsatt 17.01101 1.0 0.005154639 1.0
## folkeskikk 17.01101 1.0 0.005154639 1.0
## treng 17.00896 1.5 0.008591065 1.5
## gjeld 17.00731 1.5 0.010309278 0.5
## flest 16.55056 1.5 0.011168385 1.5
## mye 16.50126 2.0 0.018900344 1.0
## behov 16.03081 1.0 0.006013746 1.0
## folk 15.84525 2.5 0.039518900 0.5
## verd 15.73696 1.5 0.012886598 1.5
## hell 15.20129 1.0 0.006872852 1.0
## går 15.19864 1.0 0.008591065 0.0
## mer 14.45960 2.5 0.026632302 1.5
## opptatt 13.31990 1.0 0.007731959 1.0
## Overall sd p.value
## nok 0.10117051 1.326378e-96
## skjer 0.07171353 4.654172e-86
## rest 0.07171353 4.654172e-86
## bosetting 0.07171353 4.654172e-86
## svært 0.07171353 4.654172e-86
## individ 0.11325553 1.470410e-77
## befolkning 0.07742928 5.763280e-74
## fortsatt 0.08274226 6.805980e-65
## folkeskikk 0.08274226 6.805980e-65
## treng 0.12405689 7.048275e-65
## gjeld 0.12392593 7.248640e-65
## flest 0.12727251 1.586124e-61
## mye 0.16985994 3.593067e-61
## behov 0.08772567 7.786137e-58
## folk 0.21969623 1.516066e-56
## verd 0.13369807 8.440637e-56
## hell 0.09243284 3.467398e-52
## går 0.09228900 3.610452e-52
## mer 0.24201048 2.180583e-47
## opptatt 0.10539735 1.773352e-40