This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

This document includes both content as well as the output of any embedded R code chunks within the document.

1 Gyakoriságok

# Packages: readxl for the Excel import, tm for the corpus object.
library(readxl)
library(tm)

# Abbreviations of the four gospels.
gospels <- c("Mt", "Mk", "Lk", "Jn")

# Read the Luke word table; its Lemma column feeds the corpus below.
evm <- read_xlsx("Lk_short.xlsx")

# Build a tm corpus from the Lemma column.
word.corpus <- Corpus(
  VectorSource(evm$Lemma)
)
# To check that the corpus loaded properly, run: inspect(word.corpus)

1.1 Gyakoriságok - táblázatok

## # A tibble: 20 × 2
##    Lemma       n
##    <chr>   <int>
##  1 ὁ        2646
##  2 καί      1469
##  3 αὐτός    1086
##  4 δέ        542
##  5 λέγω      533
##  6 σύ        446
##  7 ἐν        361
##  8 εἰμί      360
##  9 ἐγώ       282
## 10 οὗτος     229
## 11 εἰς       226
## 12 ὅς        190
## 13 ὅτι       174
## 14 οὐ        172
## 15 πρός      166
## 16 ἐπί       161
## 17 πᾶς       158
## 18 μή        140
## 19 γίνομαι   131
## 20 ἀπό       125

1.2 Gyakoriságok - diagramok

# Re-level Lemma so that its factor levels follow descending n.
top20$Lemma <- factor(
  top20$Lemma,
  levels = top20$Lemma[order(top20$n, decreasing = TRUE)]
)

# Build the bar chart and attach the title and axis labels in one pipeline.
library(plotly)
fig <- plot_ly(
  data = top20,
  x = ~Lemma,
  y = ~n,
  type = "bar",
  marker = list(color = "sandybrown")
) %>%
  layout(
    title = "Bar Chart Sorted by Value",
    xaxis = list(title = "Lemma"),
    yaxis = list(title = "Előfordulás")
  )

# Print the widget.
fig

Ez az ábra interaktív; ha ráhúzzuk az egeret, az aktuális lemmát (szótőt) és annak gyakoriságát mutatja.

1.3 Gyakoriságok - szófelhők

# Packages used for the word cloud.
library(SnowballC)
library(tm)
library(wordcloud2)
library(RColorBrewer)

# Disabled alternative that derived the frequencies from word.corpus:
# word.counts <- as.matrix(TermDocumentMatrix(word.corpus))
# word.freq <- sort(rowSums(word.counts), decreasing = TRUE)

# Frequency table of the first 188 entries of evm$FullWord.
greek_words <- evm$FullWord[seq_len(188)]
word_freqs <- as.data.frame(table(greek_words))

# Drop every row whose word is on the "grc" stopword list of the "perseus"
# source.
perzsa <- stopwords::stopwords(language = "grc", source = "perseus")
word_freqs_filtered <- filter(word_freqs, !greek_words %in% perzsa)

# Fix the random seed first so the same cloud can be reproduced.
set.seed(32)
wc <- wordcloud2(
  word_freqs_filtered,
  size = 1,
  gridSize = 8,
  color = "random-dark",
  backgroundColor = "white"
)
wc

1.4 Gyakoriságok - bigram hálózatok

Image file (greek_network_Jn_prologue.png)

2 Zipf-szabály

2.1 A Zipf-szabály táblázatban

# zipfR has to be installed once beforehand: install.packages("zipfR")
library(zipfR)
library(ggplot2)

# Parameters of the theoretical distribution.
N <- 100   # total number of elements (ranks)
s <- 1.5   # shape parameter (exponent)

# Zipf probabilities: rank^(-s), normalised so that they sum to one.
zipf_probs <- prop.table(1 / seq_len(N)^s)
zipf_data <- data.frame(Rank = seq_len(N), Probability = zipf_probs)

# Show the first 20 rows.
head(zipf_data, n = 20)

##    Rank Probability
## 1     1 0.414443506
## 2     2 0.146527907
## 3     3 0.079759690
## 4     4 0.051805438
## 5     5 0.037068954
## 6     6 0.028199309
## 7     7 0.022377846
## 8     8 0.018315988
## 9     9 0.015349759
## 10   10 0.013105854
## 11   11 0.011359947
## 12   12 0.009969961
## 13   13 0.008841996
## 14   14 0.007911763
## 15   15 0.007133924
## 16   16 0.006475680
## 17   17 0.005912783
## 18   18 0.005426960
## 19   19 0.005004203
## 20   20 0.004633619

2.2 A Zipf-szabály mint hatványfüggvény

# Plot the theoretical Zipf probabilities against rank on linear axes.
# The line thickness is set with `linewidth`: ggplot2 3.4.0 deprecated `size`
# for lines, and the old spelling raises a deprecation warning.
ggplot(zipf_data, aes(x = Rank, y = Probability)) +
  geom_line(color = "brown", linewidth = 0.75) +
  labs(
    title = "Basic Zipf Distribution",
    x = "Rank",
    y = "Probability"
  ) +
  theme_minimal()

2.3 A Zipf-szabály log-log skálán

# Plot the same Zipf probabilities with both axes on a log10 scale.
# The line thickness is set with `linewidth`: ggplot2 3.4.0 deprecated `size`
# for lines, and the old spelling raises a deprecation warning.
ggplot(zipf_data, aes(x = Rank, y = Probability)) +
  geom_line(color = "brown", linewidth = 0.75) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Log/log Scale Zipf Distribution",
    x = "Rank",
    y = "Probability"
  ) +
  theme_minimal()

2.4 A Zipf-szabály és Harry Potter

Harry Potter books

2.5 A Zipf-szabály és az evangéliumok

2.5.1 Táblázatban

# Packages: readxl for the Excel import, tidyverse for count().
library(readxl)
library(tidyverse)

# For each gospel: read the word table, count the FullWord values sorted by
# descending frequency, keep the first 50 rows, and store the sum of all
# counts.

# Matthew
evm <- read_xlsx("Mt_short.xlsx")
freqtab1 <- count(evm, FullWord, sort = TRUE)
top50Mt <- freqtab1[seq_len(50), ]
Mt_total <- sum(freqtab1$n)

# Mark
evm <- read_xlsx("Mk_short.xlsx")
freqtab2 <- count(evm, FullWord, sort = TRUE)
top50Mk <- freqtab2[seq_len(50), ]
Mk_total <- sum(freqtab2$n)

# Luke
evm <- read_xlsx("Lk_short.xlsx")
freqtab3 <- count(evm, FullWord, sort = TRUE)
top50Lk <- freqtab3[seq_len(50), ]
Lk_total <- sum(freqtab3$n)

# John
evm <- read_xlsx("Jn_short.xlsx")
freqtab4 <- count(evm, FullWord, sort = TRUE)
top50Jn <- freqtab4[seq_len(50), ]
Jn_total <- sum(freqtab4$n)

# Place the four top-50 tables side by side and label the column pairs.
evmtab50 <- cbind(top50Mt, top50Mk, top50Lk, top50Jn)
names(evmtab50) <- c(
  "Szó(Mt)", "n",
  "Szó(Mk)", "n",
  "Szó(Lk)", "n",
  "Szó(Jn)", "n"
)
evmtab50

##    Szó(Mt)    n Szó(Mk)    n Szó(Lk)    n  Szó(Jn)   n
## 1      καὶ 1175     καὶ 1085     καὶ 1466      καὶ 827
## 2        ὁ  493       ὁ  237      δὲ  513        ὁ 565
## 3       δὲ  471   αὐτοῦ  173       ὁ  399      ὅτι 271
## 4      τοῦ  294     εἰς  168     τοῦ  380      τοῦ 243
## 5       ἐν  293      δὲ  155      ἐν  360      τὸν 240
## 6    αὐτοῦ  266     τὸν  150   αὐτοῦ  255       ἐν 226
## 7       τὸ  227      ἐν  135   εἶπεν  229       δὲ 203
## 8       οἱ  224     τοῦ  132     εἰς  225      οὖν 200
## 9      τὸν  221      τὸ  131      τὸ  222   Ἰησοῦς 198
## 10     εἰς  218     τὴν  126     τὸν  216      εἰς 187
## 11     τῶν  206      οἱ  123      οἱ  185    αὐτοῦ 173
## 12     τὴν  203    αὐτῷ  121      τῷ  177     αὐτῷ 173
## 13    αὐτῷ  170  αὐτοῖς  120     ὅτι  174      οὐκ 151
## 14      τῷ  149   αὐτὸν  117     τὴν  171       τὸ 150
## 15     ὅτι  140     τῶν  108    πρὸς  161      ἵνα 145
## 16      μὴ  123     ὅτι  102    αὐτῷ  153       οἱ 144
## 17     τῆς  121     τῆς   80   αὐτὸν  145      τὴν 142
## 18       ἡ  121      τῷ   77      τῇ  136       ἐκ 139
## 19   εἶπεν  119      μὴ   72      μὴ  132    λέγει 123
## 20  Ἰησοῦς  111     οὐκ   66     τῶν  131        ἡ 122
## 21      τὰ  110     ἵνα   64     τῆς  119       τῷ 114
## 22     γὰρ  108    πρὸς   63    τοὺς  118    εἶπεν 112
## 23    τοὺς  108    τοὺς   63     ἐπὶ  116      τῶν 109
## 24    τοῖς  108   λέγει   62     σου  104       οὐ 108
## 25    ὑμῖν  107   αὐτόν   61      τὰ  104    ἐστιν 107
## 26  αὐτοῖς  103     γὰρ   60       ἡ  102       μὴ 106
## 27      τῇ  103       ἡ   60     οὐκ   99      ἐγὼ 103
## 28   αὐτῶν  100   εἶπεν   59   αὐτῶν   98     ὑμῖν 103
## 29     ἐπὶ   99      τῇ   59    ὑμῖν   96   αὐτοῖς 100
## 30     οὐκ   98  Ἰησοῦς   58  αὐτοῖς   91    αὐτὸν 100
## 31     σου   98    τοῖς   56     γὰρ   87       με  99
## 32   αὐτὸν   94      τί   54     μου   87      μου  98
## 33      οὐ   92      τὰ   52     ἀπὸ   83     πρὸς  97
## 34     ἀπὸ   92     ἐπὶ   52      ἦν   75       ἦν  96
## 35    τότε   90   ἐστιν   52    τοῖς   74      τῆς  82
## 36     μου   83      οὐ   45    θεοῦ   72       τὰ  80
## 37   ἐστιν   83   αὐτῶν   42 ἐγένετο   69    αὐτόν  76
## 38    ὑμῶν   76      ἐκ   42   ἐστιν   69       τῇ  72
## 39       ἢ   65   εὐθὺς   41    ὑμῶν   67    ὑμεῖς  68
## 40      τί   64     μου   40   αὐτόν   66     περὶ  67
## 41    λέγω   61     σου   39      οὐ   66    ταῦτα  61
## 42    ἰδοὺ   60      ἦν   38      τί   62      γὰρ  60
## 43     ἐὰν   58    ὑμῖν   37    ἰδοὺ   57 ἀπεκρίθη  57
## 44     οὖν   56    μετὰ   36  Ἰησοῦς   55     τοὺς  55
## 45   λέγει   54     ἀπὸ   36      ὡς   51     ἀλλὰ  52
## 46     διὰ   53       ἢ   33      εἰ   50    τοῦτο  51
## 47      εἰ   53    θεοῦ   31    λέγω   50     ἀλλ’  50
## 48      ἐκ   52     τὰς   31      ἐκ   50     ἀμὴν  50
## 49   λέγων   49  ἔλεγεν   31  αὐτούς   47       εἰ  49
## 50     ἕως   49     διὰ   30   λέγων   47       τί  48

2.5.2 Grafikonon

# plotly has to be installed once beforehand: install.packages("plotly")
library(plotly)

# One row per rank (1-50) and one column of counts per gospel, taken from the
# top-50 frequency tables.
datus <- data.frame(
  Roll_number = 1:50,
  y1 = top50Mt$n,
  y2 = top50Mk$n,
  y3 = top50Lk$n,
  y4 = top50Jn$n
)

# One line per gospel: frequency against rank.
fig <- plotly::plot_ly(
  data = datus, x = ~Roll_number,
  y = ~y1, name = "Mt",
  type = "scatter", mode = "lines"
) %>%
  add_trace(y = ~y2, name = "Mk") %>%
  add_trace(y = ~y4, name = "Jn") %>%
  add_trace(y = ~y3, name = "Lk") %>%
  layout(
    # Fixed the missing apostrophe in "Zipf's".
    title = "Zipf's law and the gospels",
    xaxis = list(title = "Helyezés"),
    yaxis = list(title = "Előfordulás"),
    # The legend title used to be the placeholder text 'Legend Title'; it now
    # names what the legend entries are (gospels), in the language of the
    # axis labels.
    legend = list(title = list(text = "Evangélium"))
  )

fig

2.6 The largest cities in the world

2.6.1 In datatable

# Load the city table from Excel and print its first 100 rows.
library(readxl)
library(DT)

# NOTE(review): this is an absolute, machine-specific path, unlike the
# relative paths used for the other workbooks - confirm before sharing.
bigs <- read_xlsx("C:/users/weltl/Dokumentumok/bigcities5col.xlsx")

# Disabled interactive alternative: dtbigs <- datatable(bigs)
print(bigs, n = 100)

## # A tibble: 822 × 5
##     population city             country        cca2   rank
##          <dbl> <chr>            <chr>          <chr> <dbl>
##   1   37036200 Tokyo            Japan          JP        1
##   2   34665600 Delhi            India          IN        2
##   3   30482100 Shanghai         China          CN        3
##   4   24652900 Dhaka            Bangladesh     BD        4
##   5   23074200 Cairo            Egypt          EG        5
##   6   22990000 Sao Paulo        Brazil         BR        6
##   7   22752400 Mexico City      Mexico         MX        7
##   8   22596500 Beijing          China          CN        8
##   9   22089000 Mumbai           India          IN        9
##  10   18921600 Osaka            Japan          JP       10
##  11   18171200 Chongqing        China          CN       11
##  12   18076800 Karachi          Pakistan       PK       12
##  13   17778500 Kinshasa         DR Congo       CD       13
##  14   17156400 Lagos            Nigeria        NG       14
##  15   16236700 Istanbul         Turkey         TR       15
##  16   15845200 Kolkata          India          IN       16
##  17   15752300 Buenos Aires     Argentina      AR       17
##  18   15230600 Manila           Philippines    PH       18
##  19   14878700 Guangzhou        China          CN       19
##  20   14825800 Lahore           Pakistan       PK       20
##  21   14704100 Tianjin          China          CN       21
##  22   14395400 Bangalore        India          IN       22
##  23   13923200 Rio de Janeiro   Brazil         BR       23
##  24   13545400 Shenzhen         China          CN       24
##  25   12737400 Moscow           Russia         RU       25
##  26   12336000 Chennai          India          IN       26
##  27   11795800 Bogota           Colombia       CO       27
##  28   11634100 Jakarta          Indonesia      ID       28
##  29   11517300 Lima             Peru           PE       29
##  30   11391700 Bangkok          Thailand       TH       30
##  31   11346800 Paris            France         FR       31
##  32   11337900 Hyderabad        India          IN       32
##  33   10174900 Nanjing          China          CN       33
##  34   10027900 Luanda           Angola         AO       34
##  35   10025800 Seoul            South Korea    KR       35
##  36    9998870 Chengdu          China          CN       36
##  37    9840740 London           United Kingdom GB       37
##  38    9816320 Ho Chi Minh City Vietnam        VN       38
##  39    9729740 Tehran           Iran           IR       39
##  40    9534790 Nagoya           Japan          JP       40
##  41    9222080 Xi-an            China          CN       41
##  42    9061820 Ahmedabad        India          IN       42
##  43    9000280 Kuala Lumpur     Malaysia       MY       43
##  44    8986480 Wuhan            China          CN       44
##  45    8592820 Suzhou           China          CN       45
##  46    8591040 Hangzhou         China          CN       46
##  47    8581730 Surat            India          IN       47
##  48    8561520 Dar es Salaam    Tanzania       TZ       48
##  49    8141120 Baghdad          Iraq           IQ       49
##  50    7974270 Shenyang         China          CN       50
##  51    7952860 Riyadh           Saudi Arabia   SA       51
##  52    7936530 New York City    United States  US       52
##  53    7817160 Foshan           China          CN       53
##  54    7772860 Dongguan         China          CN       54
##  55    7768510 Hong Kong        Hong Kong      HK       55
##  56    7525720 Pune             India          IN       56
##  57    7066860 Haerbin          China          CN       57
##  58    6999460 Santiago         Chile          CL       58
##  59    6810530 Madrid           Spain          ES       59
##  60    6754180 Khartoum         Sudan          SD       60
##  61    6491290 Toronto          Canada         CA       61
##  62    6444580 Johannesburg     South Africa   ZA       62
##  63    6351680 Belo Horizonte   Brazil         BR       63
##  64    6347380 Dalian           China          CN       64
##  65    6217970 Qingdao          China          CN       65
##  66    6157270 Singapore        Singapore      SG       66
##  67    6156140 Zhengzhou        China          CN       67
##  68    6065850 Ji nan Shandong  China          CN       68
##  69    6056880 Abidjan          Ivory Coast    CI       69
##  70    5956680 Addis Ababa      Ethiopia       ET       70
##  71    5813190 Yangon           Myanmar        MM       71
##  72    5807050 Alexandria       Egypt          EG       72
##  73    5766990 Nairobi          Kenya          KE       73
##  74    5733250 Barcelona        Spain          ES       74
##  75    5653490 Chittagong       Bangladesh     BD       75
##  76    5602200 Hanoi            Vietnam        VN       76
##  77    5597340 Saint Petersburg Russia         RU       77
##  78    5578580 Guadalajara      Mexico         MX       78
##  79    5550490 Ankara           Turkey         TR       79
##  80    5465920 Fukuoka          Japan          JP       80
##  81    5391890 Melbourne        Australia      AU       81
##  82    5272360 Monterrey        Mexico         MX       82
##  83    5248790 Sydney           Australia      AU       83
##  84    5132170 Urumqi           China          CN       84
##  85    5128270 Changsha         China          CN       85
##  86    5063580 Cape Town        South Africa   ZA       86
##  87    5021600 Jiddah           Saudi Arabia   SA       87
##  88    4990930 Brasilia         Brazil         BR       88
##  89    4955680 Kunming          China          CN       89
##  90    4891020 Changchun        China          CN       90
##  91    4877020 Kabul            Afghanistan    AF       91
##  92    4854260 Yaounde          Cameroon       CM       92
##  93    4830170 Hefei            China          CN       93
##  94    4770300 Ningbo           China          CN       94
##  95    4737590 Shantou          China          CN       95
##  96    4645320 Kano             Nigeria        NG       96
##  97    4568530 Tel Aviv         Israel         IL       97
##  98    4563850 New Taipei       Taiwan         TW       98
##  99    4534990 Shijiazhuang     China          CN       99
## 100    4411110 Jaipur           India          IN      100
## # ℹ 722 more rows

2.6.2 On graph

library(plotly)
library(dplyr)

# Constants that are assigned here but not used by the plot below.
intercept <- 1000
slope <- -1

# Line plot of population against rank, with the city name as point text.
fig <- plot_ly(
  bigs,
  x = ~rank,
  y = ~population,
  text = ~city,
  name = "Biggest cities of the world",
  type = "scatter",
  mode = "lines"
)

fig

2.7 Hamlet’s monologue

2.7.1 Shakespeare’s text

# The English text of Hamlet's monologue, stored as a single character string.
# The literal opens with a line break, so the stored value starts with "\n".
Hamlet <- "
To be, or not to be: that is the question:
Whether ’tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles,
And by opposing end them? To die: to sleep;
No more; and, by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to, ’tis a consummation
Devoutly to be wish’d. To die, to sleep;
To sleep: perchance to dream: ay, there’s the rub;
For in that sleep of death what dreams may come
When we have shuffled off this mortal coil,
Must give us pause. There’s the respect
That makes calamity of so long life;
For who would bear the whips and scorns of time,
The oppressor’s wrong, the proud man’s contumely,
The pangs of dispriz’d love, the law’s delay,
The insolence of office, and the spurns
That patient merit of the unworthy takes,
When he himself might his quietus make
With a bare bodkin? who would fardels bear,
To grunt and sweat under a weary life,
But that the dread of something after death,
The undiscover’d country from whose bourn
No traveller returns, puzzles the will,
And makes us rather bear those ills we have
Than fly to others that we know not of?
Thus conscience does make cowards of us all;
And thus the native hue of resolution
Is sicklied o’er with the pale cast of thought,
And enterprises of great pith and moment
With this regard their currents turn awry,
And lose the name of action. Soft you now!
The fair Ophelia! Nymph, in thy orisons
Be all my sins remember’d."
# Evaluating the bare name prints the stored string.
Hamlet

## [1] "\nTo be, or not to be: that is the question:\nWhether ’tis nobler in the mind to suffer\nThe slings and arrows of outrageous fortune,\nOr to take arms against a sea of troubles,\nAnd by opposing end them? To die: to sleep;\nNo more; and, by a sleep to say we end\nThe heart-ache and the thousand natural shocks\nThat flesh is heir to, ’tis a consummation\nDevoutly to be wish’d. To die, to sleep;\nTo sleep: perchance to dream: ay, there’s the rub;\nFor in that sleep of death what dreams may come\nWhen we have shuffled off this mortal coil,\nMust give us pause. There’s the respect\nThat makes calamity of so long life;\nFor who would bear the whips and scorns of time,\nThe oppressor’s wrong, the proud man’s contumely,\nThe pangs of dispriz’d love, the law’s delay,\nThe insolence of office, and the spurns\nThat patient merit of the unworthy takes,\nWhen he himself might his quietus make\nWith a bare bodkin? who would fardels bear,\nTo grunt and sweat under a weary life,\nBut that the dread of something after death,\nThe undiscover’d country from whose bourn\nNo traveller returns, puzzles the will,\nAnd makes us rather bear those ills we have\nThan fly to others that we know not of?\nThus conscience does make cowards of us all;\nAnd thus the native hue of resolution\nIs sicklied o’er with the pale cast of thought,\nAnd enterprises of great pith and moment\nWith this regard their currents turn awry,\nAnd lose the name of action. Soft you now!\nThe fair Ophelia! Nymph, in thy orisons\nBe all my sins remember’d."

2.7.2 Arany János fordítása

# The Hungarian text of the monologue (the section heading attributes the
# translation to János Arany), stored as a single character string.
Arany <- "Lenni vagy nem lenni: az itt a kérdés.
Akkor nemesb-e a lélek, ha tűri
Balsorsa minden nyűgét s nyilait;
Vagy ha kiszáll tenger fájdalma ellen,
S fegyvert ragadva véget vet neki?
Meghalni – elszunnyadni – semmi több;
S egy álom által elvégezni mind
A szív keservét, a test eredendő,
Természetes rázkódtatásait
Oly cél, minőt óhajthat a kegyes.
Meghalni – elszunnyadni – és alunni!
Talán álmodni: ez a bökkenő;
Mert hogy mi álmok jőnek a halálban,
Ha majd leráztuk mind e földi bajt,
Ez visszadöbbent. E meggondolás az,
Mi a nyomort oly hosszan élteti
Mert ki viselné a kor gúny-csapásit,
Zsarnok bosszúját, gőgös ember dölyfét,
Útált szerelme kínját, pör-halasztást,
A hivatalnak packázásait,
S mind a rugást, mellyel méltatlanok
Bántalmazzák a tűrő érdemet
Ha nyúgalomba küldhetné magát
Egy puszta tőrrel? – Ki hordaná e terheket,
Izzadva, nyögve élte fáradalmin,
Ha rettegésünk egy halál utáni
Valamitől – a nem ismert tartomány,
Melyből nem tér meg utazó – le nem
Lohasztja kedvünk, inkább tűrni a
Jelen gonoszt, mint ismeretlenek
Felé sietni? – Ekképp az öntudat
Belőlünk mind gyávát csinál,
S az elszántság természetes szinét
A gondolat halványra betegíti;
Ily kétkedés által sok nagyszerű,
Fontos merény kifordul medriből
S elveszti »tett« nevét. – De csöndesen!
A szép Ophelia jő. – Szép hölgy, imádba
Legyenek foglalva minden bűneim."
# Evaluating the bare name prints the stored string.
Arany

## [1] "Lenni vagy nem lenni: az itt a kérdés.\nAkkor nemesb-e a lélek, ha tűri\nBalsorsa minden nyűgét s nyilait;\nVagy ha kiszáll tenger fájdalma ellen,\nS fegyvert ragadva véget vet neki?\nMeghalni – elszunnyadni – semmi több;\nS egy álom által elvégezni mind\nA szív keservét, a test eredendő,\nTermészetes rázkódtatásait\nOly cél, minőt óhajthat a kegyes.\nMeghalni – elszunnyadni – és alunni!\nTalán álmodni: ez a bökkenő;\nMert hogy mi álmok jőnek a halálban,\nHa majd leráztuk mind e földi bajt,\nEz visszadöbbent. E meggondolás az,\nMi a nyomort oly hosszan élteti\nMert ki viselné a kor gúny-csapásit,\nZsarnok bosszúját, gőgös ember dölyfét,\nÚtált szerelme kínját, pör-halasztást,\nA hivatalnak packázásait,\nS mind a rugást, mellyel méltatlanok\nBántalmazzák a tűrő érdemet\nHa nyúgalomba küldhetné magát\nEgy puszta tőrrel? – Ki hordaná e terheket,\nIzzadva, nyögve élte fáradalmin,\nHa rettegésünk egy halál utáni\nValamitől – a nem ismert tartomány,\nMelyből nem tér meg utazó – le nem\nLohasztja kedvünk, inkább tűrni a\nJelen gonoszt, mint ismeretlenek\nFelé sietni? – Ekképp az öntudat\nBelőlünk mind gyávát csinál,\nS az elszántság természetes szinét\nA gondolat halványra betegíti;\nIly kétkedés által sok nagyszerű,\nFontos merény kifordul medriből\nS elveszti »tett« nevét. – De csöndesen!\nA szép Ophelia jő. – Szép hölgy, imádba\nLegyenek foglalva minden bűneim."

2.7.3 Key lexicographic facts to know about Hamlet (under construction)

3 Korpuszok (szótestek)

## Corpus consisting of 60 documents, showing 60 documents:
## 
##             Text Types Tokens Sentences Year  President       FirstName                 Party
##  1789-Washington   625   1537        23 1789 Washington          George                  none
##  1793-Washington    96    147         4 1793 Washington          George                  none
##       1797-Adams   826   2577        37 1797      Adams            John            Federalist
##   1801-Jefferson   717   1923        41 1801  Jefferson          Thomas Democratic-Republican
##   1805-Jefferson   804   2380        45 1805  Jefferson          Thomas Democratic-Republican
##     1809-Madison   535   1261        21 1809    Madison           James Democratic-Republican
##     1813-Madison   541   1302        33 1813    Madison           James Democratic-Republican
##      1817-Monroe  1040   3677       121 1817     Monroe           James Democratic-Republican
##      1821-Monroe  1259   4886       131 1821     Monroe           James Democratic-Republican
##       1825-Adams  1003   3147        74 1825      Adams     John Quincy Democratic-Republican
##     1829-Jackson   517   1208        25 1829    Jackson          Andrew            Democratic
##     1833-Jackson   499   1267        29 1833    Jackson          Andrew            Democratic
##    1837-VanBuren  1315   4158        95 1837  Van Buren          Martin            Democratic
##    1841-Harrison  1896   9125       210 1841   Harrison   William Henry                  Whig
##        1845-Polk  1334   5186       153 1845       Polk      James Knox                  Whig
##      1849-Taylor   496   1178        22 1849     Taylor         Zachary                  Whig
##      1853-Pierce  1165   3636       104 1853     Pierce        Franklin            Democratic
##    1857-Buchanan   945   3083        89 1857   Buchanan           James            Democratic
##     1861-Lincoln  1075   3999       135 1861    Lincoln         Abraham            Republican
##     1865-Lincoln   360    775        26 1865    Lincoln         Abraham            Republican
##       1869-Grant   485   1229        40 1869      Grant      Ulysses S.            Republican
##       1873-Grant   552   1472        43 1873      Grant      Ulysses S.            Republican
##       1877-Hayes   831   2707        59 1877      Hayes   Rutherford B.            Republican
##    1881-Garfield  1021   3209       111 1881   Garfield        James A.            Republican
##   1885-Cleveland   676   1816        44 1885  Cleveland          Grover            Democratic
##    1889-Harrison  1352   4721       157 1889   Harrison        Benjamin            Republican
##   1893-Cleveland   821   2125        58 1893  Cleveland          Grover            Democratic
##    1897-McKinley  1232   4353       130 1897   McKinley         William            Republican
##    1901-McKinley   854   2437       100 1901   McKinley         William            Republican
##   1905-Roosevelt   404   1079        33 1905  Roosevelt        Theodore            Republican
##        1909-Taft  1437   5821       158 1909       Taft  William Howard            Republican
##      1913-Wilson   658   1882        68 1913     Wilson         Woodrow            Democratic
##      1917-Wilson   549   1652        59 1917     Wilson         Woodrow            Democratic
##     1921-Harding  1169   3719       148 1921    Harding       Warren G.            Republican
##    1925-Coolidge  1220   4440       196 1925   Coolidge          Calvin            Republican
##      1929-Hoover  1090   3860       158 1929     Hoover         Herbert            Republican
##   1933-Roosevelt   743   2057        85 1933  Roosevelt     Franklin D.            Democratic
##   1937-Roosevelt   725   1989        96 1937  Roosevelt     Franklin D.            Democratic
##   1941-Roosevelt   526   1519        68 1941  Roosevelt     Franklin D.            Democratic
##   1945-Roosevelt   275    633        27 1945  Roosevelt     Franklin D.            Democratic
##      1949-Truman   781   2504       116 1949     Truman        Harry S.            Democratic
##  1953-Eisenhower   900   2743       119 1953 Eisenhower       Dwight D.            Republican
##  1957-Eisenhower   621   1907        92 1957 Eisenhower       Dwight D.            Republican
##     1961-Kennedy   566   1541        52 1961    Kennedy         John F.            Democratic
##     1965-Johnson   568   1710        93 1965    Johnson   Lyndon Baines            Democratic
##       1969-Nixon   743   2416       103 1969      Nixon Richard Milhous            Republican
##       1973-Nixon   544   1995        68 1973      Nixon Richard Milhous            Republican
##      1977-Carter   527   1370        52 1977     Carter           Jimmy            Democratic
##      1981-Reagan   902   2781       129 1981     Reagan          Ronald            Republican
##      1985-Reagan   925   2909       123 1985     Reagan          Ronald            Republican
##        1989-Bush   795   2674       141 1989       Bush          George            Republican
##     1993-Clinton   642   1833        81 1993    Clinton            Bill            Democratic
##     1997-Clinton   773   2436       111 1997    Clinton            Bill            Democratic
##        2001-Bush   621   1806        97 2001       Bush       George W.            Republican
##        2005-Bush   772   2312        99 2005       Bush       George W.            Republican
##       2009-Obama   938   2689       110 2009      Obama          Barack            Democratic
##       2013-Obama   814   2317        88 2013      Obama          Barack            Democratic
##       2017-Trump   582   1660        88 2017      Trump       Donald J.            Republican
##       2021-Biden   812   2766       216 2021      Biden       Joseph R.            Democratic
##       2025-Trump  1000   3347       177 2025      Trump       Donald J.            Republican

3.1 Inauguration speeches - Number of Words

# library() stops with an error if ggplot2 is missing, whereas require() only
# returns FALSE and lets the chunk fail later with a less clear message.
library(ggplot2)

# Number of tokens per inaugural speech over time: a line with points, plus a
# label per speech showing the president and filled by party.
korpa %>%
  summary() %>%
  ggplot(aes(x = Year, y = Tokens, group = 1)) +
  geom_line() +
  geom_point() +
  geom_label(
    aes(label = President, fill = Party),
    nudge_x = 0.1,
    nudge_y = 0.1
  ) +
  ggtitle("Inauguration Speeches of Presidents - Number of Words") +
  theme_bw() +
  # theme() is now closed before the scale is added. Previously the scale was
  # added to element_text() inside theme() instead of to the plot.
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5, face = "bold")) +
  # The palette is given as a "package::palette" string.
  # NOTE(review): the labels map `fill`, not `colour`, so this colour scale
  # presumably has no visible effect - confirm whether a fill scale was meant.
  scale_color_paletteer_d("nord::frost")

3.2 Inauguration speeches - T(ypes)/(T)okens (R)atio

# Each package is loaded once with library(). The original loaded quanteda
# and dplyr twice and used require() for ggplot2.
library(quanteda)
library(dplyr)
library(ggplot2)

# Build the data frame `mysummary` from the corpus summary and add the
# type/token ratio (TTR) as a new column.
data("data_corpus_inaugural")
korpa <- corpus(data_corpus_inaugural) # save the corpus under a short name
docvars_df <- docvars(data_corpus_inaugural)
speech_texts <- as.character(data_corpus_inaugural)
mysummary <- as.data.frame(summary(korpa, verbose = FALSE))
ms <- mysummary %>% mutate(TTR = Types / Tokens)

# Drop the second row of ms, i.e. Washington's second speech.
# NOTE(review): presumably excluded because it is so short that its TTR would
# dominate the plot - confirm.
ms1 <- ms[-2, ]

# TTR per speech over time, labelled by president and filled by party.
ms1 %>%
  ggplot(aes(x = Year, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  geom_label(aes(label = President, fill = Party)) +
  ggtitle("Inauguration Speeches of Presidents - T(ypes)/(T)okens (R)atio") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5, face = "bold"))

3.3 A basic aspect: sentence length vs. word length

library(plotly)
library(quanteda)
library(tokenizers)

# Document variables of the inaugural corpus plus the speech text itself.
korpa <- corpus(data_corpus_inaugural) # save the corpus under a short name
df <- docvars(data_corpus_inaugural)
df$text <- as.character(data_corpus_inaugural)

# Word tokens per speech: lower-cased, punctuation stripped, numbers kept.
words <- tokenize_words(
  df$text,
  lowercase = TRUE,
  stopwords = NULL,
  strip_punct = TRUE,
  strip_numeric = FALSE,
  simplify = FALSE
)

wc <- count_words(df$text)      # words per speech
st <- count_sentences(df$text)  # sentences per speech

# Characters inside the word tokens only. The previous version divided
# count_characters(df$text) by the word count; that total also contains
# spaces and punctuation, so it overstated the mean word length.
wl <- vapply(words, function(w) sum(nchar(w)), numeric(1), USE.NAMES = FALSE)

# x: mean sentence length in words; y: mean word length in characters.
# The plot reads President and Party from df itself, so the chunk no longer
# depends on `ms` from another chunk.
fig <- plot_ly(
  data = df, type = "scatter", mode = "markers",
  x = round(wc / st, 2), y = round(wl / lengths(words), 2),
  text = ~President,
  color = ~Party,
  colors = c(
    "red", "green", "blue", "tomato", "magenta", "seagreen", "salmon"
  )
) %>%
  layout(
    title = "Sentence length vs. word length",
    xaxis = list(title = "Sentence Length in Words"),
    yaxis = list(title = "Word Length in Characters")
  )

fig

3.4 Phrase dispersion in the subcorpus 2001-2025

library(quanteda)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)

# Take the inaugural corpus, keep only the speeches with Year > 2000, and
# tokenise them.
corp_us <- corpus(data_corpus_inaugural)
my_corpus <- corpus_subset(corp_us, Year > 2000)
toki <- tokens(my_corpus)

# Keyword-in-context listing for "country" with a window of 3 tokens.
kwic_results <- kwic(
  toki,
  pattern = "country",
  window = 3
)
kwic_results

## Keyword-in-context with 53 matches.                                                                              
##     [2001-Bush, 25]         common in our | country | . With a                
##    [2001-Bush, 312]          creed of our | country | , it is                 
##    [2001-Bush, 375]            of our own | country | . The ambitions         
##    [2001-Bush, 415]             but not a | country | . We do                 
##    [2001-Bush, 570]           , makes our | country | more, not               
##    [2001-Bush, 674]              . If our | country | does not lead           
##    [2001-Bush, 948]       liberty and our | country | should make no          
##   [2001-Bush, 1226]           Many in our | country | do not know             
##   [2001-Bush, 1758]           to make our | country | more just and           
##     [2005-Bush, 64]        that unite our | country | . I am                  
##    [2005-Bush, 988]          of your free | country | . The rulers            
##   [2005-Bush, 1170]          measure. Our | country | has accepted obligations
##   [2005-Bush, 1321]       devotion to our | country | in deaths that          
##   [2005-Bush, 1427]         wealth of our | country | but to its              
##   [2005-Bush, 1555]         future of our | country | , we will               
##   [2005-Bush, 1843]             . And our | country | must abandon all        
##   [2005-Bush, 1890]  questions before our | country | are many.               
##  [2009-Obama, 1003]   forgotten what this | country | has already done        
##  [2009-Obama, 2550]          city and the | country | , alarmed at            
##   [2013-Obama, 667]   understand that our | country | cannot succeed when     
##   [2013-Obama, 942]       that built this | country | and investing in        
##   [2013-Obama, 990]          that in this | country | freedom is reserved     
##  [2013-Obama, 1089]        that make this | country | great. We               
##  [2013-Obama, 1831]     expelled from our | country | . Our journey           
##  [2013-Obama, 2098]            to God and | country | , not party             
##    [2017-Trump, 47]        to rebuild our | country | and restore its         
##   [2017-Trump, 255]       citizens of our | country | . Their victories       
##   [2017-Trump, 354]             , is your | country | . What truly            
##   [2017-Trump, 405]          women of our | country | will be forgotten       
##   [2017-Trump, 573]        and robbed our | country | of so much              
##   [2017-Trump, 724]     confidence of our | country | has dissipated over     
##  [2017-Trump, 1016]      - rebuilding our | country | with American hands     
##  [2017-Trump, 1154]        loyalty to our | country | , we will               
##  [2017-Trump, 1369]             fail. Our | country | will thrive and         
##   [2021-Biden, 448]   silently stalks the | country | . It's taken            
##  [2021-Biden, 1870]              way, our | country | will be stronger        
##  [2021-Biden, 2175]           and for our | country | . Amen.                 
##  [2021-Biden, 2743]           and to this | country | we love with            
##    [2025-Trump, 80]          forward, our | country | will flourish and       
##   [2025-Trump, 237]       is sweeping the | country | , sunlight is           
##   [2025-Trump, 405] illegally entered our | country | from all over           
##   [2025-Trump, 443]           people. Our | country | can no longer           
##   [2025-Trump, 539]    individuals in our | country | — some of               
##   [2025-Trump, 606]           it than any | country | anywhere in the         
##   [2025-Trump, 635]           to hate our | country | despite the love        
##   [2025-Trump, 977]        history of our | country | . As our                
##  [2025-Trump, 1216]        not forget our | country | , we will               
##  [2025-Trump, 1353]       invasion of our | country | . Under the             
##  [2025-Trump, 1442]         to defend our | country | from threats and        
##  [2025-Trump, 1571]            gas of any | country | on earth —              
##  [2025-Trump, 2293]     McKinley made our | country | very rich through       
##  [2025-Trump, 2335]          given to the | country | of Panama after         
##  [2025-Trump, 2898]                ., our | country | was forged and

Phrase dispersion

3.5 A sophisticated measure for corpora: TF-IDF

TF-IDF Top Keywords of Presidents

Az R ereje a lexikográfiában

WR

2025-04-30