This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

This document includes both content as well as the output of any embedded R code chunks within the document.

1 Gyakoriságok

# Packages: readxl reads the workbook, tm provides the corpus classes.
library(readxl)
library(tm)

# Abbreviations of the four gospels.
gospels <- c("Mt", "Mk", "Lk", "Jn")

# Load the Luke workbook; only its Lemma column is used for the corpus.
evm <- read_xlsx("Lk_short.xlsx")

# Build a tm corpus from the vector of lemmas.
word.corpus <- Corpus(VectorSource(evm[["Lemma"]]))
# To check the result interactively, run: inspect(word.corpus)

1.1 Gyakoriságok - táblázatok

## # A tibble: 20 × 2
##    Lemma       n
##    <chr>   <int>
##  1 ὁ        2646
##  2 καί      1469
##  3 αὐτός    1086
##  4 δέ        542
##  5 λέγω      533
##  6 σύ        446
##  7 ἐν        361
##  8 εἰμί      360
##  9 ἐγώ       282
## 10 οὗτος     229
## 11 εἰς       226
## 12 ὅς        190
## 13 ὅτι       174
## 14 οὐ        172
## 15 πρός      166
## 16 ἐπί       161
## 17 πᾶς       158
## 18 μή        140
## 19 γίνομαι   131
## 20 ἀπό       125

1.2 Gyakoriságok - diagramok

library(plotly)

# Turn Lemma into a factor whose levels run from the most frequent lemma to
# the least frequent one, so the bars are drawn in descending order of n.
top20$Lemma <- factor(
  top20$Lemma,
  levels = with(top20, Lemma[order(n, decreasing = TRUE)])
)

# Bar chart of the counts; the title and axis labels are set in the same
# pipeline instead of a separate layout step.
fig <- plot_ly(
  data = top20,
  x = ~Lemma,
  y = ~n,
  type = "bar",
  marker = list(color = "sandybrown")
) %>%
  layout(
    title = "Bar Chart Sorted by Value",
    xaxis = list(title = "Lemma"),
    yaxis = list(title = "Előfordulás")
  )

# Display the chart
fig

Ez az ábra interaktív; ha ráhúzzuk az egeret, az aktuális lemmát (szótőt) és annak gyakoriságát mutatja.

1.3 Gyakoriságok - szófelhők

#word.counts<-as.matrix(TermDocumentMatrix(word.corpus))
#word.freq<-sort(rowSums(word.counts), decreasing=TRUE)
#Load libraries for wordclouds
library(SnowballC)
library(tm)
library(wordcloud2)
library(RColorBrewer)

# Frequency table of the word forms found in the first 188 rows of evm.
# NOTE(review): the 188-row cut-off is hard-coded; confirm which passage it
# is meant to delimit.
greek_words <- evm[["FullWord"]][seq_len(188)]
word_freqs <- as.data.frame(table(greek_words))

# Drop every form that appears in the Perseus stop-word list for "grc".
perzsa <- stopwords::stopwords(language = "grc", source = "perseus")
word_freqs_filtered <- filter(word_freqs, !(greek_words %in% perzsa))

# Fix the RNG seed, then build and display the word cloud.
set.seed(32)
wc <- wordcloud2(
  word_freqs_filtered,
  size = 1,
  gridSize = 8,
  color = "random-dark",
  backgroundColor = "white"
)
wc

1.4 Gyakoriságok - bigram hálózatok

Image file (greek_network_Jn_prologue.png)

2 Zipf-szabály

2.1 A Zipf-szabály táblázatban

#Install the zipfR package
#install.packages("zipfR")

#Load the package
library(zipfR)

#Load necessary libraries
library(ggplot2)

# Parameters of the Zipf distribution
N <- 100   # number of ranks
s <- 1.5   # exponent (shape parameter)

# The unnormalised weight of rank k is 1 / k^s; dividing by the total of all
# weights turns them into probabilities that sum to 1.
rank_weights <- 1 / seq_len(N)^s
zipf_probs <- rank_weights / sum(rank_weights)
zipf_data <- data.frame(Rank = seq_len(N), Probability = zipf_probs)

# Show the 20 top-ranked entries
head(zipf_data, n = 20)
##    Rank Probability
## 1     1 0.414443506
## 2     2 0.146527907
## 3     3 0.079759690
## 4     4 0.051805438
## 5     5 0.037068954
## 6     6 0.028199309
## 7     7 0.022377846
## 8     8 0.018315988
## 9     9 0.015349759
## 10   10 0.013105854
## 11   11 0.011359947
## 12   12 0.009969961
## 13   13 0.008841996
## 14   14 0.007911763
## 15   15 0.007133924
## 16   16 0.006475680
## 17   17 0.005912783
## 18   18 0.005426960
## 19   19 0.005004203
## 20   20 0.004633619

2.2 A Zipf-szabály mint hatványfüggvény

# Zipf probabilities against rank on linear axes.
# The line width is given with `linewidth`: using `size` for lines is
# deprecated since ggplot2 3.4.0 and triggers a warning.
ggplot(zipf_data, aes(x = Rank, y = Probability)) +
  geom_line(color = "brown", linewidth = 0.75) +
  labs(
    title = "Basic Zipf Distribution",
    x = "Rank",
    y = "Probability"
  ) +
  theme_minimal()

2.3 A Zipf-szabály log-log skálán

# Zipf probabilities against rank with both axes on a log10 scale.
# The line width is given with `linewidth`: using `size` for lines is
# deprecated since ggplot2 3.4.0 and triggers a warning.
ggplot(zipf_data, aes(x = Rank, y = Probability)) +
  geom_line(color = "brown", linewidth = 0.75) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Log/log Scale Zipf Distribution",
    x = "Rank",
    y = "Probability"
  ) +
  theme_minimal()

2.4 A Zipf-szabály és Harry Potter

Harry Potter books
Harry Potter books

2.5 A Zipf-szabály és az evangéliumok

2.5.1 Táblázatban

# Packages: readxl for the workbooks, tidyverse for count()
library(readxl)
library(tidyverse)

# One workbook per gospel, keyed by the gospel's abbreviation
gospel_files <- c(
  Mt = "Mt_short.xlsx",
  Mk = "Mk_short.xlsx",
  Lk = "Lk_short.xlsx",
  Jn = "Jn_short.xlsx"
)

# Read every workbook, then count the word forms (FullWord) in each one,
# sorted from the most frequent form downwards
gospel_tokens <- lapply(gospel_files, read_xlsx)
gospel_freqs <- lapply(gospel_tokens, function(tokens) {
  count(tokens, FullWord, sort = TRUE)
})

# Per gospel: the full frequency table, its first 50 rows, and the sum of
# all counts
freqtab1 <- gospel_freqs$Mt
top50Mt <- freqtab1[1:50, ]
Mt_total <- sum(freqtab1$n)

freqtab2 <- gospel_freqs$Mk
top50Mk <- freqtab2[1:50, ]
Mk_total <- sum(freqtab2$n)

freqtab3 <- gospel_freqs$Lk
top50Lk <- freqtab3[1:50, ]
Lk_total <- sum(freqtab3$n)

freqtab4 <- gospel_freqs$Jn
top50Jn <- freqtab4[1:50, ]
Jn_total <- sum(freqtab4$n)

# evm holds the John workbook once this chunk has run
evm <- gospel_tokens$Jn

# Put the four top-50 tables side by side, one word/count column pair per gospel
evmtab50 <- cbind(top50Mt, top50Mk, top50Lk, top50Jn)
names(evmtab50) <- c(
  "Szó(Mt)", "n",
  "Szó(Mk)", "n",
  "Szó(Lk)", "n",
  "Szó(Jn)", "n"
)
evmtab50
##    Szó(Mt)    n Szó(Mk)    n Szó(Lk)    n  Szó(Jn)   n
## 1      καὶ 1175     καὶ 1085     καὶ 1466      καὶ 827
## 2        ὁ  493       ὁ  237      δὲ  513        ὁ 565
## 3       δὲ  471   αὐτοῦ  173       ὁ  399      ὅτι 271
## 4      τοῦ  294     εἰς  168     τοῦ  380      τοῦ 243
## 5       ἐν  293      δὲ  155      ἐν  360      τὸν 240
## 6    αὐτοῦ  266     τὸν  150   αὐτοῦ  255       ἐν 226
## 7       τὸ  227      ἐν  135   εἶπεν  229       δὲ 203
## 8       οἱ  224     τοῦ  132     εἰς  225      οὖν 200
## 9      τὸν  221      τὸ  131      τὸ  222   Ἰησοῦς 198
## 10     εἰς  218     τὴν  126     τὸν  216      εἰς 187
## 11     τῶν  206      οἱ  123      οἱ  185    αὐτοῦ 173
## 12     τὴν  203    αὐτῷ  121      τῷ  177     αὐτῷ 173
## 13    αὐτῷ  170  αὐτοῖς  120     ὅτι  174      οὐκ 151
## 14      τῷ  149   αὐτὸν  117     τὴν  171       τὸ 150
## 15     ὅτι  140     τῶν  108    πρὸς  161      ἵνα 145
## 16      μὴ  123     ὅτι  102    αὐτῷ  153       οἱ 144
## 17     τῆς  121     τῆς   80   αὐτὸν  145      τὴν 142
## 18       ἡ  121      τῷ   77      τῇ  136       ἐκ 139
## 19   εἶπεν  119      μὴ   72      μὴ  132    λέγει 123
## 20  Ἰησοῦς  111     οὐκ   66     τῶν  131        ἡ 122
## 21      τὰ  110     ἵνα   64     τῆς  119       τῷ 114
## 22     γὰρ  108    πρὸς   63    τοὺς  118    εἶπεν 112
## 23    τοὺς  108    τοὺς   63     ἐπὶ  116      τῶν 109
## 24    τοῖς  108   λέγει   62     σου  104       οὐ 108
## 25    ὑμῖν  107   αὐτόν   61      τὰ  104    ἐστιν 107
## 26  αὐτοῖς  103     γὰρ   60       ἡ  102       μὴ 106
## 27      τῇ  103       ἡ   60     οὐκ   99      ἐγὼ 103
## 28   αὐτῶν  100   εἶπεν   59   αὐτῶν   98     ὑμῖν 103
## 29     ἐπὶ   99      τῇ   59    ὑμῖν   96   αὐτοῖς 100
## 30     οὐκ   98  Ἰησοῦς   58  αὐτοῖς   91    αὐτὸν 100
## 31     σου   98    τοῖς   56     γὰρ   87       με  99
## 32   αὐτὸν   94      τί   54     μου   87      μου  98
## 33      οὐ   92      τὰ   52     ἀπὸ   83     πρὸς  97
## 34     ἀπὸ   92     ἐπὶ   52      ἦν   75       ἦν  96
## 35    τότε   90   ἐστιν   52    τοῖς   74      τῆς  82
## 36     μου   83      οὐ   45    θεοῦ   72       τὰ  80
## 37   ἐστιν   83   αὐτῶν   42 ἐγένετο   69    αὐτόν  76
## 38    ὑμῶν   76      ἐκ   42   ἐστιν   69       τῇ  72
## 39       ἢ   65   εὐθὺς   41    ὑμῶν   67    ὑμεῖς  68
## 40      τί   64     μου   40   αὐτόν   66     περὶ  67
## 41    λέγω   61     σου   39      οὐ   66    ταῦτα  61
## 42    ἰδοὺ   60      ἦν   38      τί   62      γὰρ  60
## 43     ἐὰν   58    ὑμῖν   37    ἰδοὺ   57 ἀπεκρίθη  57
## 44     οὖν   56    μετὰ   36  Ἰησοῦς   55     τοὺς  55
## 45   λέγει   54     ἀπὸ   36      ὡς   51     ἀλλὰ  52
## 46     διὰ   53       ἢ   33      εἰ   50    τοῦτο  51
## 47      εἰ   53    θεοῦ   31    λέγω   50     ἀλλ’  50
## 48      ἐκ   52     τὰς   31      ἐκ   50     ἀμὴν  50
## 49   λέγων   49  ἔλεγεν   31  αὐτούς   47       εἰ  49
## 50     ἕως   49     διὰ   30   λέγων   47       τί  48

2.5.2 Grafikonon

#install.packages("plotly")
library(plotly)

# Counts of the 50 most frequent word forms of each gospel, indexed by rank
# (y1 = Mt, y2 = Mk, y3 = Lk, y4 = Jn).
datus <- data.frame(
  Roll_number = 1:50,
  y1 = top50Mt$n,
  y2 = top50Mk$n,
  y3 = top50Lk$n,
  y4 = top50Jn$n
)

# One line per gospel, added in the Mt/Mk/Lk/Jn order used throughout this
# document so that the legend reads in the same order. The traces added with
# add_trace() inherit type and mode from the plot_ly() call.
fig <- plotly::plot_ly(
  data = datus, x = ~Roll_number,
  y = ~y1, name = "Mt",
  type = "scatter", mode = "lines"
) %>%
  add_trace(y = ~y2, name = "Mk") %>%
  add_trace(y = ~y3, name = "Lk") %>%
  add_trace(y = ~y4, name = "Jn") %>%
  layout(
    title = "Zipf's law and the gospels",
    xaxis = list(title = "Helyezés"),
    yaxis = list(title = "Előfordulás"),
    # A real legend heading replaces the template placeholder text.
    legend = list(title = list(text = "Evangélium"))
  )

fig

2.6 The largest cities in the world

2.6.1 In datatable

library(DT)
library(readxl)

# Locate the workbook of the world's largest cities. A copy in the working
# directory is preferred, which matches how the other workbooks in this
# document are read; otherwise fall back to the original absolute location.
bigs_path <- "bigcities5col.xlsx"
if (!file.exists(bigs_path)) {
  bigs_path <- "C:/users/weltl/OneDrive/Dokumentumok/bigcities5col.xlsx"
}
bigs <- read_xlsx(bigs_path)
#dtbigs <- datatable(bigs)

# Print the first 100 rows of the table
print(bigs, n = 100)
## # A tibble: 822 × 5
##     population growthRate city             country         rank
##          <dbl> <chr>      <chr>            <chr>          <dbl>
##   1   37036200 -0.00212   Tokyo            Japan              1
##   2   34665600 0.02538    Delhi            India              2
##   3   30482100 0.02056    Shanghai         China              3
##   4   24652900 0.02996    Dhaka            Bangladesh         4
##   5   23074200 0.0199     Cairo            Egypt              5
##   6   22990000 0.00804    Sao Paulo        Brazil             6
##   7   22752400 0.01098    Mexico City      Mexico             7
##   8   22596500 0.01836    Beijing          China              8
##   9   22089000 0.01919    Mumbai           India              9
##  10   18921600 -0.00242   Osaka            Japan             10
##  11   18171200 0.02235    Chongqing        China             11
##  12   18076800 0.02426    Karachi          Pakistan          12
##  13   17778500 0.04381    Kinshasa         DR Congo          13
##  14   17156400 0.03752    Lagos            Nigeria           14
##  15   16236700 0.0118     Istanbul         Turkey            15
##  16   15845200 0.01762    Kolkata          India             16
##  17   15752300 0.00858    Buenos Aires     Argentina         17
##  18   15230600 0.01931    Manila           Philippines       18
##  19   14878700 0.01978    Guangzhou        China             19
##  20   14825800 0.02906    Lahore           Pakistan          20
##  21   14704100 0.01612    Tianjin          China             21
##  22   14395400 0.02763    Bangalore        India             22
##  23   13923200 0.00715    Rio de Janeiro   Brazil            23
##  24   13545400 0.01754    Shenzhen         China             24
##  25   12737400 0.00197    Moscow           Russia            25
##  26   12336000 0.02342    Chennai          India             26
##  27   11795800 0.0118     Bogota           Colombia          27
##  28   11634100 0.01732    Jakarta          Indonesia         28
##  29   11517300 0.01368    Lima             Peru              29
##  30   11391700 0.01405    Bangkok          Thailand          30
##  31   11346800 0.00622    Paris            France            31
##  32   11337900 0.0243     Hyderabad        India             32
##  33   10174900 0.02285    Nanjing          China             33
##  34   10027900 0.03905    Luanda           Angola            34
##  35   10025800 0.0021     Seoul            South Korea       35
##  36    9998870 0.01737    Chengdu          China             36
##  37    9840740 0.00951    London           United Kingdom    37
##  38    9816320 0.02599    Ho Chi Minh City Vietnam           38
##  39    9729740 0.01183    Tehran           Iran              39
##  40    9534790 -0.00231   Nagoya           Japan             40
##  41    9222080 0.0231     Xi-an            China             41
##  42    9061820 0.02342    Ahmedabad        India             42
##  43    9000280 0.02095    Kuala Lumpur     Malaysia          43
##  44    8986480 0.01532    Wuhan            China             44
##  45    8592820 0.029      Suzhou           China             45
##  46    8591040 0.02033    Hangzhou         China             46
##  47    8581730 0.03015    Surat            India             47
##  48    8561520 0.04905    Dar es Salaam    Tanzania          48
##  49    8141120 0.02777    Baghdad          Iraq              49
##  50    7974270 0.01838    Shenyang         China             50
##  51    7952860 0.01692    Riyadh           Saudi Arabia      51
##  52    7936530 -0.0198    New York City    United States     52
##  53    7817160 0.01456    Foshan           China             53
##  54    7772860 0.01273    Dongguan         China             54
##  55    7768510 0.00552    Hong Kong        Hong Kong         55
##  56    7525720 0.02449    Pune             India             56
##  57    7066860 0.01857    Haerbin          China             57
##  58    6999460 0.00698    Santiago         Chile             58
##  59    6810530 0.00402    Madrid           Spain             59
##  60    6754180 0.03242    Khartoum         Sudan             60
##  61    6491290 0.00931    Toronto          Canada            61
##  62    6444580 0.01901    Johannesburg     South Africa      62
##  63    6351680 0.00814    Belo Horizonte   Brazil            63
##  64    6347380 0.02089    Dalian           China             64
##  65    6217970 0.01857    Qingdao          China             65
##  66    6157270 0.00622    Singapore        Singapore         66
##  67    6156140 0.02348    Zhengzhou        China             67
##  68    6065850 0.02107    Ji nan Shandong  China             68
##  69    6056880 0.03242    Abidjan          Ivory Coast       69
##  70    5956680 0.04437    Addis Ababa      Ethiopia          70
##  71    5813190 0.01813    Yangon           Myanmar           71
##  72    5807050 0.01947    Alexandria       Egypt             72
##  73    5766990 0.04075    Nairobi          Kenya             73
##  74    5733250 0.00373    Barcelona        Spain             74
##  75    5653490 0.02537    Chittagong       Bangladesh        75
##  76    5602200 0.03137    Hanoi            Vietnam           76
##  77    5597340 0.0028     Saint Petersburg Russia            77
##  78    5578580 0.01435    Guadalajara      Mexico            78
##  79    5550490 0.0134     Ankara           Turkey            79
##  80    5465920 -0.00222   Fukuoka          Japan             80
##  81    5391890 0.01435    Melbourne        Australia         81
##  82    5272360 0.01482    Monterrey        Mexico            82
##  83    5248790 0.01232    Sydney           Australia         83
##  84    5132170 0.02521    Urumqi           China             84
##  85    5128270 0.01995    Changsha         China             85
##  86    5063580 0.01723    Cape Town        South Africa      86
##  87    5021600 0.01586    Jiddah           Saudi Arabia      87
##  88    4990930 0.01128    Brasilia         Brazil            88
##  89    4955680 0.01946    Kunming          China             89
##  90    4891020 0.01844    Changchun        China             90
##  91    4877020 0.03144    Kabul            Afghanistan       91
##  92    4854260 0.03684    Yaounde          Cameroon          92
##  93    4830170 0.02176    Hefei            China             93
##  94    4770300 0.02371    Ningbo           China             94
##  95    4737590 0.01741    Shantou          China             95
##  96    4645320 0.03442    Kano             Nigeria           96
##  97    4568530 0.01619    Tel Aviv         Israel            97
##  98    4563850 0.00639    New Taipei       Taiwan            98
##  99    4534990 0.01815    Shijiazhuang     China             99
## 100    4411110 0.02381    Jaipur           India            100
## # ℹ 722 more rows

2.6.2 On graph

library(plotly)
library(dplyr)

# NOTE(review): slope and intercept are assigned here but not used by the
# plot below; confirm whether a reference line was intended.
slope <- -1
intercept <- 1000

# Line chart of population against rank, with the city name attached to each
# point as its text.
fig <- plot_ly(
  bigs,
  x = ~rank,
  y = ~population,
  text = ~city,
  name = "Biggest cities of the world",
  type = "scatter",
  mode = "lines"
)

fig

2.7 Hamlet’s monologue

2.7.1 Shakespeare’s text

# The "To be, or not to be" soliloquy stored as one character string, with
# one verse per text line. The literal opens with a line break, so the string
# itself starts with "\n".
Hamlet <- "
To be, or not to be: that is the question:
Whether ’tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles,
And by opposing end them? To die: to sleep;
No more; and, by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to, ’tis a consummation
Devoutly to be wish’d. To die, to sleep;
To sleep: perchance to dream: ay, there’s the rub;
For in that sleep of death what dreams may come
When we have shuffled off this mortal coil,
Must give us pause. There’s the respect
That makes calamity of so long life;
For who would bear the whips and scorns of time,
The oppressor’s wrong, the proud man’s contumely,
The pangs of dispriz’d love, the law’s delay,
The insolence of office, and the spurns
That patient merit of the unworthy takes,
When he himself might his quietus make
With a bare bodkin? who would fardels bear,
To grunt and sweat under a weary life,
But that the dread of something after death,
The undiscover’d country from whose bourn
No traveller returns, puzzles the will,
And makes us rather bear those ills we have
Than fly to others that we know not of?
Thus conscience does make cowards of us all;
And thus the native hue of resolution
Is sicklied o’er with the pale cast of thought,
And enterprises of great pith and moment
With this regard their currents turn awry,
And lose the name of action. Soft you now!
The fair Ophelia! Nymph, in thy orisons
Be all my sins remember’d."
# Print the stored string (line breaks are shown as \n)
Hamlet
## [1] "\nTo be, or not to be: that is the question:\nWhether ’tis nobler in the mind to suffer\nThe slings and arrows of outrageous fortune,\nOr to take arms against a sea of troubles,\nAnd by opposing end them? To die: to sleep;\nNo more; and, by a sleep to say we end\nThe heart-ache and the thousand natural shocks\nThat flesh is heir to, ’tis a consummation\nDevoutly to be wish’d. To die, to sleep;\nTo sleep: perchance to dream: ay, there’s the rub;\nFor in that sleep of death what dreams may come\nWhen we have shuffled off this mortal coil,\nMust give us pause. There’s the respect\nThat makes calamity of so long life;\nFor who would bear the whips and scorns of time,\nThe oppressor’s wrong, the proud man’s contumely,\nThe pangs of dispriz’d love, the law’s delay,\nThe insolence of office, and the spurns\nThat patient merit of the unworthy takes,\nWhen he himself might his quietus make\nWith a bare bodkin? who would fardels bear,\nTo grunt and sweat under a weary life,\nBut that the dread of something after death,\nThe undiscover’d country from whose bourn\nNo traveller returns, puzzles the will,\nAnd makes us rather bear those ills we have\nThan fly to others that we know not of?\nThus conscience does make cowards of us all;\nAnd thus the native hue of resolution\nIs sicklied o’er with the pale cast of thought,\nAnd enterprises of great pith and moment\nWith this regard their currents turn awry,\nAnd lose the name of action. Soft you now!\nThe fair Ophelia! Nymph, in thy orisons\nBe all my sins remember’d."

2.7.2 Arany János fordítása

# The Hungarian text of the soliloquy (this section presents it as János
# Arany's translation), stored as one character string with one verse per
# text line.
Arany <- "Lenni vagy nem lenni: az itt a kérdés.
Akkor nemesb-e a lélek, ha tűri
Balsorsa minden nyűgét s nyilait;
Vagy ha kiszáll tenger fájdalma ellen,
S fegyvert ragadva véget vet neki?
Meghalni – elszunnyadni – semmi több;
S egy álom által elvégezni mind
A szív keservét, a test eredendő,
Természetes rázkódtatásait
Oly cél, minőt óhajthat a kegyes.
Meghalni – elszunnyadni – és alunni!
Talán álmodni: ez a bökkenő;
Mert hogy mi álmok jőnek a halálban,
Ha majd leráztuk mind e földi bajt,
Ez visszadöbbent. E meggondolás az,
Mi a nyomort oly hosszan élteti
Mert ki viselné a kor gúny-csapásit,
Zsarnok bosszúját, gőgös ember dölyfét,
Útált szerelme kínját, pör-halasztást,
A hivatalnak packázásait,
S mind a rugást, mellyel méltatlanok
Bántalmazzák a tűrő érdemet
Ha nyúgalomba küldhetné magát
Egy puszta tőrrel? – Ki hordaná e terheket,
Izzadva, nyögve élte fáradalmin,
Ha rettegésünk egy halál utáni
Valamitől – a nem ismert tartomány,
Melyből nem tér meg utazó – le nem
Lohasztja kedvünk, inkább tűrni a
Jelen gonoszt, mint ismeretlenek
Felé sietni? – Ekképp az öntudat
Belőlünk mind gyávát csinál,
S az elszántság természetes szinét
A gondolat halványra betegíti;
Ily kétkedés által sok nagyszerű,
Fontos merény kifordul medriből
S elveszti »tett« nevét. – De csöndesen!
A szép Ophelia jő. – Szép hölgy, imádba
Legyenek foglalva minden bűneim."
# Print the stored string (line breaks are shown as \n)
Arany
## [1] "Lenni vagy nem lenni: az itt a kérdés.\nAkkor nemesb-e a lélek, ha tűri\nBalsorsa minden nyűgét s nyilait;\nVagy ha kiszáll tenger fájdalma ellen,\nS fegyvert ragadva véget vet neki?\nMeghalni – elszunnyadni – semmi több;\nS egy álom által elvégezni mind\nA szív keservét, a test eredendő,\nTermészetes rázkódtatásait\nOly cél, minőt óhajthat a kegyes.\nMeghalni – elszunnyadni – és alunni!\nTalán álmodni: ez a bökkenő;\nMert hogy mi álmok jőnek a halálban,\nHa majd leráztuk mind e földi bajt,\nEz visszadöbbent. E meggondolás az,\nMi a nyomort oly hosszan élteti\nMert ki viselné a kor gúny-csapásit,\nZsarnok bosszúját, gőgös ember dölyfét,\nÚtált szerelme kínját, pör-halasztást,\nA hivatalnak packázásait,\nS mind a rugást, mellyel méltatlanok\nBántalmazzák a tűrő érdemet\nHa nyúgalomba küldhetné magát\nEgy puszta tőrrel? – Ki hordaná e terheket,\nIzzadva, nyögve élte fáradalmin,\nHa rettegésünk egy halál utáni\nValamitől – a nem ismert tartomány,\nMelyből nem tér meg utazó – le nem\nLohasztja kedvünk, inkább tűrni a\nJelen gonoszt, mint ismeretlenek\nFelé sietni? – Ekképp az öntudat\nBelőlünk mind gyávát csinál,\nS az elszántság természetes szinét\nA gondolat halványra betegíti;\nIly kétkedés által sok nagyszerű,\nFontos merény kifordul medriből\nS elveszti »tett« nevét. – De csöndesen!\nA szép Ophelia jő. – Szép hölgy, imádba\nLegyenek foglalva minden bűneim."

2.7.3 Key lexicographic facts to know about Hamlet (under construction)

3 Korpuszok (szótestek)

## Corpus consisting of 60 documents, showing 60 documents:
## 
##             Text Types Tokens Sentences Year  President       FirstName                 Party
##  1789-Washington   625   1537        23 1789 Washington          George                  none
##  1793-Washington    96    147         4 1793 Washington          George                  none
##       1797-Adams   826   2577        37 1797      Adams            John            Federalist
##   1801-Jefferson   717   1923        41 1801  Jefferson          Thomas Democratic-Republican
##   1805-Jefferson   804   2380        45 1805  Jefferson          Thomas Democratic-Republican
##     1809-Madison   535   1261        21 1809    Madison           James Democratic-Republican
##     1813-Madison   541   1302        33 1813    Madison           James Democratic-Republican
##      1817-Monroe  1040   3677       121 1817     Monroe           James Democratic-Republican
##      1821-Monroe  1259   4886       131 1821     Monroe           James Democratic-Republican
##       1825-Adams  1003   3147        74 1825      Adams     John Quincy Democratic-Republican
##     1829-Jackson   517   1208        25 1829    Jackson          Andrew            Democratic
##     1833-Jackson   499   1267        29 1833    Jackson          Andrew            Democratic
##    1837-VanBuren  1315   4158        95 1837  Van Buren          Martin            Democratic
##    1841-Harrison  1896   9125       210 1841   Harrison   William Henry                  Whig
##        1845-Polk  1334   5186       153 1845       Polk      James Knox                  Whig
##      1849-Taylor   496   1178        22 1849     Taylor         Zachary                  Whig
##      1853-Pierce  1165   3636       104 1853     Pierce        Franklin            Democratic
##    1857-Buchanan   945   3083        89 1857   Buchanan           James            Democratic
##     1861-Lincoln  1075   3999       135 1861    Lincoln         Abraham            Republican
##     1865-Lincoln   360    775        26 1865    Lincoln         Abraham            Republican
##       1869-Grant   485   1229        40 1869      Grant      Ulysses S.            Republican
##       1873-Grant   552   1472        43 1873      Grant      Ulysses S.            Republican
##       1877-Hayes   831   2707        59 1877      Hayes   Rutherford B.            Republican
##    1881-Garfield  1021   3209       111 1881   Garfield        James A.            Republican
##   1885-Cleveland   676   1816        44 1885  Cleveland          Grover            Democratic
##    1889-Harrison  1352   4721       157 1889   Harrison        Benjamin            Republican
##   1893-Cleveland   821   2125        58 1893  Cleveland          Grover            Democratic
##    1897-McKinley  1232   4353       130 1897   McKinley         William            Republican
##    1901-McKinley   854   2437       100 1901   McKinley         William            Republican
##   1905-Roosevelt   404   1079        33 1905  Roosevelt        Theodore            Republican
##        1909-Taft  1437   5821       158 1909       Taft  William Howard            Republican
##      1913-Wilson   658   1882        68 1913     Wilson         Woodrow            Democratic
##      1917-Wilson   549   1652        59 1917     Wilson         Woodrow            Democratic
##     1921-Harding  1169   3719       148 1921    Harding       Warren G.            Republican
##    1925-Coolidge  1220   4440       196 1925   Coolidge          Calvin            Republican
##      1929-Hoover  1090   3860       158 1929     Hoover         Herbert            Republican
##   1933-Roosevelt   743   2057        85 1933  Roosevelt     Franklin D.            Democratic
##   1937-Roosevelt   725   1989        96 1937  Roosevelt     Franklin D.            Democratic
##   1941-Roosevelt   526   1519        68 1941  Roosevelt     Franklin D.            Democratic
##   1945-Roosevelt   275    633        27 1945  Roosevelt     Franklin D.            Democratic
##      1949-Truman   781   2504       116 1949     Truman        Harry S.            Democratic
##  1953-Eisenhower   900   2743       119 1953 Eisenhower       Dwight D.            Republican
##  1957-Eisenhower   621   1907        92 1957 Eisenhower       Dwight D.            Republican
##     1961-Kennedy   566   1541        52 1961    Kennedy         John F.            Democratic
##     1965-Johnson   568   1710        93 1965    Johnson   Lyndon Baines            Democratic
##       1969-Nixon   743   2416       103 1969      Nixon Richard Milhous            Republican
##       1973-Nixon   544   1995        68 1973      Nixon Richard Milhous            Republican
##      1977-Carter   527   1370        52 1977     Carter           Jimmy            Democratic
##      1981-Reagan   902   2781       129 1981     Reagan          Ronald            Republican
##      1985-Reagan   925   2909       123 1985     Reagan          Ronald            Republican
##        1989-Bush   795   2674       141 1989       Bush          George            Republican
##     1993-Clinton   642   1833        81 1993    Clinton            Bill            Democratic
##     1997-Clinton   773   2436       111 1997    Clinton            Bill            Democratic
##        2001-Bush   621   1806        97 2001       Bush       George W.            Republican
##        2005-Bush   772   2312        99 2005       Bush       George W.            Republican
##       2009-Obama   938   2689       110 2009      Obama          Barack            Democratic
##       2013-Obama   814   2317        88 2013      Obama          Barack            Democratic
##       2017-Trump   582   1660        88 2017      Trump       Donald J.            Republican
##       2021-Biden   812   2766       216 2021      Biden       Joseph R.            Democratic
##       2025-Trump  1000   3347       177 2025      Trump       Donald J.            Republican

3.1 Inauguration speeches - Number of Words

library(ggplot2)

# Token count of each inaugural speech by year: a line with points, each
# point labelled with the president's name and the label filled by party.
korpa %>%
  summary() %>%
  ggplot(aes(x = Year, y = Tokens, group = 1)) +
  geom_line() +
  geom_point() +
  geom_label(
    aes(label = President, fill = Party),
    nudge_x = 0.1, nudge_y = 0.1
  ) +
  ggtitle("Inauguration Speeches of Presidents - Number of Words") +
  theme_bw() +
  # theme() is closed before the scale is added; the scale used to sit inside
  # the plot.title argument, where it was added to element_text() instead of
  # to the plot.
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5, face = "bold")) +
  # paletteer expects the palette as a "package::palette" string.
  # NOTE(review): this scale targets the colour aesthetic, while the labels
  # map Party to fill; confirm which aesthetic the palette is meant for.
  paletteer::scale_color_paletteer_d("nord::frost")

3.2 Inauguration speeches - T(ypes)/(T)okens (R)atio

# Each package is attached once; library() stops with an error if a package
# is missing, whereas require() only returns FALSE.
library(quanteda)
library(dplyr)
library(ggplot2)
#summary(korpa)

# Build the data frame 'mysummary' from the corpus 'korpa' and add the
# type/token ratio (TTR) as a new column.
data("data_corpus_inaugural")
korpa <- corpus(data_corpus_inaugural) # save the `corpus` to a short obj name
docvars_df <- docvars(data_corpus_inaugural)
#docvars_df
#str(docvars_df)
speech_texts <- as.character(data_corpus_inaugural)
#str(speech_texts)
mysummary <- as.data.frame(summary(korpa, verbose = FALSE))
#mysummary
ms <- mysummary %>% mutate(TTR = Types / Tokens)

# Drop the second row of ms, i.e. Washington's second speech.
# NOTE(review): presumably excluded because the speech is very short, which
# makes its TTR an outlier; confirm.
ms1 <- ms[-2, ]

# TTR of each remaining speech by year, labelled with the president's name
# and filled by party.
#  options(repr.plot.width = 12, repr.plot.height = 12)
ms1 %>%
  ggplot(aes(x = Year, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  geom_label(aes(label = President, fill = Party)) +
  ggtitle("Inauguration Speeches of Presidents - T(ypes)/(T)okens (R)atio") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, vjust = 0.5, face = "bold"))

3.3 A basic aspect: sentence length vs. word length

library(plotly)
library(quanteda)
library(tokenizers)

# Document variables of the inaugural corpus plus the speech text, one row
# per speech.
korpa <- corpus(data_corpus_inaugural) # save the `corpus` to a short obj name
df <- docvars(data_corpus_inaugural)
df$text <- as.character(data_corpus_inaugural)

# Split each speech into lower-cased word tokens with punctuation removed.
words <- tokenize_words(
  df$text,
  lowercase = TRUE,
  stopwords = NULL,
  strip_punct = TRUE,
  strip_numeric = FALSE,
  simplify = FALSE
)

wc <- count_words(df$text)      # words per speech
st <- count_sentences(df$text)  # sentences per speech
# Characters per speech, counted over the word tokens only. Counting the
# characters of the raw text would also include spaces and punctuation and
# so overstate the mean word length.
wl <- vapply(words, function(w) sum(nchar(w)), integer(1))

# x: mean sentence length in words; y: mean word length in characters.
# President and Party are taken from df, so this chunk does not depend on
# objects created in other chunks.
fig <- plot_ly(
  data = df, type = "scatter", mode = "markers",
  x = round(wc / st, 2), y = round(wl / wc, 2),
  text = ~President,
  color = ~Party,
  colors = c(
    "red", "green", "blue", "tomato", "magenta", "seagreen", "salmon"
  )
) %>%
  layout(
    title = "Sentence length vs. word length",
    xaxis = list(title = "Sentence Length in Words"),
    yaxis = list(title = "Word Length in Characters")
  )

fig

3.4 Phrase dispersion in the subcorpus 2001-2025

library(quanteda)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)

# Inaugural corpus restricted to the speeches whose Year is greater than 2000
corp_us <- corpus(data_corpus_inaugural)
my_corpus <- corpus_subset(corp_us, Year > 2000)

# Tokenise the subcorpus, then list every occurrence of the two-word phrase
# "our country" together with its surrounding context
toki <- tokens(my_corpus)
kwic_results <- kwic(toki, pattern = phrase("our country"))
kwic_results
## Keyword-in-context with 37 matches.                                                                                                               
##       [2001-Bush, 24:25]               history, yet common in | our country | . With a simple oath             
##     [2001-Bush, 311:312]               more than the creed of | our country | , it is the inborn               
##     [2001-Bush, 569:570]        embracing these ideals, makes | our country | more, not less,                  
##     [2001-Bush, 673:674]                  are never small. If | our country | does not lead the cause          
##     [2001-Bush, 947:948]           The enemies of liberty and | our country | should make no mistake:          
##   [2001-Bush, 1225:1226]                    our laws. Many in | our country | do not know the pain             
##   [2001-Bush, 1757:1758]               purpose today, to make | our country | more just and generous,          
##       [2005-Bush, 63:64]      the deep commitments that unite | our country | . I am grateful for              
##   [2005-Bush, 1169:1170]             granted in good measure. | Our country | has accepted obligations that are
##   [2005-Bush, 1320:1321]         have shown their devotion to | our country | in deaths that honored their     
##   [2005-Bush, 1426:1427]                just to the wealth of | our country | but to its character.            
##   [2005-Bush, 1554:1555]            the promise and future of | our country | , we will bring the              
##   [2005-Bush, 1842:1843]             unwanted have worth. And | our country | must abandon all the habits      
##   [2005-Bush, 1889:1890]      the issues and questions before | our country | are many. From the               
##    [2013-Obama, 666:667]          the people, understand that | our country | cannot succeed when a shrinking  
##  [2013-Obama, 1830:1831]  workforce rather than expelled from | our country | . Our journey is not             
##      [2017-Trump, 46:47]     great national effort to rebuild | our country | and restore its promise for      
##    [2017-Trump, 254:255]              but not the citizens of | our country | . Their victories have not       
##    [2017-Trump, 404:405]           forgotten men and women of | our country | will be forgotten no longer      
##    [2017-Trump, 572:573]            too many lives and robbed | our country | of so much unrealized potential  
##    [2017-Trump, 723:724]          strength, and confidence of | our country | has dissipated over the horizon  
##  [2017-Trump, 1015:1016]            back to work - rebuilding | our country | with American hands and American 
##  [2017-Trump, 1153:1154]           and through our loyalty to | our country | , we will rediscover our         
##  [2017-Trump, 1368:1369]                    We will not fail. | Our country | will thrive and prosper again    
##  [2021-Biden, 1869:1870]                     we are this way, | our country | will be stronger, more           
##  [2021-Biden, 2174:2175]                 left behind, and for | our country | . Amen. This is                  
##      [2025-Trump, 79:80]               From this day forward, | our country | will flourish and be respected   
##    [2025-Trump, 404:405]        , that have illegally entered | our country | from all over the world          
##    [2025-Trump, 442:443]                    , its own people. | Our country | can no longer deliver basic      
##    [2025-Trump, 538:539]     and most powerful individuals in | our country | — some of whom are               
##    [2025-Trump, 634:635]                  many cases, to hate | our country | despite the love that we         
##    [2025-Trump, 976:977]           election in the history of | our country | . As our victory showed          
##  [2025-Trump, 1215:1216]                 . We will not forget | our country | , we will not forget             
##  [2025-Trump, 1352:1353]     repel the disastrous invasion of | our country | . Under the orders I             
##  [2025-Trump, 1441:1442] higher responsibility than to defend | our country | from threats and invasions,      
##  [2025-Trump, 2292:2293]     belongs. President McKinley made | our country | very rich through tariffs and    
##  [2025-Trump, 2897:2898]                    Washington, D.C., | our country | was forged and built by
Phrase dispersion
Phrase dispersion

3.5 A sophisticated measure for corpora: TF-IDF

TF-IDF Top Keywords of Presidents
TF-IDF Top Keywords of Presidents