# Show the current working directory; the relative file paths used below
# are resolved against it.
getwd()
[1] "/cloud/project"
# Load the English message, one vector element per line of the file.
txt <- readLines("ou_msg/ou_msg_en.txt")

# Split every line on whitespace / non-word characters, flatten the
# per-line pieces into a single character vector and lower-case it so that
# later counts are case-insensitive. Adjacent separators leave empty
# strings behind; they are still present at this point.
wordLst <- tolower(unlist(strsplit(txt, "\\s|\\W")))

# Preview the first six raw tokens.
head(wordLst, n = 6L)
[1] "greetings" "" "everyone" "today" "" "i"
# Remove the empty strings produced by the split: nzchar() is TRUE only for
# non-empty strings, so this keeps exactly the real tokens.
wordLst <- wordLst[nzchar(wordLst)]

# Preview the first six cleaned tokens.
head(wordLst, n = 6L)
[1] "greetings" "everyone" "today" "i" "was" "appointed"
# tokens: total number of running words in the text.
tokens <- length(wordLst)
# types: number of distinct word forms.
types <- length(unique(wordLst))

# Report the token count.
sprintf("Tokens = %d", tokens)
[1] "Tokens = 844"
# Report the number of distinct word forms.
sprintf("Types = %d", types)
[1] "Types = 360"
\[TTR=\frac{types}{tokens} \times 100 \]
# Type-token ratio (TTR) expressed as a percentage.
types / tokens * 100
[1] 42.65403
# Same TTR percentage, rounded to two decimal places for reporting.
round(types / tokens * 100, digits = 2)
[1] 42.65
\[RTTR=\frac{types}{\sqrt{tokens}} \]
[1] 12.39
# Count how often each word form occurs; the table is ordered by word.
freq <- table(wordLst)

# Preview the first six entries.
head(freq, n = 6L)
wordLst
100th 19 1931 2015 2021 2022
1 1 1 1 3 2
# Re-order the frequency table so the most frequent words come first.
freq_data <- sort(freq, decreasing = TRUE)

# Show the six most frequent words.
head(freq_data, n = 6L)
wordLst
the and of to university society
43 35 31 23 22 20
# Fill colours for the bars; barplot() recycles them across the bars.
colors <- c("orange", "lightblue", "green")

# Plot the 20 most frequent words. head() returns at most 20 entries, so the
# plot stays correct for texts with fewer than 20 distinct words, whereas
# freq_data[1:20] would index past the end and yield NA entries.
# las = 3 draws the axis labels vertically so the words stay readable.
barplot(head(freq_data, 20), las = 3, col = colors)
# Install cleanNLP only when it is missing, so that re-running the script
# does not re-download and re-install the package every time.
if (!requireNamespace("cleanNLP", quietly = TRUE)) {
  install.packages("cleanNLP")
}

# Attach the package so the cnlp_* functions used below can be found.
library(cleanNLP)

# Initialise the udpipe backend with its default settings (in the recorded
# run this fetched the English EWT model).
cnlp_init_udpipe()
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/english-ewt-ud-2.5-191206.udpipe
- This model has been trained on version 2.5 of data from https://universaldependencies.org
- The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
- Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
- For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
trying URL 'https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe'
Content type 'application/octet-stream' length 16309608 bytes (15.6 MB)
==================================================
downloaded 15.6 MB
Downloading finished, model stored at '/cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/english-ewt-ud-2.5-191206.udpipe'
# Preview the first lemmas of the annotated text; punctuation marks are
# still included as tokens at this stage.
# NOTE(review): `res` is created outside the visible excerpt — presumably by
# cnlp_annotate() on the English text; confirm.
head(res$token$lemma)
[1] "greeting" "," "everyone" "." "today" ","
# Six most frequent entries, highest count first.
# NOTE(review): `freqBycnlp` is built outside the visible excerpt — presumably
# a table() of the lemmas; confirm. Punctuation has not been filtered out
# here, so "," and "." rank among the top entries.
head(sort(freqBycnlp, decreasing=TRUE))
, the and of . to
50 43 35 31 30 23
# Re-initialise the udpipe backend with the Chinese model.
cnlp_init_udpipe(model_name = "chinese")
Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe to /cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/chinese-gsd-ud-2.5-191206.udpipe
- This model has been trained on version 2.5 of data from https://universaldependencies.org
- The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
- Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
- For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
trying URL 'https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe'
Content type 'application/octet-stream' length 13863905 bytes (13.2 MB)
==================================================
downloaded 13.2 MB
Downloading finished, model stored at '/cloud/lib/x86_64-pc-linux-gnu-library/4.3/cleanNLP/extdata/chinese-gsd-ud-2.5-191206.udpipe'
# Read the Chinese version of the message, one element per line, and print it.
txt_ch<-readLines("ou_msg/ou_msg_ch.txt")
txt_ch
# Keep only the tokens tagged as nouns (upos == "NOUN").
# NOTE(review): no cnlp_annotate() call on txt_ch is visible before this
# line — confirm that `res` holds the Chinese annotation here.
res$token[res$token$upos == "NOUN",]
# Switch the udpipe backend to the Japanese model and annotate the text.
# NOTE(review): `txt_ja` is not defined in the visible part of the script —
# confirm it is read before this call.
cnlp_init_udpipe(model_name = "japanese")
res <- cnlp_annotate(input = txt_ja)
Processed document 10 of 19
# NOTE(review): same call as the preceding annotation of txt_ja; it looks
# redundant and re-runs the whole annotation — confirm whether it is needed.
res <- cnlp_annotate(input = txt_ja)
Processed document 10 of 19
# Print the token table of the current annotation.
res$token
# Switch the udpipe backend to the Spanish model.
cnlp_init_udpipe(model_name = "spanish")
Registered S3 method overwritten by 'data.table':
method from
print.data.table
# Drop punctuation tokens (upos == "PUNCT").
# Note: this rebinds `res` to the filtered token table, so from here on `res`
# is no longer the full annotation object.
res <- res$token[res$token$upos != "PUNCT",]
head(res)
# Full frequency list, most frequent first.
# NOTE(review): `freqBycnlp` is computed outside the visible excerpt; the
# printed result counts "El" and "el" separately, so it is presumably built
# from raw (non-lower-cased) tokens — confirm.
sort(freqBycnlp, decreasing=TRUE)
de la en un a el que 18
11 7 5 4 3 3 3 1
500 Al-Ahli alrededor armado ataque autoridades Aviv centro
1 1 1 1 1 1 1 1
cohete contra decenas Desde deslinda después diplomática El
1 1 1 1 1 1 1 1
embajada estadounidense explotaron fallido Franja Gaza grupo ha
1 1 1 1 1 1 1 1
Hamás hasta hecho islámica Israel israelí Jordania lanzamiento
1 1 1 1 1 1 1 1
Líbano llamado madrugada manifestaciones médico movilización muerte mundial
1 1 1 1 1 1 1 1
octubre palestinas personas provocó representación sanitarias se según
1 1 1 1 1 1 1 1
señalando su sufriera Tel vez Yihad
1 1 1 1 1 1