####### Text Analysis Exercise ###########
library(tm)
## Loading required package: NLP
#library(topicmodels)
library(gutenbergr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(quanteda)
## Package version: 1.5.2
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:quanteda':
##
## as.igraph
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
#Install additional example corpora for quanteda
devtools::install_github("quanteda/quanteda.corpora")
## Skipping install of 'quanteda.corpora' from a github remote, the SHA1 (5933cc86) has not changed since last install.
## Use `force = TRUE` to force installation
#Search the Gutenberg metadata by title (the empty string matches nothing here,
#hence the empty tibble below; put a real title between the quotes)
gutenberg_metadata %>%
  filter(title == "")
## # A tibble: 0 x 8
## # … with 8 variables: gutenberg_id <int>, title <chr>, author <chr>,
## # gutenberg_author_id <int>, language <chr>, gutenberg_bookshelf <chr>,
## # rights <chr>, has_text <lgl>
#We can get the ID number of each book from the Project Gutenberg website
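#For example (a sketch; the exact author string recorded in the metadata is an
#assumption), we can also search by author to find Machiavelli's works and IDs:
gutenberg_metadata %>%
  filter(grepl("Machiavelli", author), language == "en") %>%
  select(gutenberg_id, title)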
prince <- gutenberg_download(57037)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#Inspect a particular line of the downloaded text (e.g. line 925)
writeLines(as.character(prince$text[[925]]))
## rate to prevent a pope being created whom he did not wish. But if at
#Create a corpus from the text column of the data frame
prince_corpus <- corpus(prince$text)
summary(prince_corpus)
## Corpus consisting of 2914 documents, showing 100 documents:
##
## Text Types Tokens Sentences
## text1 2 2 1
## text2 0 0 0
## text3 1 1 1
## text4 0 0 0
## text5 2 2 1
## text6 0 0 0
## text7 4 4 1
## text8 0 0 0
## text9 2 2 1
## text10 0 0 0
## text11 2 2 1
## text12 0 0 0
## text13 3 3 1
## text14 0 0 0
## text15 3 3 1
## text16 0 0 0
## text17 5 5 1
## text18 0 0 0
## text19 4 4 1
## text20 0 0 0
## text21 1 1 1
## text22 0 0 0
## text23 0 0 0
## text24 0 0 0
## text25 0 0 0
## text26 1 1 1
## text27 0 0 0
## text28 0 0 0
## text29 11 11 1
## text30 14 15 1
## text31 11 12 1
## text32 2 2 1
## text33 0 0 0
## text34 11 13 1
## text35 11 12 1
## text36 16 17 2
## text37 12 12 1
## text38 11 11 1
## text39 11 14 1
## text40 14 14 2
## text41 9 9 1
## text42 13 14 1
## text43 10 11 1
## text44 14 14 1
## text45 13 15 2
## text46 12 13 1
## text47 15 16 1
## text48 12 13 1
## text49 12 15 1
## text50 4 4 1
## text51 0 0 0
## text52 11 11 1
## text53 10 10 1
## text54 12 14 1
## text55 11 13 2
## text56 13 15 1
## text57 13 17 1
## text58 9 9 1
## text59 0 0 0
## text60 3 3 1
## text61 0 0 0
## text62 7 8 1
## text63 0 0 0
## text64 0 0 0
## text65 13 14 1
## text66 12 14 2
## text67 3 3 1
## text68 0 0 0
## text69 0 0 0
## text70 0 0 0
## text71 1 1 1
## text72 0 0 0
## text73 0 0 0
## text74 6 6 1
## text75 0 0 0
## text76 14 14 2
## text77 2 2 1
## text78 0 0 0
## text79 5 6 2
## text80 0 0 0
## text81 5 6 2
## text82 0 0 0
## text83 14 15 2
## text84 9 10 1
## text85 0 0 0
## text86 13 14 2
## text87 8 8 1
## text88 0 0 0
## text89 14 14 2
## text90 2 2 1
## text91 0 0 0
## text92 13 15 2
## text93 0 0 0
## text94 13 14 2
## text95 0 0 0
## text96 6 7 2
## text97 0 0 0
## text98 11 12 2
## text99 0 0 0
## text100 5 6 2
##
## Source: /home/giuseppev/Dropbox/Trento Insegnamento/Dottorato Teaching/* on x86_64 by giuseppev
## Created: Thu Feb 13 18:31:45 2020
## Notes:
#Generate tokens, dropping punctuation and numbers
toks <- tokens(prince$text, remove_punct = TRUE, remove_numbers = TRUE)
#Generate ngrams (lengths 2 through 4)
toks_ngram <- tokens_ngrams(toks, n = 2:4)
head(toks_ngram[[1]], 50)
## [1] "THE_PRINCE"
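#Optional sketch: stopwords inflate ngram counts, so it can help to drop them
#(same stopwords() call used later in this script) before forming the ngrams
toks_nostop <- tokens_remove(toks, pattern = stopwords("english"))
toks_ngram_nostop <- tokens_ngrams(toks_nostop, n = 2:4)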
#Construct a document-feature matrix (DFM) from the tokens and weight it by tf-idf
dfmat_prince <- dfm(toks)
dfmat_prince <- dfm_tfidf(dfmat_prince)
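#Quick sanity check (sketch): the ten terms with the highest total tf-idf weight
topfeatures(dfmat_prince, 10)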
textplot_wordcloud(dfmat_prince, min_count = 6, random_order = FALSE,
                   rotation = .25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
## Warning in dfm_trim.dfm(x, min_termfreq = min_count): dfm has been previously
## weighted
#The warning is expected: the DFM was tf-idf weighted above, so min_count trims
#on weights rather than raw counts

require(readtext)
## Loading required package: readtext
#Lexical dispersion plot: where "power" and "love" occur across the text
names(prince$text) <- "The Prince"
textplot_xray(
  kwic(prince$text, pattern = "power"),
  kwic(prince$text, pattern = "love")
)
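#To read the concordances themselves rather than plot them (sketch):
head(kwic(prince$text, pattern = "power"), 5)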

#Network of co-occurrences between words
set.seed(100)
#Tokenize the Irish budget speeches (2010) example corpus: lowercase the tokens
#and remove punctuation and English stopwords
toks <- corpus_subset(data_corpus_irishbudget2010) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
#Build a feature co-occurrence matrix (FCM) over a token window
fcmat <- fcm(toks, context = "window", tri = FALSE)
#Keep the 30 most frequent features and plot their network
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)
## Registered S3 method overwritten by 'network':
## method from
## summary.character quanteda

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.8, vertex_labelcolor = rep(c('gray40', NA), 15))

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(vertex_labelsize = 10)

fcm_30 <- fcm_select(fcmat, pattern = feat)
textplot_network(fcm_30, vertex_labelsize = rowSums(fcm_30)/min(rowSums(fcm_30)))

# Vector inputs to vertex_labelsize can be scaled if too small / large
textplot_network(fcm_30, vertex_labelsize = 1.5 * rowSums(fcm_30)/min(rowSums(fcm_30)))
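#Edge appearance can be tuned the same way (sketch; edge_alpha and edge_size
#are textplot_network() arguments)
textplot_network(fcm_30, edge_alpha = 0.5, edge_size = 2, vertex_labelsize = 5)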

########### SIMILARITY BETWEEN TWITTER USERS ################
#Read tweets from a local JSON file (twitter.json)
dat_twitter <- readtext("twitter.json", source = "twitter")
#Construct a corpus of Tweets
corp_tweets <- corpus(dat_twitter)
#Construct a document-feature matrix, removing punctuation, URLs, retweet
#markers, hashtags, and usernames (glob patterns), plus English stopwords
dfmat_tweets <- dfm(corp_tweets,
                    remove_punct = TRUE, remove_url = TRUE,
                    remove = c('*.tt', '*.uk', '*.com', 'rt', '#*', '@*')) %>%
  dfm_remove(stopwords('en'))
ndoc(dfmat_tweets)
## [1] 7504
topfeatures(dfmat_tweets)
## vote conservatives labour today share
## 1873 953 758 674 648
## britain find fairer voting tomorrow
## 639 615 571 570 565
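#A fuller ranked frequency table (sketch): textstat_frequency() returns counts
#and document frequencies for every feature
head(textstat_frequency(dfmat_tweets), 10)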
#Group documents by usernames.
dfmat_users <- dfm_group(dfmat_tweets, groups = 'screen_name')
ndoc(dfmat_users)
## [1] 5061
#Remove rare (fewer than 10 occurrences) and short (one-character) features,
#and keep only users with more than 50 tokens in total.
dfmat_users <- dfmat_users %>%
  dfm_select(min_nchar = 2) %>%
  dfm_trim(min_termfreq = 10)
dfmat_users <- dfmat_users[ntoken(dfmat_users) > 50, ]
#Calculate user-user distances with textstat_dist() and cluster them hierarchically
tstat_dist <- as.dist(textstat_dist(dfmat_users))
user_clust <- hclust(tstat_dist)
plot(user_clust)
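#Alternative sketch: cosine similarity is often preferable to Euclidean distance
#on count data; convert it to a distance before clustering
tstat_simil <- textstat_simil(dfmat_users, method = "cosine")
user_clust2 <- hclust(as.dist(1 - as.matrix(tstat_simil)))
plot(user_clust2)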
