####### Text Analysis Exercise ###########
library(tm)
## Loading required package: NLP
#library(topicmodels)
library(gutenbergr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(quanteda)
## Package version: 1.5.2
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:quanteda':
## 
##     as.igraph
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
devtools::install_github("quanteda/quanteda.corpora")
## Skipping install of 'quanteda.corpora' from a github remote, the SHA1 (5933cc86) has not changed since last install.
##   Use `force = TRUE` to force installation
#Search the Project Gutenberg metadata to get the ID number of a book
#(fill in a title)
gutenberg_metadata %>%
  filter(title == "")
## # A tibble: 0 x 8
## # … with 8 variables: gutenberg_id <int>, title <chr>, author <chr>,
## #   gutenberg_author_id <int>, language <chr>, gutenberg_bookshelf <chr>,
## #   rights <chr>, has_text <lgl>
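#For example, gutenbergr's gutenberg_works() helper pre-filters the metadata
#to English-language works with retrievable text; a sketch (the exact title
#string in the catalogue is an assumption):
gutenberg_works(title == "The Prince")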
#Download a book from Project Gutenberg by its ID number
prince <- gutenberg_download(57037)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
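#A sketch: the meta_fields argument of gutenberg_download() can attach
#metadata columns (e.g. the title) to the downloaded text, which is handy
#when downloading several books at once
prince_meta <- gutenberg_download(57037, meta_fields = "title")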
#Inspect a particular line of the downloaded text (e.g. number 925)
writeLines(as.character(prince$text[[925]]))
## rate to prevent a pope being created whom he did not wish. But if at
#Create a corpus from the text column of the data frame
prince_corpus <- corpus(prince$text)

#Summarise the first documents of the corpus
summary(prince_corpus, n = 10)
## Corpus consisting of 2914 documents, showing 10 documents:
## 
##     Text Types Tokens Sentences
##    text1     2      2         1
##    text2     0      0         0
##    text3     1      1         1
##    text4     0      0         0
##    text5     2      2         1
##    text6     0      0         0
##    text7     4      4         1
##    text8     0      0         0
##    text9     2      2         1
##   text10     0      0         0
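#Because corpus() treats each line of the ebook as its own document, many
#"documents" above are empty. A sketch of building a single-document corpus
#instead, by collapsing the lines first:
prince_whole <- corpus(paste(prince$text, collapse = "\n"))
summary(prince_whole)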
#Generate tokens, dropping punctuation and numbers
toks <- tokens(prince$text, remove_punct = TRUE, remove_numbers = TRUE)
#Generate n-grams of length 2 to 4
toks_ngram <- tokens_ngrams(toks, n = 2:4)
head(toks_ngram[[1]], 50)
## [1] "THE_PRINCE"
#Construct a document-feature matrix (DFM) and apply tf-idf weighting
dfmat_prince <- dfm(toks)
dfmat_prince <- dfm_tfidf(dfmat_prince)
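#A quick check of the highest-weighted terms (sketch):
topfeatures(dfmat_prince, 10)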

#Plot a word cloud of the weighted DFM
textplot_wordcloud(dfmat_prince, min_count = 6, random_order = FALSE,
                   rotation = .25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
## Warning in dfm_trim.dfm(x, min_termfreq = min_count): dfm has been previously
## weighted
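#The warning appears because min_count trims by raw term frequency, which is
#no longer meaningful once the DFM is tf-idf weighted; a sketch drawing the
#cloud from raw counts instead:
textplot_wordcloud(dfm(toks), min_count = 6, random_order = FALSE,
                   rotation = .25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))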

require(readtext)
## Loading required package: readtext
#Give the text a document label for the lexical dispersion (x-ray) plot
names(prince$text) <- "The Prince"

#Plot where "power" and "love" occur across the text
textplot_xray(
  kwic(prince$text, pattern = "power"),
  kwic(prince$text, pattern = "love")
)
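#A sketch: scale = "relative" normalises token positions by document length,
#making the x-rays comparable across documents
textplot_xray(
  kwic(prince$text, pattern = "power"),
  kwic(prince$text, pattern = "love"),
  scale = "relative"
)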

#Network of co-occurrences between words, using the 2010 Irish budget
#debate speeches (data_corpus_irishbudget2010)
set.seed(100)
toks <- tokens(data_corpus_irishbudget2010, remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
#Build a feature co-occurrence matrix within a token window and keep the
#30 most frequent features
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)
## Registered S3 method overwritten by 'network':
##   method            from    
##   summary.character quanteda

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.8, vertex_labelcolor = rep(c('gray40', NA), 15))

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(vertex_labelsize = 10)

fcm_30 <- fcm_select(fcmat, pattern = feat)
textplot_network(fcm_30, vertex_labelsize = rowSums(fcm_30)/min(rowSums(fcm_30)))

# Vector inputs to vertex_labelsize can be rescaled if the labels come out
# too small or too large
textplot_network(fcm_30, vertex_labelsize = 1.5 * rowSums(fcm_30)/min(rowSums(fcm_30)))
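#Since igraph is already loaded, the co-occurrence matrix can also be
#analysed as a weighted graph; a sketch computing degree centrality
#(tri = FALSE above makes the fcm symmetric):
g <- igraph::graph_from_adjacency_matrix(as.matrix(fcm_30),
                                         mode = "undirected",
                                         weighted = TRUE, diag = FALSE)
sort(igraph::degree(g), decreasing = TRUE)[1:10]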

########### SIMILARITY BETWEEN TWITTER USERS ################

#Read in tweets from a local JSON file
dat_twitter <- readtext("twitter.json", source = "twitter")


#Construct a corpus of Tweets
corp_tweets <- corpus(dat_twitter)

#Construct a document-feature matrix, removing punctuation, URLs,
#link fragments, retweet markers, hashtags, and usernames
dfmat_tweets <- dfm(corp_tweets,
                    remove_punct = TRUE, remove_url = TRUE,
                    remove = c('*.tt', '*.uk', '*.com', 'rt', '#*', '@*')) %>% 
  dfm_remove(stopwords('en'))

ndoc(dfmat_tweets)
## [1] 7504
topfeatures(dfmat_tweets)
##          vote conservatives        labour         today         share 
##          1873           953           758           674           648 
##       britain          find        fairer        voting      tomorrow 
##           639           615           571           570           565
#Group documents by usernames.

dfmat_users <- dfm_group(dfmat_tweets, groups = 'screen_name')
ndoc(dfmat_users)
## [1] 5061
#Remove rare (occurring fewer than 10 times) and short (one-character)
#features, and keep only users with more than 50 tokens in total

dfmat_users <- dfmat_users %>% 
  dfm_select(min_nchar = 2) %>% 
  dfm_trim(min_termfreq = 10) 
dfmat_users <- dfmat_users[ntoken(dfmat_users) > 50,]
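#Check how many users and features survive the trimming (sketch):
ndoc(dfmat_users)
nfeat(dfmat_users)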

#Calculate user-user distances with textstat_dist(), then cluster the users
#hierarchically and plot the dendrogram
tstat_dist <- as.dist(textstat_dist(dfmat_users))
user_clust <- hclust(tstat_dist)
plot(user_clust)
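#The dendrogram can be cut into a fixed number of groups for downstream use;
#a sketch with k = 10 (the choice of k is an assumption):
user_groups <- cutree(user_clust, k = 10)
table(user_groups)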