library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(readtext)
library(ggplot2)
# Read the assignment data from the CSV file
sam <- readtext(
  file = "C:/Users/somy/Documents/text_analysis_intern/Demo_Task - assignment.csv"
)
# Build the corpus for the English raw text (a corpus is the collection of
# text documents over which the text analysis is applied).
# The text is lower-cased up front and then reshaped to one document per
# sentence, which is why document names look like "text1.1", "text1.2".
# NOTE: corpus() and corpus_reshape() do not clean the text. Earlier versions
# of this call passed stem / remove_punct / remove_numbers / remove_hyphen /
# remove here; they only produced "Argument stem not used" and left the text
# untouched (the first document still ends in "!").
# Stop words, punctuation, numbers and hyphens are removed later, when the
# document-feature matrix is built with dfm(). Stemming is not applied
# anywhere in this script; presumably dfm(stem = TRUE) is the place to add it
# if it is wanted -- TODO confirm.
corp_raw_text <- corpus(tolower(sam$raw_text)) %>%
  corpus_reshape(to = "sentences")
# Show the first document of the raw-text corpus
corp_raw_text[1]
## text1.1
## "spiritually and mentally inspiring!"
# Tokenise the raw_text column as read from the CSV (not lower-cased);
# tokens() splits each text into words
toks_raw_text <- tokens(sam[["raw_text"]])
# Per-document overview (types, tokens, sentences) of the raw-text corpus
summary(corp_raw_text)
## Warning in nsentence.character(object, ...): nsentence() does not correctly
## count sentences in all lower-cased text
## Corpus consisting of 57128 documents, showing 100 documents:
##
## Text Types Tokens Sentences
## text1.1 5 5 1
## text1.2 17 19 1
## text2.1 7 7 1
## text3.1 6 6 1
## text4.1 12 12 1
## text5.1 17 20 1
## text6.1 6 6 1
## text7.1 13 13 1
## text8.1 25 29 1
## text9.1 11 13 1
## text10.1 14 15 1
## text11.1 14 16 1
## text12.1 30 33 1
## text13.1 18 23 1
## text14.1 23 26 1
## text15.1 9 9 1
## text16.1 14 14 1
## text17.1 15 15 1
## text18.1 3 3 1
## text19.1 18 20 1
## text20.1 19 21 1
## text21.1 24 29 1
## text22.1 4 4 1
## text23.1 3 3 1
## text24.1 3 3 1
## text25.1 11 11 1
## text26.1 27 29 1
## text27.1 25 32 1
## text28.1 4 4 1
## text29.1 19 20 1
## text30.1 22 23 1
## text31.1 17 21 1
## text32.1 14 15 1
## text33.1 22 25 1
## text34.1 10 10 1
## text35.1 10 10 1
## text36.1 4 4 1
## text37.1 5 5 1
## text38.1 22 22 1
## text39.1 22 24 1
## text40.1 6 7 1
## text41.1 3 3 1
## text42.1 22 25 1
## text43.1 18 22 1
## text44.1 4 4 1
## text45.1 8 10 1
## text46.1 11 12 1
## text47.1 9 9 1
## text48.1 20 22 1
## text49.1 8 8 1
## text50.1 17 17 1
## text51.1 10 11 1
## text52.1 20 22 1
## text53.1 31 38 1
## text54.1 12 14 1
## text55.1 34 67 1
## text56.1 16 17 1
## text57.1 21 23 1
## text58.1 17 21 1
## text59.1 18 18 1
## text60.1 15 16 1
## text61.1 22 24 1
## text62.1 21 27 1
## text63.1 12 16 1
## text64.1 18 18 1
## text65.1 12 13 1
## text66.1 28 35 1
## text67.1 14 15 1
## text68.1 10 10 1
## text69.1 9 9 1
## text69.2 17 17 1
## text70.1 16 16 1
## text71.1 21 24 1
## text72.1 5 5 1
## text73.1 9 9 1
## text74.1 13 15 1
## text75.1 15 15 1
## text76.1 15 16 1
## text77.1 10 11 1
## text78.1 11 12 1
## text79.1 11 12 1
## text80.1 12 14 1
## text81.1 24 26 1
## text82.1 22 30 1
## text83.1 10 11 1
## text84.1 14 14 1
## text85.1 9 9 1
## text86.1 21 23 1
## text87.1 27 28 1
## text88.1 16 17 1
## text89.1 24 27 1
## text90.1 5 5 1
## text91.1 34 38 1
## text92.1 7 7 1
## text93.1 7 7 1
## text94.1 11 14 1
## text95.1 26 31 1
## text96.1 17 19 1
## text97.1 4 4 1
## text98.1 11 12 1
##
## Source: C:/Users/somy/Documents/text_analysis_intern/* on x86-64 by somy
## Created: Sun Nov 10 14:12:46 2019
## Notes: corpus_reshape.corpus(., stem = T, remove_punct = TRUE, remove_numbers = T, remove_hyphen = T)
# Build a separate corpus for the Indonesian review text.
# The text is coerced to character, lower-cased, and then reshaped to one
# document per sentence, which is why document names look like "text1.1",
# "text1.2".
# NOTE: corpus() and corpus_reshape() do not clean the text. Earlier versions
# of this call passed stem / remove_punct / remove_numbers / remove_hyphens /
# remove here; they only produced "Argument stem not used" and left the text
# untouched (the first document still ends in "!").
# Stop words, punctuation, numbers and hyphens are removed later, when the
# document-feature matrix is built with dfm(). Stemming is not applied
# anywhere in this script; presumably dfm(stem = TRUE) is the place to add it
# if it is wanted -- TODO confirm.
corp_review_text <- corpus(tolower(as.character(sam$review_text))) %>%
  corpus_reshape(to = "sentences")
# Show the first document of the review-text corpus
corp_review_text[1]
## text1.1
## "menginspirasi secara spiritual dan mental!"
# Tokenise the review_text column as read from the CSV (not lower-cased);
# tokens() splits each text into words
tok_review_text <- tokens(sam[["review_text"]])
# Per-document overview (types, tokens, sentences) of the review-text corpus
summary(corp_review_text)
## Warning in nsentence.character(object, ...): nsentence() does not correctly
## count sentences in all lower-cased text
## Corpus consisting of 57079 documents, showing 100 documents:
##
## Text Types Tokens Sentences
## text1.1 6 6 1
## text1.2 14 17 1
## text2.1 9 9 1
## text3.1 4 4 1
## text4.1 11 11 1
## text5.1 15 15 1
## text6.1 6 6 1
## text7.1 11 12 1
## text8.1 20 24 1
## text9.1 11 12 1
## text10.1 13 14 1
## text11.1 12 13 1
## text12.1 31 34 1
## text13.1 16 22 1
## text14.1 18 20 1
## text15.1 9 9 1
## text16.1 12 12 1
## text17.1 13 13 1
## text18.1 2 2 1
## text19.1 18 20 1
## text20.1 16 17 1
## text21.1 21 22 1
## text22.1 3 3 1
## text23.1 3 3 1
## text24.1 2 2 1
## text25.1 10 10 1
## text26.1 22 27 1
## text27.1 24 27 1
## text28.1 3 3 1
## text29.1 16 16 1
## text30.1 19 20 1
## text31.1 15 18 1
## text32.1 13 14 1
## text33.1 21 23 1
## text34.1 9 9 1
## text35.1 8 8 1
## text36.1 3 3 1
## text37.1 4 4 1
## text38.1 17 18 1
## text39.1 22 26 1
## text40.1 5 5 1
## text41.1 4 4 1
## text42.1 19 22 1
## text43.1 15 17 1
## text44.1 4 4 1
## text45.1 6 7 1
## text46.1 11 12 1
## text47.1 7 8 1
## text48.1 17 18 1
## text49.1 6 7 1
## text50.1 14 15 1
## text51.1 11 11 1
## text52.1 17 18 1
## text53.1 27 35 1
## text54.1 11 13 1
## text55.1 34 66 1
## text56.1 16 17 1
## text57.1 18 22 1
## text58.1 21 24 1
## text59.1 18 18 1
## text60.1 12 14 1
## text61.1 18 18 1
## text62.1 17 22 1
## text63.1 12 13 1
## text64.1 17 17 1
## text65.1 12 13 1
## text66.1 28 36 1
## text67.1 12 13 1
## text68.1 11 12 1
## text69.1 9 9 1
## text69.2 13 13 1
## text70.1 14 14 1
## text71.1 19 22 1
## text72.1 4 4 1
## text73.1 9 9 1
## text74.1 11 13 1
## text75.1 14 15 1
## text76.1 14 15 1
## text77.1 8 8 1
## text78.1 12 14 1
## text79.1 8 9 1
## text80.1 11 13 1
## text81.1 19 21 1
## text82.1 24 32 1
## text83.1 8 8 1
## text84.1 13 14 1
## text85.1 6 7 1
## text86.1 18 19 1
## text87.1 24 25 1
## text88.1 12 13 1
## text89.1 16 18 1
## text90.1 4 4 1
## text91.1 30 34 1
## text92.1 6 6 1
## text93.1 9 9 1
## text94.1 12 14 1
## text95.1 23 27 1
## text96.1 14 17 1
## text97.1 4 4 1
## text98.1 9 9 1
##
## Source: C:/Users/somy/Documents/text_analysis_intern/* on x86-64 by somy
## Created: Sun Nov 10 14:13:51 2019
## Notes: corpus_reshape.corpus(., stem = T, remove_punct = TRUE, remove_numbers = T, remove_hyphens = T)
# Show the first document of the review-text corpus
corp_review_text[1]
## text1.1
## "menginspirasi secara spiritual dan mental!"
# Build the document-feature matrix (DFM) for the raw text: a matrix that
# holds the frequency of each term across the collection of documents.
# English stop words, punctuation, numbers and hyphens are dropped here.
dfm_raw_text <- dfm(
  corpus_subset(corp_raw_text),
  remove = stopwords("english"),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_hyphens = TRUE
)
# LSA model (currently disabled)
#raw_lsa <- textmodel_lsa(dfm_raw_text)
# Keep only the terms that occur at least 10 times in the DFM
dfm_raw_text_trim <- dfm_trim(dfm_raw_text, min_termfreq = 10, verbose = FALSE)
# Fix the random seed before drawing the word cloud
set.seed(100)
# Word cloud of the trimmed DFM
textplot_wordcloud(dfm_raw_text_trim)

# Feature frequencies for raw_text, limited to the top 100 features
features_dfm_raw_text <- textstat_frequency(dfm_raw_text, n = 100)
# Reorder the feature column by decreasing frequency for the x axis
features_dfm_raw_text$feature <- reorder(
  features_dfm_raw_text$feature,
  -features_dfm_raw_text$frequency
)
# Point plot of frequency per feature, with vertical x-axis labels
ggplot(features_dfm_raw_text, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Preview the first 20 English stop words (the language of raw_text)
head(stopwords("en"), n = 20)
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
# Co-occurrence network built from the top 200 English features
toptag_raw_text <- dfm_raw_text %>%
  topfeatures(n = 200) %>%
  names()
# Feature co-occurrence matrix (FCM) computed from the raw-text DFM
tag_fcm_raw_text <- fcm(dfm_raw_text)
head(tag_fcm_raw_text)
## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
## features
## features spiritually mentally inspiring book allows question
## spiritually 0 1 1 2 0 0
## mentally 0 0 1 0 0 0
## inspiring 0 0 0 5 0 0
## book 0 0 0 149 9 2
## allows 0 0 0 0 1 1
## question 0 0 0 0 0 0
# Restrict the FCM to the selected top features, then plot the network
topgat_fcm_raw <- fcm_select(tag_fcm_raw_text, pattern = toptag_raw_text)
textplot_network(
  topgat_fcm_raw,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_size = 5
)
## Registered S3 method overwritten by 'network':
## method from
## summary.character quanteda

# Co-occurrence network built from the top 20 English features
toptag_raw_text_20 <- dfm_raw_text %>%
  topfeatures(n = 20) %>%
  names()
# Feature co-occurrence matrix (FCM) computed from the raw-text DFM
tag_fcm_raw_text_20 <- fcm(dfm_raw_text)
head(tag_fcm_raw_text_20)
## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
## features
## features spiritually mentally inspiring book allows question
## spiritually 0 1 1 2 0 0
## mentally 0 0 1 0 0 0
## inspiring 0 0 0 5 0 0
## book 0 0 0 149 9 2
## allows 0 0 0 0 1 1
## question 0 0 0 0 0 0
# Restrict the FCM to the selected top features, then plot the network
topgat_fcm_raw_20 <- fcm_select(
  tag_fcm_raw_text_20,
  pattern = toptag_raw_text_20
)
textplot_network(
  topgat_fcm_raw_20,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_size = 5
)

# Indonesian review text
# Build the document-feature matrix, dropping Indonesian stop words
# (stopwords-iso source), punctuation, numbers and hyphens
dfm_review_text <- dfm(
  corpus_subset(corp_review_text),
  remove = stopwords("id", source = "stopwords-iso"),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_hyphens = TRUE
)
# Keep only the terms that occur at least 10 times in the DFM
dfm_review_text_trim <- dfm_trim(
  dfm_review_text,
  min_termfreq = 10,
  verbose = FALSE
)
# Fix the random seed before drawing the word cloud
set.seed(100)
textplot_wordcloud(
  dfm_review_text_trim,
  color = c("brown", "pink", "green", "purple", "orange", "blue")
)

# LSA model (currently disabled)
#raw_lsa <- textmodel_lsa(dfm_review_text)
# Feature frequencies for review_text, limited to the top 100 features
features_dfm_review_text <- textstat_frequency(dfm_review_text, n = 100)
# Reorder the feature column by decreasing frequency for the x axis
features_dfm_review_text$feature <- reorder(
  features_dfm_review_text$feature,
  -features_dfm_review_text$frequency
)
# Plot the frequencies with ggplot (grammar of graphics)
ggplot(features_dfm_review_text, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Preview the first 20 Indonesian stop words (the language of review_text)
head(stopwords("id", source = "stopwords-iso"), n = 20)
## [1] "ada" "adalah" "adanya" "adapun" "agak" "agaknya"
## [7] "agar" "akan" "akankah" "akhir" "akhiri" "akhirnya"
## [13] "aku" "akulah" "amat" "amatlah" "anda" "andalah"
## [19] "antar" "antara"
# Co-occurrence network built from the top 200 Indonesian features
toptag_review_text <- dfm_review_text %>%
  topfeatures(n = 200) %>%
  names()
# Feature co-occurrence matrix (FCM) computed from the review-text DFM
tag_fcm_review_text <- fcm(dfm_review_text)
head(tag_fcm_review_text)
## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
## features
## features menginspirasi spiritual mental buku moral membantu
## menginspirasi 0 2 1 7 0 0
## spiritual 0 2 3 19 1 0
## mental 0 0 0 1 0 0
## buku 0 0 0 699 3 38
## moral 0 0 0 0 0 1
## membantu 0 0 0 0 0 7
# Restrict the FCM to the selected top features, then plot the network
topgat_fcm_review <- fcm_select(
  tag_fcm_review_text,
  pattern = toptag_review_text
)
textplot_network(
  topgat_fcm_review,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_size = 5
)

# Co-occurrence network built from the top 20 Indonesian features
toptag_review_text_20 <- names(topfeatures(dfm_review_text, 20))
# Feature co-occurrence matrix (FCM) computed from the review-text DFM
tag_fcm_review_text_20 <- fcm(dfm_review_text)
# Preview the FCM created just above. This used to preview
# tag_fcm_review_text (the object from the top-200 section) by mistake; both
# objects come from fcm(dfm_review_text), so the printed output is the same.
head(tag_fcm_review_text_20)
## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
## features
## features menginspirasi spiritual mental buku moral membantu
## menginspirasi 0 2 1 7 0 0
## spiritual 0 2 3 19 1 0
## mental 0 0 0 1 0 0
## buku 0 0 0 699 3 38
## moral 0 0 0 0 0 1
## membantu 0 0 0 0 0 7
# Restrict the FCM to the top 20 features, then plot the network
topgat_fcm_review_20 <- fcm_select(tag_fcm_review_text_20, pattern = toptag_review_text_20)
textplot_network(topgat_fcm_review_20, min_freq = 0.1, edge_alpha = 0.8, edge_size = 5)
