text_analysis.utf8.md

library(quanteda)

## Package version: 1.5.1

## Parallel computing: 2 of 4 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
## 
##     View

library(readtext)
library(ggplot2)
# Read the assignment CSV with readtext(); the columns sam$raw_text and
# sam$review_text are used by the rest of the script.
sam <- readtext(
  "C:/Users/somy/Documents/text_analysis_intern/Demo_Task - assignment.csv"
)


# Create the corpus (a collection of text documents over which the text
# analysis is applied).

# Separate corpus for the raw text.
# Intended preprocessing: 1) remove stop words, 2) stem, 3) remove hyphens,
# 4) remove numbers, 5) lower-case all letters (done here with tolower()
# before the corpus is built).
#
# NOTE(review): the rendered output for this chunk reports
# "Argument stem not used", and summary() lists documents named text1.1,
# text1.2, ... -- so corpus_reshape() appears to re-segment the texts
# (presumably into sentences, its default unit -- confirm against the quanteda
# documentation) rather than clean them.
# NOTE(review): `remove_hyphen` is spelled `remove_hyphens` in the review-text
# pipeline and in the dfm() calls -- verify which spelling is honoured.
# NOTE(review): it is not evident that corpus() accepts a `remove` argument;
# stop-word, punctuation and number removal is requested again when the DFM
# is built with dfm().

corp_raw_text<-corpus(tolower(sam$raw_text))%>%
  corpus_reshape(stem=T,remove_punct = TRUE,remove_numbers=T,remove_hyphen=T)%>%
  corpus(remove=stopwords('english'))

## Warning: Argument stem not used.

# View the first text of the raw-text corpus.
corp_raw_text[1]

##                               text1.1 
## "spiritually and mentally inspiring!"

# Create the tokens for raw_text: tokens() splits the text into words.
# The tokens are built from the original sam$raw_text column, not from
# corp_raw_text.
# NOTE(review): toks_raw_text is not referenced again in the part of the file
# visible here.
toks_raw_text <- tokens(sam$raw_text)

# Per-document overview of the raw-text corpus.
summary(corp_raw_text)

## Warning in nsentence.character(object, ...): nsentence() does not correctly
## count sentences in all lower-cased text

## Corpus consisting of 57128 documents, showing 100 documents:
## 
##      Text Types Tokens Sentences
##   text1.1     5      5         1
##   text1.2    17     19         1
##   text2.1     7      7         1
##   text3.1     6      6         1
##   text4.1    12     12         1
##   text5.1    17     20         1
##   text6.1     6      6         1
##   text7.1    13     13         1
##   text8.1    25     29         1
##   text9.1    11     13         1
##  text10.1    14     15         1
##  text11.1    14     16         1
##  text12.1    30     33         1
##  text13.1    18     23         1
##  text14.1    23     26         1
##  text15.1     9      9         1
##  text16.1    14     14         1
##  text17.1    15     15         1
##  text18.1     3      3         1
##  text19.1    18     20         1
##  text20.1    19     21         1
##  text21.1    24     29         1
##  text22.1     4      4         1
##  text23.1     3      3         1
##  text24.1     3      3         1
##  text25.1    11     11         1
##  text26.1    27     29         1
##  text27.1    25     32         1
##  text28.1     4      4         1
##  text29.1    19     20         1
##  text30.1    22     23         1
##  text31.1    17     21         1
##  text32.1    14     15         1
##  text33.1    22     25         1
##  text34.1    10     10         1
##  text35.1    10     10         1
##  text36.1     4      4         1
##  text37.1     5      5         1
##  text38.1    22     22         1
##  text39.1    22     24         1
##  text40.1     6      7         1
##  text41.1     3      3         1
##  text42.1    22     25         1
##  text43.1    18     22         1
##  text44.1     4      4         1
##  text45.1     8     10         1
##  text46.1    11     12         1
##  text47.1     9      9         1
##  text48.1    20     22         1
##  text49.1     8      8         1
##  text50.1    17     17         1
##  text51.1    10     11         1
##  text52.1    20     22         1
##  text53.1    31     38         1
##  text54.1    12     14         1
##  text55.1    34     67         1
##  text56.1    16     17         1
##  text57.1    21     23         1
##  text58.1    17     21         1
##  text59.1    18     18         1
##  text60.1    15     16         1
##  text61.1    22     24         1
##  text62.1    21     27         1
##  text63.1    12     16         1
##  text64.1    18     18         1
##  text65.1    12     13         1
##  text66.1    28     35         1
##  text67.1    14     15         1
##  text68.1    10     10         1
##  text69.1     9      9         1
##  text69.2    17     17         1
##  text70.1    16     16         1
##  text71.1    21     24         1
##  text72.1     5      5         1
##  text73.1     9      9         1
##  text74.1    13     15         1
##  text75.1    15     15         1
##  text76.1    15     16         1
##  text77.1    10     11         1
##  text78.1    11     12         1
##  text79.1    11     12         1
##  text80.1    12     14         1
##  text81.1    24     26         1
##  text82.1    22     30         1
##  text83.1    10     11         1
##  text84.1    14     14         1
##  text85.1     9      9         1
##  text86.1    21     23         1
##  text87.1    27     28         1
##  text88.1    16     17         1
##  text89.1    24     27         1
##  text90.1     5      5         1
##  text91.1    34     38         1
##  text92.1     7      7         1
##  text93.1     7      7         1
##  text94.1    11     14         1
##  text95.1    26     31         1
##  text96.1    17     19         1
##  text97.1     4      4         1
##  text98.1    11     12         1
## 
## Source: C:/Users/somy/Documents/text_analysis_intern/* on x86-64 by somy
## Created: Sun Nov 10 14:12:46 2019
## Notes: corpus_reshape.corpus(., stem = T, remove_punct = TRUE, remove_numbers = T, remove_hyphen = T)

# Separate corpus for the review text.
# Intended preprocessing: 1) remove stop words, 2) stem, 3) remove hyphens,
# 4) remove numbers, 5) lower-case all letters (done here with tolower()
# after coercing the column with as.character()).
#
# NOTE(review): the rendered output for this chunk reports
# "Argument stem not used", and summary() lists documents named text1.1,
# text1.2, ... -- so corpus_reshape() appears to re-segment the texts
# (presumably into sentences, its default unit -- confirm against the quanteda
# documentation) rather than clean them.
# NOTE(review): it is not evident that corpus() accepts a `remove` argument;
# the Indonesian stop words are requested again when the DFM is built with
# dfm().

corp_review_text<-corpus(tolower(as.character(sam$review_text))) %>%
  corpus_reshape(stem=T,remove_punct = TRUE,remove_numbers=T,remove_hyphens=T)%>%
  corpus(remove=stopwords('id',source ='stopwords-iso'))

## Warning: Argument stem not used.

# View the first text of the review-text corpus.
corp_review_text[1]

##                                      text1.1 
## "menginspirasi secara spiritual dan mental!"

# Create the tokens for review_text: tokens() splits the text into words.
# The tokens are built from the original sam$review_text column, not from
# corp_review_text.
# NOTE(review): tok_review_text is not referenced again in the part of the
# file visible here.

tok_review_text<-tokens(sam$review_text)

# Per-document overview of the review-text corpus.
summary(corp_review_text)

## Warning in nsentence.character(object, ...): nsentence() does not correctly
## count sentences in all lower-cased text

## Corpus consisting of 57079 documents, showing 100 documents:
## 
##      Text Types Tokens Sentences
##   text1.1     6      6         1
##   text1.2    14     17         1
##   text2.1     9      9         1
##   text3.1     4      4         1
##   text4.1    11     11         1
##   text5.1    15     15         1
##   text6.1     6      6         1
##   text7.1    11     12         1
##   text8.1    20     24         1
##   text9.1    11     12         1
##  text10.1    13     14         1
##  text11.1    12     13         1
##  text12.1    31     34         1
##  text13.1    16     22         1
##  text14.1    18     20         1
##  text15.1     9      9         1
##  text16.1    12     12         1
##  text17.1    13     13         1
##  text18.1     2      2         1
##  text19.1    18     20         1
##  text20.1    16     17         1
##  text21.1    21     22         1
##  text22.1     3      3         1
##  text23.1     3      3         1
##  text24.1     2      2         1
##  text25.1    10     10         1
##  text26.1    22     27         1
##  text27.1    24     27         1
##  text28.1     3      3         1
##  text29.1    16     16         1
##  text30.1    19     20         1
##  text31.1    15     18         1
##  text32.1    13     14         1
##  text33.1    21     23         1
##  text34.1     9      9         1
##  text35.1     8      8         1
##  text36.1     3      3         1
##  text37.1     4      4         1
##  text38.1    17     18         1
##  text39.1    22     26         1
##  text40.1     5      5         1
##  text41.1     4      4         1
##  text42.1    19     22         1
##  text43.1    15     17         1
##  text44.1     4      4         1
##  text45.1     6      7         1
##  text46.1    11     12         1
##  text47.1     7      8         1
##  text48.1    17     18         1
##  text49.1     6      7         1
##  text50.1    14     15         1
##  text51.1    11     11         1
##  text52.1    17     18         1
##  text53.1    27     35         1
##  text54.1    11     13         1
##  text55.1    34     66         1
##  text56.1    16     17         1
##  text57.1    18     22         1
##  text58.1    21     24         1
##  text59.1    18     18         1
##  text60.1    12     14         1
##  text61.1    18     18         1
##  text62.1    17     22         1
##  text63.1    12     13         1
##  text64.1    17     17         1
##  text65.1    12     13         1
##  text66.1    28     36         1
##  text67.1    12     13         1
##  text68.1    11     12         1
##  text69.1     9      9         1
##  text69.2    13     13         1
##  text70.1    14     14         1
##  text71.1    19     22         1
##  text72.1     4      4         1
##  text73.1     9      9         1
##  text74.1    11     13         1
##  text75.1    14     15         1
##  text76.1    14     15         1
##  text77.1     8      8         1
##  text78.1    12     14         1
##  text79.1     8      9         1
##  text80.1    11     13         1
##  text81.1    19     21         1
##  text82.1    24     32         1
##  text83.1     8      8         1
##  text84.1    13     14         1
##  text85.1     6      7         1
##  text86.1    18     19         1
##  text87.1    24     25         1
##  text88.1    12     13         1
##  text89.1    16     18         1
##  text90.1     4      4         1
##  text91.1    30     34         1
##  text92.1     6      6         1
##  text93.1     9      9         1
##  text94.1    12     14         1
##  text95.1    23     27         1
##  text96.1    14     17         1
##  text97.1     4      4         1
##  text98.1     9      9         1
## 
## Source: C:/Users/somy/Documents/text_analysis_intern/* on x86-64 by somy
## Created: Sun Nov 10 14:13:51 2019
## Notes: corpus_reshape.corpus(., stem = T, remove_punct = TRUE, remove_numbers = T, remove_hyphens = T)

# View the first text of the review-text corpus.
# NOTE(review): this repeats the identical corp_review_text[1] inspection made
# right after the corpus was created.

corp_review_text[1]

##                                      text1.1 
## "menginspirasi secara spiritual dan mental!"

# Create the document-feature matrix (DFM) for the raw text: a matrix that
# describes the frequency of terms occurring in a collection of documents.
# English stop words, punctuation, numbers and hyphens are removed while the
# DFM is built.
# The corpus is passed to dfm() directly: the previous corpus_subset() call
# had no subset condition, so it selected every document and changed nothing.
dfm_raw_text <- dfm(
  corp_raw_text,
  remove = stopwords("english"),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_hyphens = TRUE
)

#creation of lsa model

#raw_lsa <- textmodel_lsa(dfm_raw_text)



# Trim the document-feature matrix to the words that occur at least 10 times
# (min_termfreq = 10).
dfm_raw_text_trim <- dfm_trim(dfm_raw_text, min_termfreq = 10, verbose = FALSE)

# Fix the random seed before plotting (presumably so the word-cloud layout is
# reproducible between runs).
set.seed(100)

# Word cloud of the trimmed DFM. The stray trailing comma (an empty argument)
# in the original call has been removed.
textplot_wordcloud(dfm_raw_text_trim)

# Feature frequencies for raw_text, limited to n = 100 features.
features_dfm_raw_text <- textstat_frequency(dfm_raw_text, n = 100)

# Reorder the `feature` levels by descending frequency so the x axis of the
# plot follows the frequency ranking.
features_dfm_raw_text$feature <- reorder(
  features_dfm_raw_text$feature,
  -features_dfm_raw_text$frequency
)

# Dot plot of frequency per feature, with the feature labels rotated 90 degrees.
ggplot(data = features_dfm_raw_text, mapping = aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# First 20 stop words of the English ("en") list used for raw_text.
head(stopwords("en"), n = 20)

##  [1] "i"          "me"         "my"         "myself"     "we"        
##  [6] "our"        "ours"       "ourselves"  "you"        "your"      
## [11] "yours"      "yourself"   "yourselves" "he"         "him"       
## [16] "his"        "himself"    "she"        "her"        "hers"

# Network of English words, step 1: names of the 200 top features of the
# raw-text DFM.
toptag_raw_text <- names(topfeatures(dfm_raw_text, n = 200))

# Feature co-occurrence matrix (FCM) of the raw-text DFM, plus a preview of
# its first rows.
tag_fcm_raw_text <- fcm(dfm_raw_text)
head(tag_fcm_raw_text)

## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
##              features
## features      spiritually mentally inspiring book allows question
##   spiritually           0        1         1    2      0        0
##   mentally              0        0         1    0      0        0
##   inspiring             0        0         0    5      0        0
##   book                  0        0         0  149      9        2
##   allows                0        0         0    0      1        1
##   question              0        0         0    0      0        0

# Restrict the FCM to the top-200 features, then draw the co-occurrence network.
topgat_fcm_raw <- fcm_select(tag_fcm_raw_text, pattern = toptag_raw_text)
textplot_network(
  topgat_fcm_raw,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_size = 5
)

## Registered S3 method overwritten by 'network':
##   method            from    
##   summary.character quanteda

# Network of English words, top-20 variant: names of the 20 top features of
# the raw-text DFM.
toptag_raw_text_20 <- names(topfeatures(dfm_raw_text, n = 20))

# Feature co-occurrence matrix of the raw-text DFM, plus a preview of its
# first rows.
tag_fcm_raw_text_20 <- fcm(dfm_raw_text)
head(tag_fcm_raw_text_20)

## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
##              features
## features      spiritually mentally inspiring book allows question
##   spiritually           0        1         1    2      0        0
##   mentally              0        0         1    0      0        0
##   inspiring             0        0         0    5      0        0
##   book                  0        0         0  149      9        2
##   allows                0        0         0    0      1        1
##   question              0        0         0    0      0        0

# Restrict the FCM to the top-20 features, then draw the co-occurrence network.
topgat_fcm_raw_20 <- fcm_select(
  tag_fcm_raw_text_20,
  pattern = toptag_raw_text_20
)
textplot_network(
  topgat_fcm_raw_20,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_size = 5
)

# Indonesian (review_text) section.
# Build the document-feature matrix for the review text. Stop words from the
# "id" list of the stopwords-iso source, punctuation, numbers and hyphens are
# removed while the DFM is built.
# The corpus is passed to dfm() directly: the previous corpus_subset() call
# had no subset condition, so it selected every document and changed nothing.
dfm_review_text <- dfm(
  corp_review_text,
  remove = stopwords("id", source = "stopwords-iso"),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_hyphens = TRUE
)

# Trim the document-feature matrix to the words that occur at least 10 times
# (min_termfreq = 10).
dfm_review_text_trim <- dfm_trim(
  dfm_review_text,
  min_termfreq = 10,
  verbose = FALSE
)


# Fix the random seed, then draw the word cloud of the trimmed review-text DFM
# with a six-colour palette.
set.seed(100)
textplot_wordcloud(
  dfm_review_text_trim,
  color = c("brown", "pink", "green", "purple", "orange", "blue")
)

#creation of lsa model
#raw_lsa <- textmodel_lsa(dfm_review_text)

# Feature frequencies for review_text, limited to n = 100 features.
features_dfm_review_text <- textstat_frequency(dfm_review_text, n = 100)

# Reorder the `feature` levels by descending frequency so the x axis of the
# plot follows the frequency ranking.
features_dfm_review_text$feature <- reorder(
  features_dfm_review_text$feature,
  -features_dfm_review_text$frequency
)

# Plot the same information with ggplot (grammar of graphics): frequency per
# feature, with the feature labels rotated 90 degrees.
ggplot(
  data = features_dfm_review_text,
  mapping = aes(x = feature, y = frequency)
) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# First 20 stop words of the "id" list (stopwords-iso source) used for
# review_text.
head(stopwords("id", source = "stopwords-iso"), n = 20)

##  [1] "ada"      "adalah"   "adanya"   "adapun"   "agak"     "agaknya" 
##  [7] "agar"     "akan"     "akankah"  "akhir"    "akhiri"   "akhirnya"
## [13] "aku"      "akulah"   "amat"     "amatlah"  "anda"     "andalah" 
## [19] "antar"    "antara"

# Network of Indonesian words, step 1: names of the 200 top features of the
# review-text DFM.
toptag_review_text <- names(topfeatures(dfm_review_text, n = 200))

# Feature co-occurrence matrix (FCM) of the review-text DFM, plus a preview of
# its first rows.
tag_fcm_review_text <- fcm(dfm_review_text)
head(tag_fcm_review_text)

## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
##                features
## features        menginspirasi spiritual mental buku moral membantu
##   menginspirasi             0         2      1    7     0        0
##   spiritual                 0         2      3   19     1        0
##   mental                    0         0      0    1     0        0
##   buku                      0         0      0  699     3       38
##   moral                     0         0      0    0     0        1
##   membantu                  0         0      0    0     0        7

# Restrict the FCM to the top-200 features, then draw the co-occurrence network.
topgat_fcm_review <- fcm_select(
  tag_fcm_review_text,
  pattern = toptag_review_text
)
textplot_network(
  topgat_fcm_review,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_size = 5
)

# Network of Indonesian words, top-20 variant: names of the 20 top features of
# the review-text DFM.
toptag_review_text_20 <- names(topfeatures(dfm_review_text, 20))

# Feature co-occurrence matrix of the review-text DFM.
tag_fcm_review_text_20 <- fcm(dfm_review_text)

# Preview the matrix built on the line above. The original call previewed
# tag_fcm_review_text, the object from the top-200 section, which looks like a
# copy-paste slip.
head(tag_fcm_review_text_20)

## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
##                features
## features        menginspirasi spiritual mental buku moral membantu
##   menginspirasi             0         2      1    7     0        0
##   spiritual                 0         2      3   19     1        0
##   mental                    0         0      0    1     0        0
##   buku                      0         0      0  699     3       38
##   moral                     0         0      0    0     0        1
##   membantu                  0         0      0    0     0        7

# Restrict the FCM to the top-20 features, then draw the co-occurrence network.
topgat_fcm_review_20 <- fcm_select(
  tag_fcm_review_text_20,
  pattern = toptag_review_text_20
)
textplot_network(
  topgat_fcm_review_20,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_size = 5
)

text_analysis.R

somy

2019-11-10