#knitr::opts_chunk$set(echo = TRUE)
library(gutenbergr)
library(widyr)
library(igraph)
library(knitr)
library(NLP)
library(ggraph)
library(readr)
library(tm)
library(data.table)
library(dplyr)
library(stringr)
library(jiebaR)
library(udpipe)
library(tidytext)
library(ggplot2)
library(tidyr)
library(topicmodels)
library(LDAvis)
library(wordcloud)
library(wordcloud2)
library(webshot)
library(htmlwidgets)
library(servr)
library(purrr)
library(ramify)
library(reshape2)
library(scales)
library(RColorBrewer)
library(caTools)
library(caret)
library(rpart)
library(rpart.plot)
library(e1071)
library(textstem)
# Question 1: build a directed graph and report its density, reciprocity, and diameter
g2 <- graph(edges = c(1,2, 1,3, 1,4, 2,4, 2,5, 3,6, 4,3, 4,6, 4,7, 5,4, 5,7, 7,6), n = 7)
plot(g2)
# density: observed edges over all possible directed edges
ecount(g2)/(vcount(g2)*(vcount(g2)-1))
## [1] 0.2857143
#reciprocity
reciprocity(g2)
## [1] 0
# diameter: the vertices along the longest geodesic
diam <- get_diameter(g2, directed = TRUE)
diam
## + 3/7 vertices, from 87f91a7:
## [1] 1 2 5
diameter(g2, directed=T, weights=NA)
## [1] 2
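# Cross-check (editorial sketch, not part of the original assignment): igraph's
# built-in edge_density() should reproduce the manual density above, and
# mean_distance() complements the diameter with the average geodesic length.
edge_density(g2, loops = FALSE)
mean_distance(g2, directed = TRUE)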
# Question 2: LDA topic model on The Picture of Dorian Gray (174) and Adventures of Huckleberry Finn (76)
DH_174_76 <- gutenberg_download(c(174, 76)) %>%
  filter(text != '') %>%
  mutate(document = text) # each non-empty line of text serves as a "document"
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#mutate(document = row_number())
by_chapter_174_76 <- DH_174_76 %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  filter(chapter != 0) %>%
  select(gutenberg_id, text, document) %>%
  filter(!text %like% 'CHAPTER') # drop the chapter-heading lines themselves (data.table's %like%)
# split into words
by_chapter_word_174_76 <- by_chapter_174_76 %>%
unnest_tokens(word, text)
# find document-word counts
word_counts_174_76 <- by_chapter_word_174_76 %>%
anti_join(stop_words) %>%
count(document, word, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
word_counts_174_76
## # A tibble: 58,242 x 3
## document word n
## <chr> <chr> <int>
## 1 “Tramp--tramp--tramp; that's the dead; tramp--tramp--tramp; th~ tramp 6
## 2 bum! bumble-umble-um-bum-bum-bum-bum--and the thunder would go~ bum 5
## 3 he made all sorts of signs with his hands and said “Goo-goo--g~ goo 5
## 4 all a-tremblin', en crope aroun' en open de do' easy en slow, ~ en 4
## 5 “A fess--a fess is--_you_ don't need to know what a fess is. I~ fess 3
## 6 “Drinkin'? Has I ben a-drinkin'? Has I had a chance to be a-dr~ drinkin 3
## 7 “Oh, he's dead, he's dead, I know he's dead!” dead 3
## 8 “Oh, your theories about life, your theories about love, your ~ theori~ 3
## 9 “Trouble has done it, Bilgewater, trouble has done it; trouble~ trouble 3
## 10 “Well, you wouldn't a ben here 'f it hadn't a ben for Jim. You~ ben 3
## # ... with 58,232 more rows
DH_dtm <- word_counts_174_76 %>%
cast_dtm(document, word, n)
#DH_dtm
DH_lda <- LDA(DH_dtm, k = 2, control = list(seed = 2020))
#DH_lda
DH_topics <- tidy(DH_lda, matrix = "beta")
#DH_topics
DH_terms <- DH_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
#DH_terms
DH_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
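# A hedged follow-up (not in the original run): besides the per-topic word
# probabilities ("beta"), tidy() can extract the per-document topic mixtures
# ("gamma") from the same fitted model.
DH_gamma <- tidy(DH_lda, matrix = "gamma")
head(DH_gamma %>% arrange(desc(gamma)))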
DH_174 <- gutenberg_download(c(174)) %>%
  mutate(title = 'Dorian_Gray')
DH_76 <- gutenberg_download(c(76)) %>%
  mutate(title = 'Huckleberry')
DH_all <- rbind(DH_174, DH_76) %>%
  mutate(document = row_number()) # now each line gets a numeric document id
tidy_books <- DH_all %>%
unnest_tokens(word, text) %>%
group_by(word) %>%
anti_join(stop_words) %>%
filter(n() > 10) %>% # keep only words that appear more than 10 times
ungroup()
## Joining, by = "word"
tidy_books$lemma = lemmatize_words(tidy_books$word)
head(tidy_books)
## # A tibble: 6 x 5
## gutenberg_id title document word lemma
## <int> <chr> <int> <chr> <chr>
## 1 174 Dorian_Gray 1 picture picture
## 2 174 Dorian_Gray 1 dorian dorian
## 3 174 Dorian_Gray 1 gray gray
## 4 174 Dorian_Gray 9 chapter chapter
## 5 174 Dorian_Gray 10 chapter chapter
## 6 174 Dorian_Gray 11 chapter chapter
dtm = tidy_books %>%
count(document,lemma) %>%
cast_dtm(document,lemma,n)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity : 88%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs art artist beautiful chapter dorian form gray manner mode picture
## 1 0 0 0 0 1 0 1 0 0 1
## 10 0 0 0 1 0 0 0 0 0 0
## 11 0 0 0 1 0 0 0 0 0 0
## 12 0 0 0 1 0 0 0 0 0 0
## 13 0 0 0 1 0 0 0 0 0 0
## 14 0 0 0 1 0 0 0 0 0 0
## 15 0 0 0 1 0 0 0 0 0 0
## 16 0 0 0 1 0 0 0 0 0 0
## 17 0 0 0 1 0 0 0 0 0 0
## 9 0 0 0 1 0 0 0 0 0 0
Dorian_Gray = unique(tidy_books$document[which(tidy_books$title == "Dorian_Gray")]) # document ids belonging to Dorian Gray
dtm = as.data.frame(as.matrix(dtm))
dtm$is_Dorian_Gray = as.integer(rownames(dtm)) %in% Dorian_Gray
dtm$is_Dorian_Gray = as.factor(ifelse(dtm$is_Dorian_Gray,1,0)) # 1 if the document is from Dorian Gray, 0 otherwise
table(dtm$is_Dorian_Gray)
##
## 0 1
## 8245 6119
set.seed(123)
spl = sample.split(dtm$is_Dorian_Gray, 0.7) # stratified 7:3 train/test split, preserving the is_Dorian_Gray class ratio
TR = subset(dtm, spl == TRUE)
TS = subset(dtm, spl == FALSE)
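# rpart and rpart.plot are loaded above but never used; a minimal sketch of a
# classification tree on the same split would look like this (left commented
# since fitting on ~900 term columns was not part of the original run):
#cart.fit = rpart(is_Dorian_Gray ~ ., data = TR, method = "class")
#rpart.plot(cart.fit)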
document_words <- tidy_books %>% # term frequency of each distinct lemma within each document
count(document, lemma, sort = TRUE)
total_words <- document_words %>%
group_by(document) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
mutate(is_Dorian_Gray = document %in% Dorian_Gray)
## Joining, by = "document"
total_words = total_words %>%
bind_tf_idf(lemma,document,n)
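# Sanity check (editorial, not in the original): the highest-tf_idf lemmas
# should be book-specific vocabulary rather than common words.
total_words %>% arrange(desc(tf_idf)) %>% head()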
dtm.tfidf = total_words %>% cast_dtm(document,lemma,tf_idf)
dim(dtm.tfidf)
## [1] 14364 922
inspect(dtm.tfidf[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity : 88%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs art artist beautiful chapter dorian form gray manner mode picture
## 1 0 0 0 0.000000 1.188709 0 1.421404 0 0 1.676395
## 10 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 11 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 12 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 13 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 14 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 15 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 16 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 17 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 9 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
dtm.tfidf = as.data.frame(as.matrix(dtm.tfidf))
dtm.tfidf$is_Dorian_Gray = as.integer(rownames(dtm.tfidf)) %in% Dorian_Gray
dtm.tfidf$is_Dorian_Gray = as.factor(ifelse(dtm.tfidf$is_Dorian_Gray,1,0))
set.seed(123)
spl = sample.split(dtm.tfidf$is_Dorian_Gray, 0.7)
TR.tfidf = subset(dtm.tfidf, spl == TRUE)
TS.tfidf = subset(dtm.tfidf, spl == FALSE)
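# Note: only the raw-count split (TR/TS) is modeled below; a parallel SVM on
# the tf-idf features would be the following (a sketch, not run here):
#svm.tfidf = svm(is_Dorian_Gray ~ ., TR.tfidf, kernel = "linear", cost = 10, scale = FALSE)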
svm.fit = svm(is_Dorian_Gray ~ ., TR, kernel = "linear", cost = 10, scale = FALSE)
p.svm = predict(svm.fit, TS)
confusionMatrix(p.svm,TS$is_Dorian_Gray,dnn = c("Prediction","Reference"),positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2205 348
## 1 268 1488
##
## Accuracy : 0.857
## 95% CI : (0.8462, 0.8674)
## No Information Rate : 0.5739
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.706
##
## Mcnemar's Test P-Value : 0.001458
##
## Sensitivity : 0.8105
## Specificity : 0.8916
## Pos Pred Value : 0.8474
## Neg Pred Value : 0.8637
## Prevalence : 0.4261
## Detection Rate : 0.3453
## Detection Prevalence : 0.4075
## Balanced Accuracy : 0.8510
##
## 'Positive' Class : 1
##
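# e1071 (loaded above) also provides a Naive Bayes baseline; a hedged sketch on
# the same split, left commented because it was not part of the original run:
#nb.fit = naiveBayes(is_Dorian_Gray ~ ., data = TR)
#p.nb = predict(nb.fit, TS)
#confusionMatrix(p.nb, TS$is_Dorian_Gray, positive = "1")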
#t0 = Sys.time()
## fitting this glm takes about 4 minutes
#glm = glm(is_Dorian_Gray~.,TR,family = "binomial") # is_Dorian_Gray as y, every other column as x
#Sys.time() - t0
#glm.pred = predict(glm,TS,type="response") # returns the probability that is_Dorian_Gray is 1
#cm = table(actual = TS$is_Dorian_Gray, pred = glm.pred >= 0.5)
#cm
# Question 2B: treat each whole book as one document and fit a logistic regression on tf-idf
DH_174 <- gutenberg_download(c(174))
tidy_DH_174 <- DH_174 %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_tidy_DH_c_174 <- tidy_DH_174 %>%
  count(word, sort = TRUE) %>%
  mutate(document = 'Dorian_Gray')
DH_76 <- gutenberg_download(c(76))
tidy_DH_76 <- DH_76 %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_tidy_DH_c_76 <- tidy_DH_76 %>%
  count(word, sort = TRUE) %>%
  mutate(document = 'Huckleberry')
tidy_tidy_DH_c_all <- rbind(tidy_tidy_DH_c_174, tidy_tidy_DH_c_76)
tidy_tidy_DH_c_all$lemma = lemmatize_words(tidy_tidy_DH_c_all$word)
head(tidy_tidy_DH_c_all)
## # A tibble: 6 x 4
## word n document lemma
## <chr> <int> <chr> <chr>
## 1 dorian 410 Dorian_Gray dorian
## 2 don’t 255 Dorian_Gray don’t
## 3 lord 248 Dorian_Gray lord
## 4 life 229 Dorian_Gray life
## 5 henry 223 Dorian_Gray henry
## 6 gray 189 Dorian_Gray gray
document_words <- tidy_tidy_DH_c_all %>% # term frequency of each distinct lemma within each document
count(document, lemma, sort = TRUE)
dtm = document_words %>% # cast the existing counts; a second count() here would collapse every n to 1
  cast_dtm(document, lemma, n)
total_words <- document_words %>%
group_by(document) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
mutate(is_Dorian_Gray = document %in% 'Dorian_Gray')
## Joining, by = "document"
total_words = total_words %>%
bind_tf_idf(lemma,document,n)
dtm.tfidf = total_words %>% cast_dtm(document,lemma,tf_idf)
dim(dtm.tfidf)
## [1] 2 7922
dtm.tfidf = as.data.frame(as.matrix(dtm.tfidf))
# rownames here are the document names ('Dorian_Gray', 'Huckleberry'), not row
# numbers, so match on the name directly; as.integer() would coerce them to NA
dtm.tfidf$is_Dorian_Gray = rownames(dtm.tfidf) %in% 'Dorian_Gray'
dtm.tfidf$is_Dorian_Gray = as.factor(ifelse(dtm.tfidf$is_Dorian_Gray, 1, 0))
dtm = as.data.frame(as.matrix(dtm))
dtm$is_Dorian_Gray = rownames(dtm) %in% 'Dorian_Gray'
dtm$is_Dorian_Gray = as.factor(ifelse(dtm$is_Dorian_Gray, 1, 0))
set.seed(123)
spl = sample.split(dtm$is_Dorian_Gray, 0.7) # stratified 7:3 split, preserving the is_Dorian_Gray ratio
TR = subset(dtm, spl == TRUE)
TS = subset(dtm, spl == FALSE)
set.seed(123)
spl = sample.split(dtm.tfidf$is_Dorian_Gray, 0.7)
TR.tfidf = subset(dtm.tfidf, spl == TRUE)
TS.tfidf = subset(dtm.tfidf, spl == FALSE)
t0 = Sys.time()
glm.tfidf = glm(is_Dorian_Gray~.,TR.tfidf,family = "binomial")
glm.tfidf.pred = predict(glm.tfidf,TS.tfidf,type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#confusionMatrix(factor(ifelse(glm.tfidf.pred >= 0.5,1,0)),factor(TS.tfidf$is_Dorian_Gray),positive="1")
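# Note: dim(dtm.tfidf) is 2 x 7922 (one row per book), so the 7:3 split leaves
# at most one test document and the confusion matrix above would be degenerate;
# that is presumably why the call is left commented out.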