#knitr::opts_chunk$set(echo = TRUE)
library(gutenbergr)
library(widyr)
library(igraph)
library(knitr)
library(NLP)
library(ggraph)
library(readr)
library(tm)
library(data.table)
library(dplyr)
library(stringr)
library(jiebaR)
library(udpipe)
library(tidytext)
library(ggplot2)
library(tidyr)
library(topicmodels)
library(LDAvis)
library(wordcloud)
library(wordcloud2)
library(webshot)
library(htmlwidgets)
library(servr)
library(purrr)
library(ramify)
library(reshape2)
library(scales)
library(RColorBrewer)
library(caTools)
library(caret)
library(rpart)
library(rpart.plot)
library(e1071)
library(textstem)
# Question 1: build a directed graph and report its density, reciprocity, and diameter
g2 <- graph(edges = c(1,2, 1,3, 1,4, 2,4, 2,5, 3,6, 4,3, 4,6, 4,7, 5,4, 5,7, 7,6), n = 7)
plot(g2)
# density: observed edges over all possible directed edges
ecount(g2)/(vcount(g2)*(vcount(g2)-1))
## [1] 0.2857143
#reciprocity
reciprocity(g2)
## [1] 0
# diameter: the vertices along the longest geodesic
diam <- get_diameter(g2, directed = TRUE)
diam
## + 3/7 vertices, from 87f91a7:
## [1] 1 2 5
diameter(g2, directed=T, weights=NA)
## [1] 2
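# Cross-check (editorial sketch, not part of the original assignment): igraph's
# built-in edge_density() should reproduce the manual density above, and
# mean_distance() complements the diameter with the average geodesic length.
edge_density(g2, loops = FALSE)
mean_distance(g2, directed = TRUE)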
# Question 2: LDA topic model on The Picture of Dorian Gray (174) and Adventures of Huckleberry Finn (76)
DH_174_76 <- gutenberg_download(c(174, 76)) %>%
  filter(text != '') %>%
  mutate(document = text) # each non-empty line of text serves as a "document"
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#mutate(document = row_number())
by_chapter_174_76 <- DH_174_76 %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  filter(chapter != 0) %>%
  select(gutenberg_id, text, document) %>%
  filter(!text %like% 'CHAPTER') # drop the chapter-heading lines themselves (data.table's %like%)
# split into words
by_chapter_word_174_76 <- by_chapter_174_76 %>%
unnest_tokens(word, text)
# find document-word counts
word_counts_174_76 <- by_chapter_word_174_76 %>%
anti_join(stop_words) %>%
count(document, word, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
word_counts_174_76
## # A tibble: 58,242 x 3
## document word n
## <chr> <chr> <int>
## 1 “Tramp--tramp--tramp; that's the dead; tramp--tramp--tramp; th~ tramp 6
## 2 bum! bumble-umble-um-bum-bum-bum-bum--and the thunder would go~ bum 5
## 3 he made all sorts of signs with his hands and said “Goo-goo--g~ goo 5
## 4 all a-tremblin', en crope aroun' en open de do' easy en slow, ~ en 4
## 5 “A fess--a fess is--_you_ don't need to know what a fess is. I~ fess 3
## 6 “Drinkin'? Has I ben a-drinkin'? Has I had a chance to be a-dr~ drinkin 3
## 7 “Oh, he's dead, he's dead, I know he's dead!” dead 3
## 8 “Oh, your theories about life, your theories about love, your ~ theori~ 3
## 9 “Trouble has done it, Bilgewater, trouble has done it; trouble~ trouble 3
## 10 “Well, you wouldn't a ben here 'f it hadn't a ben for Jim. You~ ben 3
## # ... with 58,232 more rows
DH_dtm <- word_counts_174_76 %>%
cast_dtm(document, word, n)
#DH_dtm
DH_lda <- LDA(DH_dtm, k = 2, control = list(seed = 2020))
#DH_lda
DH_topics <- tidy(DH_lda, matrix = "beta")
#DH_topics
DH_terms <- DH_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
#DH_terms
DH_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
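# A hedged follow-up (not in the original run): besides the per-topic word
# probabilities ("beta"), tidy() can extract the per-document topic mixtures
# ("gamma") from the same fitted model.
DH_gamma <- tidy(DH_lda, matrix = "gamma")
head(DH_gamma %>% arrange(desc(gamma)))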
DH_174 <- gutenberg_download(c(174)) %>%
  mutate(title = 'Dorian_Gray')
DH_76 <- gutenberg_download(c(76)) %>%
  mutate(title = 'Huckleberry')
DH_all <- rbind(DH_174, DH_76) %>%
  mutate(document = row_number()) # now each line gets a numeric document id
tidy_books <- DH_all %>%
unnest_tokens(word, text) %>%
group_by(word) %>%
anti_join(stop_words) %>%
filter(n() > 10) %>% # keep only words that appear more than 10 times
ungroup()
## Joining, by = "word"
tidy_books$lemma = lemmatize_words(tidy_books$word)
head(tidy_books)
## # A tibble: 6 x 5
## gutenberg_id title document word lemma
## <int> <chr> <int> <chr> <chr>
## 1 174 Dorian_Gray 1 picture picture
## 2 174 Dorian_Gray 1 dorian dorian
## 3 174 Dorian_Gray 1 gray gray
## 4 174 Dorian_Gray 9 chapter chapter
## 5 174 Dorian_Gray 10 chapter chapter
## 6 174 Dorian_Gray 11 chapter chapter
dtm = tidy_books %>%
count(document,lemma) %>%
cast_dtm(document,lemma,n)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity : 88%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs art artist beautiful chapter dorian form gray manner mode picture
## 1 0 0 0 0 1 0 1 0 0 1
## 10 0 0 0 1 0 0 0 0 0 0
## 11 0 0 0 1 0 0 0 0 0 0
## 12 0 0 0 1 0 0 0 0 0 0
## 13 0 0 0 1 0 0 0 0 0 0
## 14 0 0 0 1 0 0 0 0 0 0
## 15 0 0 0 1 0 0 0 0 0 0
## 16 0 0 0 1 0 0 0 0 0 0
## 17 0 0 0 1 0 0 0 0 0 0
## 9 0 0 0 1 0 0 0 0 0 0
Dorian_Gray = unique(tidy_books$document[which(tidy_books$title == "Dorian_Gray")]) # document ids belonging to Dorian Gray
dtm = as.data.frame(as.matrix(dtm))
dtm$is_Dorian_Gray = as.integer(rownames(dtm)) %in% Dorian_Gray
dtm$is_Dorian_Gray = as.factor(ifelse(dtm$is_Dorian_Gray,1,0)) # 1 if the document is from Dorian Gray, 0 otherwise
table(dtm$is_Dorian_Gray)
##
## 0 1
## 8245 6119
set.seed(123)
spl = sample.split(dtm$is_Dorian_Gray, 0.7) # stratified 7:3 train/test split, preserving the is_Dorian_Gray class ratio
TR = subset(dtm, spl == TRUE)
TS = subset(dtm, spl == FALSE)
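# rpart and rpart.plot are loaded above but never used; a minimal sketch of a
# classification tree on the same split would look like this (left commented
# since fitting on ~900 term columns was not part of the original run):
#cart.fit = rpart(is_Dorian_Gray ~ ., data = TR, method = "class")
#rpart.plot(cart.fit)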
document_words <- tidy_books %>% # term frequency of each distinct lemma within each document
count(document, lemma, sort = TRUE)
total_words <- document_words %>%
group_by(document) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
mutate(is_Dorian_Gray = document %in% Dorian_Gray)
## Joining, by = "document"
total_words = total_words %>%
bind_tf_idf(lemma,document,n)
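# Sanity check (editorial, not in the original): the highest-tf_idf lemmas
# should be book-specific vocabulary rather than common words.
total_words %>% arrange(desc(tf_idf)) %>% head()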
dtm.tfidf = total_words %>% cast_dtm(document,lemma,tf_idf)
dim(dtm.tfidf)
## [1] 14364 922
inspect(dtm.tfidf[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 12/88
## Sparsity : 88%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs art artist beautiful chapter dorian form gray manner mode picture
## 1 0 0 0 0.000000 1.188709 0 1.421404 0 0 1.676395
## 10 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 11 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 12 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 13 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 14 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 15 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 16 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 17 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
## 9 0 0 0 4.712668 0.000000 0 0.000000 0 0 0.000000
dtm.tfidf = as.data.frame(as.matrix(dtm.tfidf))
dtm.tfidf$is_Dorian_Gray = as.integer(rownames(dtm.tfidf)) %in% Dorian_Gray
dtm.tfidf$is_Dorian_Gray = as.factor(ifelse(dtm.tfidf$is_Dorian_Gray,1,0))
set.seed(123)
spl = sample.split(dtm.tfidf$is_Dorian_Gray, 0.7)
TR.tfidf = subset(dtm.tfidf, spl == TRUE)
TS.tfidf = subset(dtm.tfidf, spl == FALSE)
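# Note: only the raw-count split (TR/TS) is modeled below; a parallel SVM on
# the tf-idf features would be the following (a sketch, not run here):
#svm.tfidf = svm(is_Dorian_Gray ~ ., TR.tfidf, kernel = "linear", cost = 10, scale = FALSE)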
svm.fit = svm(is_Dorian_Gray ~ ., TR, kernel = "linear", cost = 10, scale = FALSE)
p.svm = predict(svm.fit, TS)
confusionMatrix(p.svm,TS$is_Dorian_Gray,dnn = c("Prediction","Reference"),positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2205 348
## 1 268 1488
##
## Accuracy : 0.857
## 95% CI : (0.8462, 0.8674)
## No Information Rate : 0.5739
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.706
##
## Mcnemar's Test P-Value : 0.001458
##
## Sensitivity : 0.8105
## Specificity : 0.8916
## Pos Pred Value : 0.8474
## Neg Pred Value : 0.8637
## Prevalence : 0.4261
## Detection Rate : 0.3453
## Detection Prevalence : 0.4075
## Balanced Accuracy : 0.8510
##
## 'Positive' Class : 1
##
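# e1071 (loaded above) also provides a Naive Bayes baseline; a hedged sketch on
# the same split, left commented because it was not part of the original run:
#nb.fit = naiveBayes(is_Dorian_Gray ~ ., data = TR)
#p.nb = predict(nb.fit, TS)
#confusionMatrix(p.nb, TS$is_Dorian_Gray, positive = "1")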
#t0 = Sys.time()
## fitting this glm takes about 4 minutes
#glm = glm(is_Dorian_Gray~.,TR,family = "binomial") # is_Dorian_Gray as y, every other column as x
#Sys.time() - t0
#glm.pred = predict(glm,TS,type="response") # returns the probability that is_Dorian_Gray is 1
#cm = table(actual = TS$is_Dorian_Gray, pred = glm.pred >= 0.5)
#cm
# Question 2B: treat each whole book as one document and fit a logistic regression on tf-idf
DH_174 <- gutenberg_download(c(174))
tidy_DH_174 <- DH_174 %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_tidy_DH_c_174 <- tidy_DH_174 %>%
  count(word, sort = TRUE) %>%
  mutate(document = 'Dorian_Gray')
DH_76 <- gutenberg_download(c(76))
tidy_DH_76 <- DH_76 %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_tidy_DH_c_76 <- tidy_DH_76 %>%
  count(word, sort = TRUE) %>%
  mutate(document = 'Huckleberry')
tidy_tidy_DH_c_all <- rbind(tidy_tidy_DH_c_174, tidy_tidy_DH_c_76)
tidy_tidy_DH_c_all$lemma = lemmatize_words(tidy_tidy_DH_c_all$word)
head(tidy_tidy_DH_c_all)
## # A tibble: 6 x 4
## word n document lemma
## <chr> <int> <chr> <chr>
## 1 dorian 410 Dorian_Gray dorian
## 2 don’t 255 Dorian_Gray don’t
## 3 lord 248 Dorian_Gray lord
## 4 life 229 Dorian_Gray life
## 5 henry 223 Dorian_Gray henry
## 6 gray 189 Dorian_Gray gray
document_words <- tidy_tidy_DH_c_all %>% # term frequency of each distinct lemma within each document
count(document, lemma, sort = TRUE)
dtm = document_words %>% # cast the existing counts; a second count() here would collapse every n to 1
  cast_dtm(document, lemma, n)
total_words <- document_words %>%
group_by(document) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
mutate(is_Dorian_Gray = document %in% 'Dorian_Gray')
## Joining, by = "document"
total_words = total_words %>%
bind_tf_idf(lemma,document,n)
dtm.tfidf = total_words %>% cast_dtm(document,lemma,tf_idf)
dim(dtm.tfidf)
## [1] 2 7922
dtm.tfidf = as.data.frame(as.matrix(dtm.tfidf))
# rownames here are the document names ('Dorian_Gray', 'Huckleberry'), not row
# numbers, so match on the name directly; as.integer() would coerce them to NA
dtm.tfidf$is_Dorian_Gray = rownames(dtm.tfidf) %in% 'Dorian_Gray'
dtm.tfidf$is_Dorian_Gray = as.factor(ifelse(dtm.tfidf$is_Dorian_Gray, 1, 0))
dtm = as.data.frame(as.matrix(dtm))
dtm$is_Dorian_Gray = rownames(dtm) %in% 'Dorian_Gray'
dtm$is_Dorian_Gray = as.factor(ifelse(dtm$is_Dorian_Gray, 1, 0))
set.seed(123)
spl = sample.split(dtm$is_Dorian_Gray, 0.7) # stratified 7:3 split, preserving the is_Dorian_Gray ratio
TR = subset(dtm, spl == TRUE)
TS = subset(dtm, spl == FALSE)
set.seed(123)
spl = sample.split(dtm.tfidf$is_Dorian_Gray, 0.7)
TR.tfidf = subset(dtm.tfidf, spl == TRUE)
TS.tfidf = subset(dtm.tfidf, spl == FALSE)
t0 = Sys.time()
glm.tfidf = glm(is_Dorian_Gray~.,TR.tfidf,family = "binomial")
glm.tfidf.pred = predict(glm.tfidf,TS.tfidf,type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#confusionMatrix(factor(ifelse(glm.tfidf.pred >= 0.5,1,0)),factor(TS.tfidf$is_Dorian_Gray),positive="1")
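# Note: dim(dtm.tfidf) is 2 x 7922 (one row per book), so the 7:3 split leaves
# at most one test document and the confusion matrix above would be degenerate;
# that is presumably why the call is left commented out.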