On May 15, 2021, Taipei and New Taipei City announced a Level 3 COVID-19 alert after the Wanhua cluster outbreak. We examine the topics of pandemic-related discussion on PTT after the two cities, and later the whole country, entered the Level 3 alert, together with the social network formed by posters and repliers.
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
library(data.table)
library(ggplot2)
library(dplyr)
library(jiebaR)
library(tidytext)
library(stringr)
library(tm)
library(topicmodels)
library(purrr)
library(RColorBrewer)
library(gridExtra)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
library(igraph)
library(scales)
library(showtext) # for Chinese fonts
showtext_auto()   # showtext.auto() is deprecated
metadata <- fread("ptt_prevention_articleMetaData.csv", encoding = "UTF-8")
metadata$artDate <- as.Date(metadata$artDate, "%Y/%m/%d")
metadata %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="red")+
geom_point()
#time.jpg
The mean number of comments per article is 70.51; the median is only 18, so the distribution is heavily right-skewed.
metadata %>%
select(artUrl, commentNum) %>%
ggplot() +
geom_density(aes(x = commentNum)) +
ggtitle("貼文的回覆數量分布") +
xlab("數量 (commentNum)")
summary(metadata$commentNum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 18.00 70.51 49.00 1478.00
Initialize a segmentation engine with default parameters.
# default engine
jieba_tokenizer = worker()
# add a user dictionary and stop words
user_dict <- scan(file = "user_dict.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
stop_words <- scan(file = "stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
#stop_words
new_user_word(jieba_tokenizer, c(user_dict))
## [1] TRUE
news_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)     # segment with the jieba worker
      tokens <- tokens[!tokens %in% stop_words] # drop stop words
      tokens <- tokens[nchar(tokens) > 1]       # drop single-character tokens
      return(tokens)
    }
  })
}
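As a quick sanity check, a minimal usage sketch on one made-up sentence (the exact tokens depend on the contents of user_dict.txt and stop_words.txt):
# hypothetical example sentence; output varies with the loaded dictionaries
news_tokenizer("雙北宣布進入三級警戒後民眾關心疫苗進度")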
Count how many times each token appears in every article.
# tokens <- metadata %>%
# unnest_tokens(word, sentence, token=news_tokenizer) %>%
# filter((!str_detect(word, regex("[0-9a-zA-Z]"))) | str_detect(word, regex("[Aa][Zz]"))) %>%
# #filter(!(word %in% stop_words)) %>%
# count(artUrl, word) %>%
# rename(count=n)
# tokens %>% head(20)
# tokens
# saveRDS(tokens, file = "0614token_result.rds")
tokens = readRDS(gzfile("0614token_result.rds"))
dtm <-tokens %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 1874, terms: 25447)>>
## Non-/sparse entries: 126658/47561020
## Sparsity : 100%
## Maximal term length: 26
## Weighting : term frequency (tf)
inspect(dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 28/72
## Sparsity : 72%
## Maximal term length: 2
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 差距 防疫 沒用 努力
## https://www.ptt.cc/bbs/Gossiping/M.1621008816.A.F02.html 1 2 1 2
## https://www.ptt.cc/bbs/Gossiping/M.1621012600.A.CFF.html 0 2 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621016951.A.D4A.html 0 3 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621025291.A.41B.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621029977.A.13B.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621031373.A.5E3.html 0 3 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1621033887.A.383.html 0 2 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621037027.A.1B8.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621038361.A.C39.html 0 1 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621039285.A.F03.html 0 1 0 0
## Terms
## Docs 失守 台灣 問題 疫苗
## https://www.ptt.cc/bbs/Gossiping/M.1621008816.A.F02.html 2 1 1 3
## https://www.ptt.cc/bbs/Gossiping/M.1621012600.A.CFF.html 0 2 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621016951.A.D4A.html 0 2 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621025291.A.41B.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621029977.A.13B.html 2 2 0 1
## https://www.ptt.cc/bbs/Gossiping/M.1621031373.A.5E3.html 0 0 0 3
## https://www.ptt.cc/bbs/Gossiping/M.1621033887.A.383.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621037027.A.1B8.html 0 0 1 0
## https://www.ptt.cc/bbs/Gossiping/M.1621038361.A.C39.html 0 0 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621039285.A.F03.html 0 0 2 0
## Terms
## Docs 真正 足夠
## https://www.ptt.cc/bbs/Gossiping/M.1621008816.A.F02.html 1 1
## https://www.ptt.cc/bbs/Gossiping/M.1621012600.A.CFF.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621016951.A.D4A.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621025291.A.41B.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621029977.A.13B.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621031373.A.5E3.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621033887.A.383.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621037027.A.1B8.html 0 0
## https://www.ptt.cc/bbs/Gossiping/M.1621038361.A.C39.html 2 0
## https://www.ptt.cc/bbs/Gossiping/M.1621039285.A.F03.html 0 0
# lda <- LDA(dtm, k = 4, control = list(seed = 2021))
# lda <- LDA(dtm, k = 4, control = list(seed = 2021,alpha = 2,delta=0.01),method = "Gibbs")
# saveRDS(lda, file = "lda.rds")
lda = readRDS(gzfile("lda.rds"))
topics_words <- tidy(lda, matrix = "beta")
colnames(topics_words) <- c("topic", "term", "phi")
head(topics_words)
## # A tibble: 6 x 3
## topic term phi
## <int> <chr> <dbl>
## 1 1 台灣 0.000000200
## 2 2 台灣 0.00829
## 3 3 台灣 0.000000269
## 4 4 台灣 0.0382
## 5 1 失守 0.000000200
## 6 2 失守 0.000000204
Sort the terms in each topic by phi in descending order and plot the top 10.
removed_word = c("防疫")
topics_words %>%
filter(! term %in% removed_word) %>%
group_by(topic) %>%
top_n(10, phi) %>%
ungroup() %>%
mutate(top_words = reorder_within(term,phi,topic)) %>%
ggplot(aes(x = top_words, y = phi, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
#lda.jpg
Fit models with 2, 4, 6, 10, and 15 topics, save the results, and compare them below.
# ldas = c()
# topics = c(2,4,6,10,15)
# for(topic in topics){
# start_time <- Sys.time()
# lda <- LDA(dtm, k = topic, control = list(seed = 2021))
# ldas =c(ldas,lda)
# print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
# save(ldas, file = "ldas_result.rdata") # write the fitted models to a file
# }
Load the saved LDA results for each topic number.
load("ldas_result.rdata")
topics = c(2,4,6,10,15)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
#perplexity.jpg
# install.packages("ldatuning")
library("ldatuning")
# result <- FindTopicsNumber(
# dtm,
# topics = topics,
# metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
# method = "Gibbs",
# control = list(seed = 2021),
# mc.cores = 2L,
# verbose = TRUE
# )
# saveRDS(result, file = "ldatuning_result.rds")
ldatuning_result = readRDS(gzfile("ldatuning_result.rds"))
FindTopicsNumber_plot(ldatuning_result)
#ldatuning.jpg
label_topics <- tidy(lda, matrix="gamma") %>% # document topic gamma
group_by(document) %>%
top_n(1, wt=gamma)
metadata_topic <- merge(x = metadata, y = label_topics, by.x = "artUrl", by.y="document")
# head(metadata_topic)
#write.csv(metadata_topic,file="metadata_topic.csv",row.names = FALSE)
the_lda = ldas[[2]] ## select the k = 4 model
topics_name = c("雙北疫情","中央疫情指揮中心","防疫生活","疫苗相關")
# for every document we have a probability distribution of its contained topics
tmResult <- posterior(the_lda)
doc_pro <- tmResult$topics
rownames_doc_pro <- row.names(doc_pro)
document_topics <- doc_pro[metadata[artUrl %in% rownames_doc_pro]$artUrl,]
document_topics_df =data.frame(document_topics)
colnames(document_topics_df) = topics_name
rownames(document_topics_df) = NULL
news_topic = cbind(metadata[artUrl %in% rownames_doc_pro], document_topics_df) # keep only the 1874 articles present in the DTM, avoiding the recycling warning
#saveRDS(news_topic, file = "news_topic.rds")
bar_data = news_topic %>%
group_by(artCat) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
select("artCat", "雙北疫情","中央疫情指揮中心","防疫生活","疫苗相關") %>%
melt(id.vars = "artCat")
bar_data %>%
ggplot( aes(x=artCat, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,5,8,12)])+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#barchart.jpg
# pie chart for Gossiping
Gossiping_pie = bar_data %>%
filter(artCat == "Gossiping") %>%
ggplot( aes(x=artCat, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,5,8,12)])+
coord_polar(theta = "y")+
theme(axis.title=element_blank(),axis.text=element_blank(),axis.ticks=element_blank())
# pie chart for HatePolitics
HatePolitics_pie = bar_data %>%
filter(artCat == "HatePolitics") %>%
ggplot( aes(x=artCat, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors[c(1,5,8,12)])+
coord_polar(theta = "y")+
theme(axis.title=element_blank(),axis.text=element_blank(),axis.ticks=element_blank())
# combine the two charts
grid.arrange(Gossiping_pie,HatePolitics_pie,ncol=2,nrow=1)
#piechart.jpg
This part switches to some Python tools to support the text mining.
The Stanford NLP Group’s official Python NLP library. It contains support for running various accurate natural language processing tools on 60+ languages and for accessing the Java Stanford CoreNLP software from Python.
import stanza
stanza.__version__
'1.1.1'
nlp = stanza.Pipeline('zh-hant', use_gpu=False)
2021-06-15 17:28:48 INFO: Loading these models for language: zh-hant (Traditional_Chinese):
=======================
| Processor | Package |
-----------------------
| tokenize | gsd |
| pos | gsd |
| lemma | gsd |
| depparse | gsd |
=======================
2021-06-15 17:28:48 INFO: Use device: cpu
2021-06-15 17:28:48 INFO: Loading: tokenize
2021-06-15 17:28:48 INFO: Loading: pos
2021-06-15 17:28:49 INFO: Loading: lemma
2021-06-15 17:28:49 INFO: Loading: depparse
2021-06-15 17:28:50 INFO: Done loading processors!
doc = nlp('現階段武漢肺炎還是主要敵人,我們應該專心做好防疫')
for i, sent in enumerate(doc.sentences):
    print('[Sentence {}]'.format(i+1))
    for word in sent.words:
        print('{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}'.format(
            word.text, word.lemma, word.pos, word.head, word.deprel))
    print('')
[Sentence 1]
現 現 NOUN 7 nmod:tmod
階段 階段 NOUN 4 nmod
武漢 武漢 PROPN 4 nmod
肺炎 肺炎 NOUN 7 nsubj
還是 還是 AUX 7 cop
主要 主要 ADJ 7 amod
敵人 敵人 NOUN 12 advcl
, , PUNCT 12 punct
我們 我 PRON 12 nsubj
應該 應該 AUX 12 aux
專心 專心 ADV 12 advmod
做 做 VERB 0 root
好防 好防 NOUN 12 obj
疫 疫 PUNCT 12 punct
!pip3 freeze | grep torch[^-]
facenet-pytorch==2.5.2
torch==1.7.1+cu101
torchtext==0.8.0
torchvision==0.8.2+cu101
import torch
import numpy as np
import pandas as pd
df = pd.read_csv('metadata_topic.csv')
df
artUrl | artTitle | artDate | artTime | artPoster | artCat | commentNum | push | boo | sentence | topic | gamma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | https://www.ptt.cc/bbs/Gossiping/M.1621008816…. | [問卦]防疫贏了別人一整年卻瞬間失守的關鍵? | 2021/05/14 | 16:13:31 | peter308 | Gossiping | 274 | 115 | 21 | 我感覺失守關鍵是疫苗只要疫苗買不到… | 4 | 0.709677 |
1 | https://www.ptt.cc/bbs/Gossiping/M.1621012600…. | [問卦]這次防疫破功的最大豬隊友是誰??? | 2021/05/14 | 17:16:37 | ljsnonocat2 | Gossiping | 272 | 150 | 33 | 這次讓台灣成功堅守一年的防疫整個破功的最大豬隊友是誰??1. 范雲. | 4 | 0.804878 |
2 | https://www.ptt.cc/bbs/Gossiping/M.1621016951…. | [問卦]怎麼不直接升到五級防疫!? | 2021/05/14 | 18:29:09 | ex250203 | Gossiping | 8 | 0 | 2 | 感覺現在台灣疫情這麼嚴重 居然連三級都沒有!?!… | 4 | 0.441176 |
3 | https://www.ptt.cc/bbs/Gossiping/M.1621025291…. | [問卦]確診後隔44小時後公佈算防疫破口嗎? | 2021/05/14 | 20:48:09 | windyyw | Gossiping | 251 | 89 | 24 | 現有執行政策,當天晚上6點後確診的,,要到第三天下午兩點公佈,… | 1 | 0.400000 |
4 | https://www.ptt.cc/bbs/Gossiping/M.1621029977…. | Re:[問卦]防疫贏了別人一整年卻瞬間失守的關鍵? | 2021/05/14 | 22:06:15 | iampig951753 | Gossiping | 23 | 9 | 1 | 防疫94這樣啊4瞬間ㄉ事情… | 4 | 0.634146 |
… | … | … | … | … | … | … | … | … | … | … | … | … |
1929 | https://www.ptt.cc/bbs/HatePolitics/M.16234895… | [黑特]四分防疫六分打中央 | 2021/06/12 | 09:19:33 | yien | HatePolitics | 20 | 9 | 1 | 現階段武漢肺炎還是主要敵人,不希望七分防疫、三分打柯,沒有必要… | 4 | 0.422222 |
1930 | https://www.ptt.cc/bbs/HatePolitics/M.16234976… | [討論]防疫鬧劇 | 2021/06/12 | 11:33:48 | nawussica | HatePolitics | 4 | 3 | 0 | 2021年初以來的3+11… | 4 | 0.607843 |
1931 | https://www.ptt.cc/bbs/HatePolitics/M.16235032… | [討論]民進黨的防疫會不會太忙? | 2021/06/12 | 13:07:20 | SoFanCy | HatePolitics | 7 | 5 | 0 | 要打假訊息… | 4 | 0.538462 |
1932 | https://www.ptt.cc/bbs/HatePolitics/M.16235040… | [討論]震驚!防疫壓力太大,新北護理師上吊亡 | 2021/06/12 | 13:20:31 | MrTexas | HatePolitics | 168 | 72 | 16 | https://reurl.cc/AkO9lK\r\n記者陳雕文/新北報導\r\n\r\n新… | 3 | 0.709677 |
1933 | https://www.ptt.cc/bbs/HatePolitics/M.16235106… | [討論]北高的防疫人員命運大不同 | 2021/06/12 | 15:10:11 | tenfu | HatePolitics | 25 | 9 | 0 | 北高的防疫人員 為啥命運大不同. | 1 | 0.405405 |
1934 rows × 12 columns
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-macbert-base')
model = BertModel.from_pretrained('hfl/chinese-macbert-base')
device = torch.device('cuda:2')
model = model.to(device)
model.eval()  # inference only: disable dropout
def get_output(text):
    # tokenize one document, truncated to BERT's 512-token limit
    token = tokenizer(text,
                      truncation=True, padding=True,
                      max_length=512, return_tensors='pt')
    t = token['input_ids'].to(device)
    m = token['attention_mask'].to(device)
    i = token['token_type_ids'].to(device)
    # use the [CLS] pooler output as a fixed-size document embedding
    with torch.no_grad():
        outs = model(t, m, i).pooler_output
    return outs.detach().cpu().numpy()
from tqdm import tqdm
outputs = []
for i in tqdm(range(df.shape[0])):
    outputs.append(get_output(df.loc[i, 'sentence']).squeeze())
100%|██████████| 1934/1934 [00:47<00:00, 40.68it/s]
data_x = np.stack(outputs, axis=0)
data_x.shape
(1934, 768)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(data_x)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
df['topic'] = kmeans.labels_  # replace the LDA topic column with the k-means cluster ids (0-3)
df['topic'].hist()
# cluster 0: pandemic-prevention policy
df[df['topic'] == 0].sample(5)
artUrl | artTitle | artDate | artTime | artPoster | artCat | commentNum | push | boo | sentence | topic | gamma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
435 | https://www.ptt.cc/bbs/Gossiping/M.1621475202…. | Re:[爆卦]首場全國防疫會議召開(反擊假訊息) | 2021/05/20 | 01:46:40 | eddisontw | Gossiping | 197 | 110 | 20 | 這個團隊真的完蛋了… | 0 | 0.297872 |
1348 | https://www.ptt.cc/bbs/Gossiping/M.1623037566…. | [問卦]美V.S.台三大防疫政策的不同?? | 2021/06/07 | 03:46:03 | pttmovielove | Gossiping | 8 | 2 | 1 | | 0 | 0.295918 |
1332 | https://www.ptt.cc/bbs/Gossiping/M.1622995721…. | [問卦]民視台灣演義會怎麼介紹今年防疫 | 2021/06/06 | 16:08:38 | poeta | Gossiping | 14 | 4 | 1 | 台灣演義是民視週末的節目… | 0 | 0.459459 |
440 | https://www.ptt.cc/bbs/Gossiping/M.1621476351…. | Re:[爆卦]首場全國防疫會議召開(反擊假訊息) | 2021/05/20 | 02:05:48 | eeccoo | Gossiping | 18 | 9 | 0 | 我剛剛看公視直播在下面留言,沒其他重要的疫情訊息有… | 0 | 0.515625 |
506 | https://www.ptt.cc/bbs/Gossiping/M.1621531016…. | [問卦]是不是該推廣居家內防疫的重要性? | 2021/05/20 | 17:16:54 | xxx88550 | Gossiping | 0 | 0 | 0 | 目前大家都著重在 不群聚、戴口罩這些防疫基本款。,面對親人、朋友們… | 0 | 0.307692 |
# cluster 1: pandemic-prevention news
df[df['topic'] == 1].sample(5)
artUrl | artTitle | artDate | artTime | artPoster | artCat | commentNum | push | boo | sentence | topic | gamma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
150 | https://www.ptt.cc/bbs/Gossiping/M.1621127161…. | [新聞]李愛綺防疫健身房停業50萬飛了 | 2021/05/16 | 01:05:58 | qwqwaas | Gossiping | 23 | 12 | 5 | 李愛綺防疫健身房停業50萬飛了://tinyurl.com/ax79kke… | 1 | 0.418079 |
1579 | https://www.ptt.cc/bbs/HatePolitics/M.16213966… | [新聞]柯文哲防疫遭批蔡峻維怒嗆基進黨:嘴臭 | 2021/05/19 | 03:56:36 | goetze | HatePolitics | 24 | 10 | 2 | 1.新聞網址︰://www.chinatimes.com/realtime… | 1 | 0.648438 |
531 | https://www.ptt.cc/bbs/Gossiping/M.1621576988…. | [新聞]防疫旅館「加強版」急加開!柯文哲要觀 | 2021/05/21 | 06:03:05 | wenge321 | Gossiping | 32 | 24 | 1 | 防疫旅館「加強版」急加開!柯文哲要觀傳局:一直開房間://www.setn… | 1 | 0.790960 |
1067 | https://www.ptt.cc/bbs/Gossiping/M.1622362085…. | [爆卦]台北開完防疫記者會了,人民優先 | 2021/05/30 | 08:08:03 | wind200625 | Gossiping | 1122 | 806 | 41 | 誠實結論1.昨日萬華快篩489人,陽性率下降3.9,但用大數據來看,染症足跡… | 1 | 0.640845 |
355 | https://www.ptt.cc/bbs/Gossiping/M.1621379418…. | [新聞]快訊/國道火燒車!「75%防疫酒精」物流 | 2021/05/18 | 23:10:16 | VladeDivac | Gossiping | 162 | 100 | 10 | 備註請放最後面 違者新聞文章刪除1.媒體來源:… | 1 | 0.753304 |
# cluster 2: prevention news and politicians
df[df['topic'] == 2].sample(5)
artUrl | artTitle | artDate | artTime | artPoster | artCat | commentNum | push | boo | sentence | topic | gamma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1555 | https://www.ptt.cc/bbs/HatePolitics/M.16212575… | [黑特]一個牙醫懂個屁防疫公共衛生 | 2021/05/17 | 13:18:35 | moumoon5566 | HatePolitics | 7 | 2 | 3 | 自己定的防疫sop沒遵守… | 2 | 0.441176 |
116 | https://www.ptt.cc/bbs/Gossiping/M.1621087972…. | Re:[問卦]防疫你只相信誰? | 2021/05/15 | 14:12:48 | coolhon | Gossiping | 49 | 34 | 2 | 藉本篇來說一下我最近的想法,恭喜各位,民進黨賭輸了… | 2 | 0.622010 |
30 | https://www.ptt.cc/bbs/Gossiping/M.1621051950…. | [問卦]各位該裝防疫APP了吧(發錢) | 2021/05/15 | 04:12:26 | LeafLu | Gossiping | 341 | 284 | 4 | 武肺in台灣相關整理:://reurl.cc/7XeebD… | 2 | 0.440678 |
615 | https://www.ptt.cc/bbs/Gossiping/M.1621663861…. | [新聞]行政院周末開防疫會議陳時中:確診率明 | 2021/05/22 | 06:10:58 | sukiyasuica | Gossiping | 17 | 10 | 3 | 1.媒體來源:聯合2.記者署名:陳熙文3.完… | 2 | 0.418239 |
1601 | https://www.ptt.cc/bbs/HatePolitics/M.16214840… | [新聞]新防疫指揮中心」成立?他指成員有這5人 | 2021/05/20 | 04:13:40 | xamous | HatePolitics | 58 | 24 | 8 | 1.新聞網址︰://money.udn.com/money/story/5… | 2 | 0.546012 |
# cluster 3: prevention methods
df[df['topic'] == 3].sample(5)
artUrl | artTitle | artDate | artTime | artPoster | artCat | commentNum | push | boo | sentence | topic | gamma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1628 | https://www.ptt.cc/bbs/HatePolitics/M.16215723… | [討論]dpp靠反中防疫 | 2021/05/21 | 04:45:23 | jt13 | HatePolitics | 13 | 2 | 4 | 守住第一波 因為他叫中國武漢肺炎… | 3 | 0.594595 |
445 | https://www.ptt.cc/bbs/Gossiping/M.1621478742…. | [問卦]新北防疫做的超棒的八卦 | 2021/05/20 | 02:45:40 | twelvethflor | Gossiping | 18 | 5 | 2 | 我朋友的工地啦!師父都不帶口罩啦!,因為不想起衝突。.. | 3 | 0.400000 |
1126 | https://www.ptt.cc/bbs/Gossiping/M.1622601115…. | [問卦]過去一年全世界怎麼看台灣防疫? | 2021/06/02 | 02:31:53 | osalucard | Gossiping | 13 | 7 | 0 | 邊境防守... | 3 | 0.627451 |
415 | https://www.ptt.cc/bbs/Gossiping/M.1621429426…. | [問卦]怎麼讓貓狗防疫? | 2021/05/19 | 13:03:44 | lobfo | Gossiping | 12 | 5 | 0 | 有人養貓狗用放養的… | 3 | 0.520000 |
657 | https://www.ptt.cc/bbs/Gossiping/M.1621711480…. | Re:[問卦]防疫旅館是只給回國的住嗎? | 2021/05/22 | 19:24:38 | wind200625 | Gossiping | 0 | 0 | 0 | 這篇簡單說明一下、… | 3 | 0.830882 |
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-macbert-base')
model = BertForSequenceClassification.from_pretrained('hfl/chinese-macbert-base')
for p in model.parameters():  # freeze bert
    p.requires_grad = False
model.classifier = nn.Linear(768, 4, bias=True)
model.bert.encoder.layer[-1].output.dense.weight.requires_grad
False
model.classifier.weight.requires_grad
True
def tokenize_text(text):
    return tokenizer.encode(text, return_tensors='pt')
all_tokens = [tokenize_text(df.loc[i, 'sentence']) for i in range(df.shape[0])]
shapes = [t.shape[1] for t in all_tokens]
st = torch.tensor(shapes).float()
st.min(), st.max()
(tensor(28.), tensor(7344.))
import matplotlib.pyplot as plt
plt.hist(st.numpy())
plt.xlim(0, 3000)
import os
import csv
import torch
from torch.utils.data import Dataset, DataLoader, random_split
class PreventionDataset(Dataset):
    def __init__(self, mode='train', seed=1340):
        self.mode = mode
        texts = df['sentence'].values.tolist()
        # tokenize the whole corpus at once, padded/truncated to 512 tokens
        tokens = tokenizer(texts,
                           truncation=True, padding=True,
                           max_length=512,
                           return_tensors='pt')
        self.tokens = tokens['input_ids']
        self.masks = tokens['attention_mask']
        self.type_ids = tokens['token_type_ids']
        # shift labels to 0-indexed classes (assumes the 1-indexed LDA
        # topics from metadata_topic.csv, not the k-means ids above)
        self.labels = torch.tensor(df['topic'].values - 1).long()

    def __getitem__(self, idx):
        token = self.tokens[idx]
        mask = self.masks[idx]
        type_id = self.type_ids[idx]
        label = self.labels[idx]
        return (token, mask, type_id), label

    def __len__(self):
        return self.labels.shape[0]
dataset = PreventionDataset()
torch.manual_seed(1340)
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
dataset_train, dataset_test = random_split(dataset, [train_size, test_size])
batch_size = 64
train_data = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
test_data = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
from transformers import AdamW
# device = torch.device('cpu')
device = torch.device('cuda:3')
optimizer = AdamW(model.parameters(), lr=3e-3)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)
def accuracy(raw_preds, y):
    preds = raw_preds.argmax(dim=1)
    acc = (preds == y).sum()  # count of correct predictions; divided by the total later
    return acc
from tqdm import tqdm
train_loss_list = []
test_loss_list = []
train_acc_list = []  # per-epoch accuracies for the curves plotted below
test_acc_list = []
def train(model, data, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    total = 0
    for (t, m, i), label in tqdm(data, total=len(data)):
        t = t.to(device)
        m = m.to(device)
        i = i.to(device)
        label = label.to(device)
        optimizer.zero_grad()
        output = model(t, m, i)
        pred = output.logits
        loss = criterion(pred, label)
        acc = accuracy(pred, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        train_loss_list.append(loss.item())
        epoch_acc += acc.item()
        total += len(t)
    return epoch_loss / total, epoch_acc / total
def test(model, data, criterion, log_loss=False):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    total = 0
    for (t, m, i), label in tqdm(data, total=len(data)):
        t = t.to(device)
        m = m.to(device)
        i = i.to(device)
        label = label.to(device)
        output = model(t, m, i)
        pred = output.logits
        loss = criterion(pred, label)
        acc = accuracy(pred, label)
        if log_loss:
            test_loss_list.append(loss.item())
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        total += len(t)
    return epoch_loss / total, epoch_acc / total
max_epoch = 20
log_interval = 1
best_acc = 0
for epoch in range(1, max_epoch + 1):
    train_loss, train_acc = train(model, train_data, optimizer, criterion)
    test_loss, test_acc = test(model, test_data, criterion, log_loss=True)
    train_acc_list.append(train_acc)  # feed the accuracy curves below
    test_acc_list.append(test_acc)
    if epoch % log_interval == 0:
        print('Epoch {} train_loss: {} train_acc: {}'.format(
            epoch, train_loss, train_acc
        ))
        print('Epoch {} test_loss: {} test_acc : {}'.format(
            epoch, test_loss, test_acc
        ))
    torch.save(model.state_dict(), 'ckpts/e{}.pt'.format(epoch))
    # if val_acc > best_acc:
    #     best_model = model
    #     best_acc = val_acc
    #     print('-'*10, 'e', epoch, 'save best model', '-'*10)
100%|██████████| 22/22 [00:17<00:00, 1.24it/s]
100%|██████████| 10/10 [00:07<00:00, 1.39it/s]
Epoch 1 train_loss: 0.021512867694242918 train_acc: 0.4523281596452328
Epoch 1 test_loss: 0.019873504486839045 test_acc : 0.5507745266781411
100%|██████████| 22/22 [00:17<00:00, 1.22it/s]
100%|██████████| 10/10 [00:07<00:00, 1.38it/s]
Epoch 2 train_loss: 0.019146774446003892 train_acc: 0.5188470066518847
Epoch 2 test_loss: 0.01883598531174783 test_acc : 0.5869191049913941
100%|██████████| 22/22 [00:18<00:00, 1.22it/s]
100%|██████████| 10/10 [00:07<00:00, 1.38it/s]
Epoch 3 train_loss: 0.018122735150373166 train_acc: 0.5461936437546193
Epoch 3 test_loss: 0.017967594880450407 test_acc : 0.6110154905335629
100%|██████████| 22/22 [00:18<00:00, 1.21it/s]
100%|██████████| 10/10 [00:07<00:00, 1.37it/s]
…
import matplotlib.pyplot as plt
import numpy as np
plt.figure(figsize=(16, 6))
x1 = np.linspace(1, max_epoch, len(train_loss_list))
plt.plot(x1, train_loss_list)
x2 = np.linspace(1, max_epoch, len(test_loss_list))
plt.plot(x2, test_loss_list, color='r')
plt.legend(['train_loss', 'test_loss'])
plt.show()
plt.figure(figsize=(16, 6))
x1 = np.linspace(1, max_epoch, len(train_acc_list))
plt.plot(x1, train_acc_list)
x2 = np.linspace(1, max_epoch, len(test_acc_list))
plt.plot(x2, test_acc_list, color='r')
plt.legend(['train_acc', 'test_acc'])
plt.show()
data_y = df['topic'].values
from sklearn.model_selection import train_test_split
np.random.seed(1340)
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, train_size=0.7)
print('train x', train_x.shape)
print('test x', test_x.shape)
print('train y', train_y.shape)
print('test y', test_y.shape)
train x (1353, 768)
test x (581, 768)
train y (1353,)
test y (581,)
def train_test_acc(model):
    pred = model.predict(train_x)
    print('train acc')
    print((pred == train_y).mean())
    pred = model.predict(test_x)
    print('test acc')
    print((pred == test_y).mean())
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()
lg.fit(train_x, train_y)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)
train_test_acc(lg)
train acc
1.0
test acc
0.9672977624784854
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train_x, train_y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False)
train_test_acc(rf)
train acc
1.0
test acc
0.9432013769363167
Social Network
Take the cmtPoster, artPoster, and artUrl columns to build the links.
Count the number of posts and replies for each account.
Label the subject types.
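A minimal sketch of these steps, assuming a comment-level file (hypothetical name ptt_prevention_commentMetaData.csv) with cmtPoster, artUrl, and cmtStatus columns:
# hypothetical comment-level data; file and column names are assumptions
comments <- fread("ptt_prevention_commentMetaData.csv", encoding = "UTF-8")
links <- comments %>%
  left_join(metadata %>% select(artUrl, artPoster), by = "artUrl") %>%
  select(cmtPoster, artPoster, artUrl)
post_count  <- metadata %>% count(artPoster, name = "posts")   # posts per account
reply_count <- comments %>% count(cmtPoster, name = "replies") # replies per account
subjects <- full_join(post_count, reply_count, by = c("artPoster" = "cmtPoster"))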
5/15 post-reply network
Filter the articles posted on 5/15.
Filter the subjects in link_0515.
Network graph
0515 article push/boo network graph
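A sketch of how such a daily reply network can be drawn with igraph, reusing the hypothetical links table from the sketch above:
# sketch: poster-replier network for a single day
plot_reply_network <- function(day) {
  day_links <- links %>%
    filter(artUrl %in% metadata[artDate == as.Date(day)]$artUrl) %>%
    count(cmtPoster, artPoster)
  g <- graph_from_data_frame(day_links, directed = TRUE)
  plot(g, vertex.size = 3, vertex.label.cex = 0.6,
       edge.arrow.size = 0.2, edge.width = log1p(E(g)$n))
}
plot_reply_network("2021-05-15")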
6/3 post-reply network
Filter the articles posted on 6/3.
Filter the subjects in link_0603.
Network graph
0603 article push/boo network graph
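The 6/3 network reuses the same sketch with a different date:
plot_reply_network("2021-06-03")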
Gossiping board push/boo network graph
Most repliers to clownT's and wind200625's posts reacted with upvotes (推).
HatePolitics board push/boo network graph
On HatePolitics, most of the major subjects likewise reacted to the board's articles with upvotes.
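One way to color edges by reply attitude, again assuming the hypothetical cmtStatus column with values such as 推 and 噓:
# sketch: signed (push/boo) network; cmtStatus values are assumptions
push_links <- comments %>%
  left_join(metadata %>% select(artUrl, artPoster), by = "artUrl") %>%
  filter(cmtStatus %in% c("推", "噓")) %>%
  count(cmtPoster, artPoster, cmtStatus)
g <- graph_from_data_frame(push_links, directed = TRUE)
E(g)$color <- ifelse(E(g)$cmtStatus == "推", "steelblue", "firebrick")
plot(g, vertex.size = 3, vertex.label.cex = 0.6, edge.arrow.size = 0.2)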
Networks for different topics
Gossiping
Median comment count of Gossiping articles = 375
Centrality
HatePolitics
Median comment count of HatePolitics articles = 86
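A sketch of restricting each board's network to its heavily discussed articles, using the medians quoted above as thresholds:
# sketch: per-board filters on comment volume
hot_gossip <- metadata %>%
  filter(artCat == "Gossiping", commentNum >= 375) %>% pull(artUrl)
hot_hate <- metadata %>%
  filter(artCat == "HatePolitics", commentNum >= 86) %>% pull(artUrl)
gossip_links <- links %>% filter(artUrl %in% hot_gossip)
hate_links   <- links %>% filter(artUrl %in% hot_hate)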
Eigen Centrality
0515 Gossiping
0515 HatePolitics
0603 Gossiping
0603 HatePolitics
Centrality
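Centrality scores for these graphs can be computed directly in igraph; a sketch on the graph g built above:
# sketch: who receives the most replies, and who is most central
deg <- degree(g, mode = "in")
eig <- eigen_centrality(g, directed = FALSE)$vector
head(sort(eig, decreasing = TRUE), 10)  # top-10 accounts by eigen centrality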