P <- read_file("positive.txt") # 正向字典txt檔
N <- read_file("negative.txt") # 負向字典txt檔
#字典txt檔讀進來是一整個字串
typeof(P)
## [1] "character"
typeof(N)
## [1] "character"
# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
# 把兩個字典拼在一起
LIWC = rbind(P, N)
# 檢視字典
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
ytr <- read_csv("data_youtuber_more_articleMetaData.csv") %>%
mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>%
mutate(sentence=gsub("\n", "", sentence)) %>%
mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))
##
## -- Column specification --------------------------------------------------------
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
ytr
## # A tibble: 8,566 x 10
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 2 [爆卦]館長:王~ 2019-12-31 18:27:38 https:~ v840122v Gossi~ 260 56 82
## 3 Re:[新聞]3~ 2019-12-31 19:44:33 https:~ soulboy3~ Gossi~ 19 9 3
## 4 Re:[爆卦]館~ 2019-12-31 21:01:04 https:~ fckj1131~ Gossi~ 6 3 0
## 5 [問卦]愛莉莎莎~ 2020-01-01 00:03:32 https:~ f17690815 Gossi~ 24 7 10
## 6 [問卦]館長跨年~ 2020-01-01 01:18:44 https:~ kimo6414 Gossi~ 8 5 0
## 7 [問卦]館長是不~ 2020-01-01 01:52:24 https:~ ss8901234 Gossi~ 10 4 2
## 8 [問卦]誰撿到愛~ 2020-01-01 02:04:54 https:~ werqq Gossi~ 21 10 7
## 9 [問卦]愛莉莎莎~ 2020-01-01 04:04:27 https:~ james7923 Gossi~ 33 16 10
## 10 [問卦]愛莉莎莎~ 2020-01-01 06:01:05 https:~ ddd123 Gossi~ 19 5 4
## # ... with 8,556 more rows, and 1 more variable: sentence <chr>
#jieba斷詞初始化
jieba_tokenizer = worker()
#建立斷詞器
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# 去掉字串長度爲1的詞彙
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
ytr_lexicon <- scan(file = "list_of_ytr.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8',quiet = T)
ytr_lexicon
## [1] "愛莉莎莎" "蔡阿嘎" "館長" "老高" "蔡桃貴"
## [6] "亞洲統神" "den" "小許" "老王" "洋蔥"
## [11] "蒼藍鴿" "阿神" "反骨" "志祺七七" "黃氏兄弟"
## [16] "唐綺陽" "呱吉" "九面" "千千" "雨揚樂活家族"
#把前20熱門話題的youtuber名子加入斷詞
new_user_word(jieba_tokenizer, c(ytr_lexicon))
## [1] TRUE
ytr <- ytr%>%
mutate(emo=(push-boo))
ytr
## # A tibble: 8,566 x 11
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 2 [爆卦]館長:王~ 2019-12-31 18:27:38 https:~ v840122v Gossi~ 260 56 82
## 3 Re:[新聞]3~ 2019-12-31 19:44:33 https:~ soulboy3~ Gossi~ 19 9 3
## 4 Re:[爆卦]館~ 2019-12-31 21:01:04 https:~ fckj1131~ Gossi~ 6 3 0
## 5 [問卦]愛莉莎莎~ 2020-01-01 00:03:32 https:~ f17690815 Gossi~ 24 7 10
## 6 [問卦]館長跨年~ 2020-01-01 01:18:44 https:~ kimo6414 Gossi~ 8 5 0
## 7 [問卦]館長是不~ 2020-01-01 01:52:24 https:~ ss8901234 Gossi~ 10 4 2
## 8 [問卦]誰撿到愛~ 2020-01-01 02:04:54 https:~ werqq Gossi~ 21 10 7
## 9 [問卦]愛莉莎莎~ 2020-01-01 04:04:27 https:~ james7923 Gossi~ 33 16 10
## 10 [問卦]愛莉莎莎~ 2020-01-01 06:01:05 https:~ ddd123 Gossi~ 19 5 4
## # ... with 8,556 more rows, and 2 more variables: sentence <chr>, emo <dbl>
cate <- ytr%>%
unnest_tokens(cate,artTitle,token = chi_tokenizer)%>%
select(artUrl,cate)%>%
filter(cate %in% ytr_lexicon)
ytr <- ytr%>%
inner_join(cate)
## Joining, by = "artUrl"
ytr
## # A tibble: 8,549 x 12
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 2 [爆卦]館長:王~ 2019-12-31 18:27:38 https:~ v840122v Gossi~ 260 56 82
## 3 Re:[新聞]3~ 2019-12-31 19:44:33 https:~ soulboy3~ Gossi~ 19 9 3
## 4 Re:[爆卦]館~ 2019-12-31 21:01:04 https:~ fckj1131~ Gossi~ 6 3 0
## 5 [問卦]愛莉莎莎~ 2020-01-01 00:03:32 https:~ f17690815 Gossi~ 24 7 10
## 6 [問卦]館長跨年~ 2020-01-01 01:18:44 https:~ kimo6414 Gossi~ 8 5 0
## 7 [問卦]館長是不~ 2020-01-01 01:52:24 https:~ ss8901234 Gossi~ 10 4 2
## 8 [問卦]誰撿到愛~ 2020-01-01 02:04:54 https:~ werqq Gossi~ 21 10 7
## 9 [問卦]愛莉莎莎~ 2020-01-01 04:04:27 https:~ james7923 Gossi~ 33 16 10
## 10 [問卦]愛莉莎莎~ 2020-01-01 06:01:05 https:~ ddd123 Gossi~ 19 5 4
## # ... with 8,539 more rows, and 3 more variables: sentence <chr>, emo <dbl>,
## # cate <chr>
ytr_after_jieba <- ytr %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9]")) )
ytr_after_jieba
## # A tibble: 592,208 x 12
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 2 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 3 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 4 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 5 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 6 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 7 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 8 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 9 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## 10 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~ 146 82 16
## # ... with 592,198 more rows, and 3 more variables: emo <dbl>, cate <chr>,
## # word <chr>
sentiment <- ytr_after_jieba %>%
select(artDate,word,cate) %>%
inner_join(LIWC) %>%
mutate(senti_count = ifelse(sentiment =="positive",1,-1))%>%
select(artDate,senti_count,cate) %>%
group_by(artDate,cate)%>%
summarise(sum = sum(senti_count))
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
sentiment
## # A tibble: 1,785 x 3
## # Groups: artDate [481]
## artDate cate sum
## <date> <chr> <dbl>
## 1 2019-12-31 館長 3
## 2 2020-01-01 老高 -1
## 3 2020-01-01 愛莉莎莎 3
## 4 2020-01-01 館長 -8
## 5 2020-01-02 老高 1
## 6 2020-01-02 阿神 0
## 7 2020-01-02 唐綺陽 -21
## 8 2020-01-02 館長 17
## 9 2020-01-03 反骨 -4
## 10 2020-01-03 唐綺陽 -6
## # ... with 1,775 more rows
### 全部人的趨勢變化:一張年和分季
all_season <- sentiment%>%
filter(cate %in% "愛莉莎莎"|cate %in%"館長"|cate %in%"蒼藍鴿"|cate %in%"蔡阿嘎"|cate %in%"黃氏兄弟")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))
all_season
可以看出,黃氏兄弟在2020年第二季、蒼藍鴿在2021年第一季、愛莉莎莎在2020年第二季和2021年第一季、蔡阿嘎在2020年第三季、館長在2020年第三季有比較顯著的變化,因此下面把這些季數特別拿出來看
### 個別
#### 黃氏兄弟:
one_season <- sentiment%>%
filter(cate %in% "黃氏兄弟")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))
#### 蒼藍鴿:
two_season <- sentiment%>%
filter(cate %in% "蒼藍鴿")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))
#### 愛莉莎莎
three_season_1 <- sentiment%>%
filter(cate %in% "愛莉莎莎")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))
three_season_2 <- sentiment%>%
filter(cate %in% "愛莉莎莎")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))
#### 蔡阿嘎
four_season <- sentiment%>%
filter(cate %in% "蔡阿嘎")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))
#### 館長
five_season <- sentiment%>%
filter(cate %in% "館長")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))
one_season
## Warning: Removed 5 row(s) containing missing values (geom_path).
two_season
## Warning: Removed 1 row(s) containing missing values (geom_path).
three_season_1
## Warning: Removed 335 row(s) containing missing values (geom_path).
three_season_2
## Warning: Removed 351 row(s) containing missing values (geom_path).
four_season
## Warning: Removed 136 row(s) containing missing values (geom_path).
five_season
## Warning: Removed 343 row(s) containing missing values (geom_path).
ytr_emo <- ytr%>%
select(artDate,cate,emo)%>%
group_by(artDate,cate)%>%
summarise(sum = sum(emo))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
ytr_emo
## # A tibble: 1,918 x 3
## # Groups: artDate [481]
## artDate cate sum
## <date> <chr> <dbl>
## 1 2019-12-31 館長 49
## 2 2020-01-01 老高 1
## 3 2020-01-01 愛莉莎莎 1
## 4 2020-01-01 館長 714
## 5 2020-01-02 老高 6
## 6 2020-01-02 阿神 75
## 7 2020-01-02 唐綺陽 -335
## 8 2020-01-02 館長 41
## 9 2020-01-03 反骨 5
## 10 2020-01-03 唐綺陽 -121
## # ... with 1,908 more rows
### 整體
all_year_emo <- ytr_emo%>%
filter(cate %in% "愛莉莎莎"|cate %in%"館長"|cate %in%"蒼藍鴿"|cate %in%"蔡阿嘎"|cate %in%"黃氏兄弟")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))
### 個別
#### 黃氏兄弟:
one_emo_season <- ytr_emo%>%
filter(cate %in% "黃氏兄弟")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))
#### 蒼藍鴿:
two_emo_season <- ytr_emo%>%
filter(cate %in% "蒼藍鴿")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))
#### 愛莉莎莎
three_emo_season_1 <- ytr_emo%>%
filter(cate %in% "愛莉莎莎")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))
three_emo_season_2 <- ytr_emo%>%
filter(cate %in% "愛莉莎莎")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))
#### 蔡阿嘎
four_emo_season <- ytr_emo%>%
filter(cate %in% "蔡阿嘎")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))
#### 館長
five_emo_season <- ytr_emo%>%
filter(cate %in% "館長")%>%
ggplot()+
geom_line(aes(x = artDate,y = sum,colour = cate))+
scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))
one_emo_season
## Warning: Removed 6 row(s) containing missing values (geom_path).
two_emo_season
## Warning: Removed 1 row(s) containing missing values (geom_path).
three_emo_season_1
## Warning: Removed 346 row(s) containing missing values (geom_path).
three_emo_season_2
## Warning: Removed 360 row(s) containing missing values (geom_path).
four_emo_season
## Warning: Removed 148 row(s) containing missing values (geom_path).
five_emo_season
## Warning: Removed 353 row(s) containing missing values (geom_path).