youtber_senti.utf8

環境設定：

設定情緒字典：

P <- read_file("positive.txt") # 正向字典txt檔
N <- read_file("negative.txt") # 負向字典txt檔

#字典txt檔讀進來是一整個字串
typeof(P)

## [1] "character"

typeof(N)

## [1] "character"

# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments，word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# 把兩個字典拼在一起
LIWC = rbind(P, N)

# 檢視字典
head(LIWC)

##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

讀資料：

ytr <- read_csv("data_youtuber_more_articleMetaData.csv") %>% 
  mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>% 
  mutate(sentence=gsub("\n", "", sentence)) %>% 
  mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence))

## 
## -- Column specification --------------------------------------------------------
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )

ytr

## # A tibble: 8,566 x 10
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  2 [爆卦]館長：王~ 2019-12-31 18:27:38 https:~ v840122v  Gossi~        260    56    82
##  3 Re:[新聞]3~ 2019-12-31 19:44:33 https:~ soulboy3~ Gossi~         19     9     3
##  4 Re:[爆卦]館~ 2019-12-31 21:01:04 https:~ fckj1131~ Gossi~          6     3     0
##  5 [問卦]愛莉莎莎~ 2020-01-01 00:03:32 https:~ f17690815 Gossi~         24     7    10
##  6 [問卦]館長跨年~ 2020-01-01 01:18:44 https:~ kimo6414  Gossi~          8     5     0
##  7 [問卦]館長是不~ 2020-01-01 01:52:24 https:~ ss8901234 Gossi~         10     4     2
##  8 [問卦]誰撿到愛~ 2020-01-01 02:04:54 https:~ werqq     Gossi~         21    10     7
##  9 [問卦]愛莉莎莎~ 2020-01-01 04:04:27 https:~ james7923 Gossi~         33    16    10
## 10 [問卦]愛莉莎莎~ 2020-01-01 06:01:05 https:~ ddd123    Gossi~         19     5     4
## # ... with 8,556 more rows, and 1 more variable: sentence <chr>

斷詞處理：

#jieba斷詞初始化
jieba_tokenizer = worker()
#建立斷詞器
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
ytr_lexicon <- scan(file = "list_of_ytr.txt", what=character(),sep='\n', 
                    encoding='utf-8',fileEncoding='utf-8',quiet = T)
ytr_lexicon

##  [1] "愛莉莎莎"     "蔡阿嘎"       "館長"         "老高"         "蔡桃貴"      
##  [6] "亞洲統神"     "den"          "小許"         "老王"         "洋蔥"        
## [11] "蒼藍鴿"       "阿神"         "反骨"         "志祺七七"     "黃氏兄弟"    
## [16] "唐綺陽"       "呱吉"         "九面"         "千千"         "雨揚樂活家族"

#把前20熱門話題的youtuber名子加入斷詞
new_user_word(jieba_tokenizer, c(ytr_lexicon))

## [1] TRUE

推、噓整理

ytr <- ytr%>%
  mutate(emo=(push-boo))

ytr

## # A tibble: 8,566 x 11
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  2 [爆卦]館長：王~ 2019-12-31 18:27:38 https:~ v840122v  Gossi~        260    56    82
##  3 Re:[新聞]3~ 2019-12-31 19:44:33 https:~ soulboy3~ Gossi~         19     9     3
##  4 Re:[爆卦]館~ 2019-12-31 21:01:04 https:~ fckj1131~ Gossi~          6     3     0
##  5 [問卦]愛莉莎莎~ 2020-01-01 00:03:32 https:~ f17690815 Gossi~         24     7    10
##  6 [問卦]館長跨年~ 2020-01-01 01:18:44 https:~ kimo6414  Gossi~          8     5     0
##  7 [問卦]館長是不~ 2020-01-01 01:52:24 https:~ ss8901234 Gossi~         10     4     2
##  8 [問卦]誰撿到愛~ 2020-01-01 02:04:54 https:~ werqq     Gossi~         21    10     7
##  9 [問卦]愛莉莎莎~ 2020-01-01 04:04:27 https:~ james7923 Gossi~         33    16    10
## 10 [問卦]愛莉莎莎~ 2020-01-01 06:01:05 https:~ ddd123    Gossi~         19     5     4
## # ... with 8,556 more rows, and 2 more variables: sentence <chr>, emo <dbl>

進行文章的分類：

cate <- ytr%>%
  unnest_tokens(cate,artTitle,token = chi_tokenizer)%>%
  select(artUrl,cate)%>% 
  filter(cate %in% ytr_lexicon)


ytr <- ytr%>%
  inner_join(cate)

## Joining, by = "artUrl"

ytr

## # A tibble: 8,549 x 12
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  2 [爆卦]館長：王~ 2019-12-31 18:27:38 https:~ v840122v  Gossi~        260    56    82
##  3 Re:[新聞]3~ 2019-12-31 19:44:33 https:~ soulboy3~ Gossi~         19     9     3
##  4 Re:[爆卦]館~ 2019-12-31 21:01:04 https:~ fckj1131~ Gossi~          6     3     0
##  5 [問卦]愛莉莎莎~ 2020-01-01 00:03:32 https:~ f17690815 Gossi~         24     7    10
##  6 [問卦]館長跨年~ 2020-01-01 01:18:44 https:~ kimo6414  Gossi~          8     5     0
##  7 [問卦]館長是不~ 2020-01-01 01:52:24 https:~ ss8901234 Gossi~         10     4     2
##  8 [問卦]誰撿到愛~ 2020-01-01 02:04:54 https:~ werqq     Gossi~         21    10     7
##  9 [問卦]愛莉莎莎~ 2020-01-01 04:04:27 https:~ james7923 Gossi~         33    16    10
## 10 [問卦]愛莉莎莎~ 2020-01-01 06:01:05 https:~ ddd123    Gossi~         19     5     4
## # ... with 8,539 more rows, and 3 more variables: sentence <chr>, emo <dbl>,
## #   cate <chr>

斷詞

ytr_after_jieba <- ytr %>%
  unnest_tokens(word, sentence, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9]")) )


ytr_after_jieba

## # A tibble: 592,208 x 12
##    artTitle  artDate    artTime  artUrl  artPoster artCat commentNum  push   boo
##    <chr>     <date>     <time>   <chr>   <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  2 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  3 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  4 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  5 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  6 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  7 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  8 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
##  9 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
## 10 [新聞]3萬6手~ 2019-12-31 16:32:46 https:~ LoveMake~ Gossi~        146    82    16
## # ... with 592,198 more rows, and 3 more variables: emo <dbl>, cate <chr>,
## #   word <chr>

加入情緒字典並計算每天情緒

sentiment <- ytr_after_jieba %>%
  select(artDate,word,cate) %>%
  inner_join(LIWC) %>%
  mutate(senti_count = ifelse(sentiment =="positive",1,-1))%>%
  select(artDate,senti_count,cate) %>%
  group_by(artDate,cate)%>%
  summarise(sum = sum(senti_count))

## Joining, by = "word"

## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

sentiment

## # A tibble: 1,785 x 3
## # Groups:   artDate [481]
##    artDate    cate       sum
##    <date>     <chr>    <dbl>
##  1 2019-12-31 館長         3
##  2 2020-01-01 老高        -1
##  3 2020-01-01 愛莉莎莎     3
##  4 2020-01-01 館長        -8
##  5 2020-01-02 老高         1
##  6 2020-01-02 阿神         0
##  7 2020-01-02 唐綺陽     -21
##  8 2020-01-02 館長        17
##  9 2020-01-03 反骨        -4
## 10 2020-01-03 唐綺陽      -6
## # ... with 1,775 more rows

畫圖：

### 全部人的趨勢變化：一張年和分季
all_season <- sentiment%>%
  filter(cate %in% "愛莉莎莎"|cate %in%"館長"|cate %in%"蒼藍鴿"|cate %in%"蔡阿嘎"|cate %in%"黃氏兄弟")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))


all_season

可以看出，黃氏兄弟在2020年第二季、蒼藍鴿在2021年第一季、愛莉莎莎在2020年第二季和2021年第一季、蔡阿嘎在2020年第三季、館長在2020年第三季有比較顯著的變化，因此下面把這些季數特別拿出來看

### 個別
#### 黃氏兄弟：
one_season <- sentiment%>%
  filter(cate %in% "黃氏兄弟")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))

#### 蒼藍鴿：
two_season <- sentiment%>%
  filter(cate %in% "蒼藍鴿")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))

#### 愛莉莎莎
three_season_1 <- sentiment%>%
  filter(cate %in% "愛莉莎莎")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))

three_season_2 <- sentiment%>%
  filter(cate %in% "愛莉莎莎")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))

#### 蔡阿嘎
four_season <- sentiment%>%
  filter(cate %in% "蔡阿嘎")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))

#### 館長
five_season <- sentiment%>%
  filter(cate %in% "館長")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))

one_season

## Warning: Removed 5 row(s) containing missing values (geom_path).

two_season

## Warning: Removed 1 row(s) containing missing values (geom_path).

three_season_1

## Warning: Removed 335 row(s) containing missing values (geom_path).

three_season_2

## Warning: Removed 351 row(s) containing missing values (geom_path).

four_season

## Warning: Removed 136 row(s) containing missing values (geom_path).

five_season

## Warning: Removed 343 row(s) containing missing values (geom_path).

計算emotion：

ytr_emo <- ytr%>%
  select(artDate,cate,emo)%>%
  group_by(artDate,cate)%>%
  summarise(sum = sum(emo))

## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

ytr_emo

## # A tibble: 1,918 x 3
## # Groups:   artDate [481]
##    artDate    cate       sum
##    <date>     <chr>    <dbl>
##  1 2019-12-31 館長        49
##  2 2020-01-01 老高         1
##  3 2020-01-01 愛莉莎莎     1
##  4 2020-01-01 館長       714
##  5 2020-01-02 老高         6
##  6 2020-01-02 阿神        75
##  7 2020-01-02 唐綺陽    -335
##  8 2020-01-02 館長        41
##  9 2020-01-03 反骨         5
## 10 2020-01-03 唐綺陽    -121
## # ... with 1,908 more rows

畫圖：

### 整體
all_year_emo <- ytr_emo%>%
  filter(cate %in% "愛莉莎莎"|cate %in%"館長"|cate %in%"蒼藍鴿"|cate %in%"蔡阿嘎"|cate %in%"黃氏兄弟")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))

### 個別
#### 黃氏兄弟：
one_emo_season <- ytr_emo%>%
  filter(cate %in% "黃氏兄弟")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))

#### 蒼藍鴿：
two_emo_season <- ytr_emo%>%
  filter(cate %in% "蒼藍鴿")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))

#### 愛莉莎莎
three_emo_season_1 <- ytr_emo%>%
  filter(cate %in% "愛莉莎莎")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2021-01-01','2021-03-31')))

three_emo_season_2 <- ytr_emo%>%
  filter(cate %in% "愛莉莎莎")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-04-01','2020-06-30')))

#### 蔡阿嘎
four_emo_season <- ytr_emo%>%
  filter(cate %in% "蔡阿嘎")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))

#### 館長
five_emo_season <- ytr_emo%>%
  filter(cate %in% "館長")%>%
  ggplot()+
  geom_line(aes(x = artDate,y = sum,colour = cate))+
  scale_x_date(limits = as.Date(c('2020-10-01','2020-12-31')))

one_emo_season

## Warning: Removed 6 row(s) containing missing values (geom_path).

two_emo_season

## Warning: Removed 1 row(s) containing missing values (geom_path).

three_emo_season_1

## Warning: Removed 346 row(s) containing missing values (geom_path).

three_emo_season_2

## Warning: Removed 360 row(s) containing missing values (geom_path).

four_emo_season

## Warning: Removed 148 row(s) containing missing values (geom_path).

five_emo_season

## Warning: Removed 353 row(s) containing missing values (geom_path).