#把資料抓進去
lyric<-fread('./lyric.txt',sep='')
colnames(lyric)<-c('text')
lyric$text<-as.character(lyric$text)
#在清一次欄位中的符號和無用文字
structural_lyric$song<-gsub(regex("'title'")," ",structural_lyric$song)
structural_lyric=structural_lyric[c(6,1,3,4,2,5)]#排列欄位
structural_lyric$year<-gsub("'","",structural_lyric$year)#filter '
structural_lyric$artist<-gsub("'","",structural_lyric$artist)
structural_lyric$song<-gsub("'","",structural_lyric$song)
structural_lyric$song<-gsub('"',"",structural_lyric$song)#把歌名餘留的"濾掉
structural_lyric$lyrics<-gsub("['\"]","",structural_lyric$lyrics)#filter lyric '&"
structural_lyric<-structural_lyric%>%#把remix,mix重複的歌曲刪掉
filter(!str_detect(song,regex('(Remix)|(Mix)')))#filter out the remix#equal=filter(!grepl('Remix',name))
#依照年代分成不同group
six<-c(" The Beatles "," Elvis Presley "," The Supremes "," The Rolling Stones ")
seven<-c(" Bee Gees "," Elton John "," Paul McCartney "," Eagles "," Stevie Wonder ")
eight<-c(" Michael Jackson "," Madonna "," Phil Collins ",' Whitney Houston ',' George Michael ')
nine<-c(' Mariah Carey ',' Janet Jackson ',' Boyz II Men ')
ten<-c(' Usher ',' Rihanna ',' Beyoncé ',' Ludacris ',' Nelly ',' Justin Timberlake ',' 50 Cent ')
structural_lyric<-structural_lyric%>%
mutate(group=ifelse(artist%in%six , 1960, ifelse(artist%in%seven,1970,ifelse(artist%in%eight,1980,ifelse(artist%in%nine,1990,ifelse(artist%in%ten,2000,0))))))
structural_lyric=structural_lyric[c(1,2,7,4,3,5,6)]
#structural_lyric是一個結構化的歌曲資料
#把歌詞斷成token
lyric_word<-structural_lyric%>%
unnest_tokens(token,lyrics,to_lower=FALSE)%>%
filter(!str_detect(token,regex('[0-9]|Intro|Hook|Verse|Chorus')))%>%
filter(!token%in%stop_words$word)%>%
filter(!(token=='I'|token=='la'|token=='Im'))
lyric_word=lyric_word[c(1,2,3,7,4,5,6)]
#lyric_word就是歌詞斷句完畢的資料
#把歌詞用ngram斷成兩個字
lyric_ngram<-structural_lyric%>%
unnest_tokens(bigram,lyrics,token='ngrams',n=2,to_lower = FALSE)%>%
separate(bigram,c('word1','word2'),sep=" ")%>%
filter(!word1%in%stop_words$word)%>%
filter(!word2%in%stop_words$word)%>%
filter(!str_detect(word1,regex('[0-9]|Verse|Hook|Intro')))%>%
filter(!str_detect(word2,regex("[0-9]|Verse|Hook|Intro")))%>%
filter(!(word1=='I'|word1=='la'|word1=='Im'||word1==' G'))%>%
filter(!(word2=='I'|word2=='la'|word2=='Im'||word2==' G'))
# filter(!word1%in%token_remove)%>%
# filter(!word2%in%token_remove)
lyric_ngram=lyric_ngram[c(1,2,3,7,8,4,5,6)]#把欄位排序
#lyric_ngram把歌斷成兩個字在一起
#前處理結束
#算出love girl baby nigga time在各年代次數
specific_dic_j<-c('love','girl','baby','time')
wordcount_j<-lyric_word%>%
filter(token%in%specific_dic_j)%>%
group_by(token,group)%>%
summarise(count=n())
#graph
wordcount_j%>%
ggplot(aes(x=group,y=count,colour=token))+
geom_line(size=2)
#tf-idf:依據歌手來算出個別字在不同歌手中所有歌曲總字量的佔比
lyric_count_j<-lyric_word%>%
group_by(token,artist)%>%
summarise(count=n())%>%
ungroup%>%
group_by(artist)%>%
mutate(total=sum(count),frequency=count/total)%>%
arrange(desc(frequency))
#draw the pic of count
#ggplot(lyric_count_j,aes(count/total,fill=artist))+
# geom_histogram(show.legend = FALSE)+
# facet_wrap(~artist,ncol=2,scales="free_y")
#draw the pic of frequency
#bind_tf_idf:依據年代
lyric_tfidf_j<-lyric_word%>%
group_by(token,artist,group)%>%
summarise(count=n())%>%
ungroup()%>%
bind_tf_idf(token,group,count)%>%
arrange(desc(tf_idf))
#graph
garbage_word_j<-c('lala','Ai','ah','yeh','Isnt','em','ism','De','Ye','bom','Ooo','Hes','B','tat','dah','da','Ba','Fa','di','el','en','mi','II','na','va','la','Ludacris','Cent','Usher','Beyoncé','Timberlake','Nelly','Timberlake','Justin','AT')#the garbage word need to filter
lyric_tfidf_j%>%
filter(group==1970)%>%
filter(!token%in%garbage_word_j)%>%
mutate(word = factor(token, levels = rev(unique(token)))) %>%
group_by(group) %>%
top_n(15) %>%
ungroup() %>%
arrange(desc(tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = group))+
theme(text=element_text(size=18))+
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~group, ncol = 2, scales = "free") +
coord_flip()
## Selecting by word
#建立一歇每個歌手都很常用到 不特殊字的字典
artist_wordcount<-lyric_word%>%#依據artist 做count
group_by(artist,token)%>%
summarise(sum=n())%>%
arrange(desc(sum))
lyric_tfidf<-artist_wordcount%>% #tf-idf:依據artist做tf-idf
bind_tf_idf(token,artist,sum)%>%
arrange(desc(tf_idf))
token_avg<-lyric_tfidf%>% #算出token的平均值
group_by(token)%>%
summarise(avg=mean(tf_idf))
token_avg$avg%>%summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0005012 0.0008465 0.0012700 0.0012731 0.0615269
token_remove<-token_avg%>%
filter(avg<0.0010178)%>%
.$token
token_remove%>%head
## [1] "A" "A.I" "a.k.a" "A.M" "Aah" "AAH"
#visualizaiton a network for bigram
garbage_word2_j<-c('bump','bang','doo','dum','shoop','zoop','mm','AT','II','G','Tu','va','ti','en','la','Ye')
bigram_count_j<-lyric_ngram%>%
filter(!word1%in%garbage_word2_j)%>%
filter(!word2%in%garbage_word_j)%>%
group_by(artist,word1,word2)%>%
summarise(count=n())%>%
arrange(desc(count))
bigram_graph_j<-bigram_count_j%>%
filter(artist%in%nine)%>%
filter(count>5)%>%
as.data.frame()%>%
select(-artist)%>%
graph_from_data_frame()
#graph
a<-grid::arrow(type="closed",length=unit(.05,"inches"))
ggraph(bigram_graph_j,layout='fr')+
geom_edge_link(aes=(edge_alpha=n),show.legend=FALSE,arrow=a,end_cap=circle(.07,'inches'))+
geom_node_point(color="lightblue",size=4)+
geom_node_text(aes(label=name),vjust=1,hjust=1)+
theme_void()
## Warning: Ignoring unknown parameters: aes
##Topic model
#topic model:以一首歌當作一個document
garbage_word3_j<-c('aint','Aint','ain','Fa','da','da','em','gon','don','mi','mm','bang','IS','Tha','Nos','bump','Lou','Ali','ahh','St','aw','DTP','DA','OF','ACT','AT','YA','Ho')
lyric_dtm_j<-lyric_word%>%
filter(!token%in%garbage_word3_j)%>%
group_by(song,artist,token)%>%
summarise(count=n())%>%
cast_dtm(song,token,count)
lyric_lda_j<-LDA(lyric_dtm_j,k=5,control=list(seed=1234))
lyric_topic_j<-tidy(lyric_lda_j,matrix='beta')
#找出topic top word
topic_topterm_j<-lyric_topic_j%>%
group_by(topic)%>%
top_n(10,beta)%>%
ungroup()%>%
arrange(topic)
#graph
topic_topterm_j%>%
mutate(term=reorder(term,beta))%>%
ggplot(aes(term,beta,fill=factor(topic)))+
geom_col(show.legend = FALSE)+
facet_wrap(~ topic,ncol=2, scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。
#找出組間差異最大的值
topic_betaspread_j<-lyric_topic_j%>%
filter(!term%in%garbage_word3_j)%>%
mutate(topic=paste0('topic',topic))%>%
spread(topic,beta)%>%
filter(topic1>0.0004|topic2>0.0004)%>%
mutate(log_ratio=log2(topic2/topic1))
topic_ratio_j<-rbind(topic_betaspread_j%>%top_n(10,wt=log_ratio),topic_betaspread_j%>%top_n(-10,log_ratio))%>%
arrange(log_ratio)
#graph
topic_ratio_j%>%
ggplot(aes(x=reorder(term,log_ratio),y=log_ratio))+
geom_bar(stat="identity") +
xlab("Word")+
coord_flip()+
theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。
#主題分辨
topic_distinct_j<-tidy(lyric_lda_j,matrix='gamma')
#原始資料文字雲
lyric_word%>%
group_by(token)%>%
summarise(count=n())%>%
filter(!token%in%garbage_word3_j)%>%
arrange(desc(count))%>%
wordcloud2()
#劃分兩主題資料
topic_spread_j<-topic_distinct_j%>%
mutate(topic=paste0('topic',topic))%>%
spread(topic,gamma)
#觀察每個年代屬於什麼topic
year_topic_j<-topic_spread_j%>%
inner_join(structural_lyric,by=c('document'='song'))%>%
select(artist,group,document,topic1,topic2)%>%
mutate(topicbelong=ifelse(topic1>topic2,1,2))%>%
group_by(group,topicbelong)%>%
summarise(topicount=n())
#spread(topicbelong,topicount)
#取出topic1最高
topic1_j<-topic_spread_j%>%
top_n(250,topic1)%>%#最接近topic1前250
inner_join(structural_lyric,by=c('document'='song'))%>%
select(document,artist,group,topic1,topic2)
#觀察topic1都在哪個年代
topic1_j%>%
group_by(group)%>%
summarise(count=n())
## # A tibble: 5 x 2
## group count
## <dbl> <int>
## 1 1960 62
## 2 1970 72
## 3 1980 62
## 4 1990 36
## 5 2000 28
#graph
#做成文字雲
lyric_word%>%
filter(song%in%topic1_j$document)%>%
filter(!token%in%garbage_word3_j)%>%
group_by(token)%>%
summarise(count=n())%>%
wordcloud2()
#取出topic2最高
topic2_j<-topic_spread_j%>%
top_n( 250,topic2)%>%
inner_join(structural_lyric,by=c('document'='song'))%>%
select(document,artist,group,topic1,topic2)
topic2_j%>%
group_by(group)%>%
summarise(count=n())
## # A tibble: 5 x 2
## group count
## <dbl> <int>
## 1 1960 56
## 2 1970 62
## 3 1980 35
## 4 1990 20
## 5 2000 84
#graph
lyric_word%>%
filter(song%in%topic2_j$document)%>%
filter(!token%in%garbage_word3_j)%>%
group_by(token)%>%
summarise(count=n())%>%
wordcloud2()
filter_word = c("And","The","dont","but","Oh","yeah","A","But","youre","Aint","So","if","da","No","If","Its","ya","You","We","Yeah","My","They","All","Well","Just","When","Here","That","Now","Theres","Dont","Youre","It","Do","Hes","This","gonna","wanna","To","Cause","She","don","gon","aint","What","Ill","Ive","Id","mi")
z_top_token <- lyric_word%>%
group_by(group,token)%>%
summarise(sum=n())%>%
arrange(group,sum)%>%
filter(sum>5)%>%
filter(!token%in%stop_words$word)%>%
filter(!token%in% filter_word)
z_top_token
## # A tibble: 2,560 x 3
## # Groups: group [5]
## group token sum
## <dbl> <chr> <int>
## 1 1960 After 6
## 2 1960 alright 6
## 3 1960 Aquarius 6
## 4 1960 As 6
## 5 1960 babe 6
## 6 1960 belong 6
## 7 1960 blow 6
## 8 1960 Bompa 6
## 9 1960 boy 6
## 10 1960 Close 6
## # … with 2,550 more rows
z_top_token_1960 <- z_top_token %>%
filter(group == "1960")%>%
arrange(desc(sum))%>%
head(10)%>%
ggplot(aes(x=reorder(token,sum),y=sum)) +
geom_col(show.legend = FALSE) +
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="1960-token", y="數量") +
coord_flip()
z_top_token_1970 <- z_top_token %>%
filter(group == "1970")%>%
arrange(desc(sum))%>%
head(10)%>%
ggplot(aes(x=reorder(token,sum),y=sum)) +
geom_col(show.legend = FALSE) +
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="1970-token", y="數量") +
coord_flip()
z_top_token_1980 <- z_top_token %>%
filter(group == "1980")%>%
arrange(desc(sum))%>%
head(10)%>%
ggplot(aes(x=reorder(token,sum),y=sum)) +
geom_col(show.legend = FALSE) +
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="1980-token", y="數量") +
coord_flip()
z_top_token_1990 <- z_top_token %>%
filter(group == "1990")%>%
arrange(desc(sum))%>%
head(10)%>%
ggplot(aes(x=reorder(token,sum),y=sum)) +
geom_col(show.legend = FALSE) +
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="1990-token", y="數量") +
coord_flip()
z_top_token_2000 <- z_top_token %>%
filter(group == "2000")%>%
arrange(desc(sum))%>%
head(10)%>%
ggplot(aes(x=reorder(token,sum),y=sum)) +
geom_col(show.legend = FALSE)+
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="2000-token", y="數量") +
coord_flip()
z_top_token_1960
z_top_token_1970
z_top_token_1980
z_top_token_1990
z_top_token_2000
z_token_sentiment <- z_top_token%>%
inner_join(get_sentiments("bing"),by = c(token = "word")) %>%
group_by(group,sentiment)%>%
summarise(count=sum(sum))
z_token_sentiment
## # A tibble: 10 x 3
## # Groups: group [5]
## group sentiment count
## <dbl> <chr> <int>
## 1 1960 negative 225
## 2 1960 positive 400
## 3 1970 negative 510
## 4 1970 positive 650
## 5 1980 negative 449
## 6 1980 positive 844
## 7 1990 negative 203
## 8 1990 positive 368
## 9 2000 negative 1980
## 10 2000 positive 1070
z_group_sentiment <- z_token_sentiment%>%
ggplot(aes(x=group,y=count,colour=sentiment , fill = sentiment)) + #x軸為年代,y軸為情緒分數
geom_col(show.legend = TRUE) +
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="年代", y="情緒分數")
z_group_sentiment
z_token <- lyric_word%>%
group_by(group,artist,token)%>%
summarise(sum=n())%>%
arrange(group,artist,sum)%>%
filter(sum>5)%>%
filter(!token%in%stop_words$word)%>%
filter(!token%in% filter_word)
z_token
## # A tibble: 2,412 x 4
## # Groups: group, artist [24]
## group artist token sum
## <dbl> <chr> <chr> <int>
## 1 1960 " Elvis Presley " America 6
## 2 1960 " Elvis Presley " blues 6
## 3 1960 " Elvis Presley " didnt 6
## 4 1960 " Elvis Presley " grace 6
## 5 1960 " Elvis Presley " hold 6
## 6 1960 " Elvis Presley " hundred 6
## 7 1960 " Elvis Presley " lips 6
## 8 1960 " Elvis Presley " live 6
## 9 1960 " Elvis Presley " Maybe 6
## 10 1960 " Elvis Presley " pretty 6
## # … with 2,402 more rows
z_token_sentiment_ne <- z_token%>%
filter(group =="2000")%>%
inner_join(get_sentiments("bing"),by = c(token = "word"))%>%
filter(sentiment == "negative")%>%
group_by(artist)%>%
summarise(count=sum(sum))%>%
arrange(desc(artist,sum))
z_token_sentiment_ne
## # A tibble: 7 x 2
## artist count
## <chr> <int>
## 1 " Usher " 288
## 2 " Rihanna " 114
## 3 " Nelly " 161
## 4 " Ludacris " 260
## 5 " Justin Timberlake " 153
## 6 " Beyoncé " 79
## 7 " 50 Cent " 323
##lyric_ngram把歌斷成兩個字在一起
lyric_ngram<-structural_lyric%>%
unnest_tokens(bigram,lyrics,token='ngrams',n=2,to_lower = FALSE)
lyric_ngram=lyric_ngram[c(1,2,6,7,3,4,5)]#把欄位排序
##將每位歌手的bigram數量統計,沒有stopwords(因為要做bigram情緒分析)
z_bigrams_count = lyric_ngram %>%
#select(group,artist,bigram)%>%
count(group,artist,bigram, sort = TRUE)%>%
separate(bigram,c('word1','word2'),sep=" ")%>%
filter(!str_detect(word1,regex('[0-9]|Verse|Hook|Intro|Chorus|Justin|Timberlake')))%>%
filter(!str_detect(word2,regex("[0-9]|Verse|Hook|Intro|Chorus|Justin|Timberlake")))%>%
filter(!(word1=='I'|word1=='la'|word1=='Im'))%>%
filter(!(word2=='I'|word2=='la'|word2=='Im'))
##加入stopwords
z_stop_words = z_bigrams_count %>%
filter(!word1%in%stop_words$word)%>%
filter(!word2%in%stop_words$word)
z_stop_words
## # A tibble: 23,368 x 5
## group artist word1 word2 n
## <dbl> <chr> <chr> <chr> <int>
## 1 2000 " Usher " bump bump 91
## 2 2000 " Ludacris " bang bang 71
## 3 1970 " Stevie Wonder " day sucker 52
## 4 1970 " Stevie Wonder " All day 44
## 5 2000 " Rihanna " bad bad 43
## 6 1970 " Elton John " Amen Amen 42
## 7 1970 " Stevie Wonder " love All 41
## 8 2000 " Ludacris " meeny miny 38
## 9 2000 " Ludacris " miny moe 38
## 10 1980 " Michael Jackson " da da 37
## # … with 23,358 more rows
##組合word1、word2
z_bigrams_united <- z_stop_words %>%
unite(bigram,word1,word2,sep=" ")
z_top_lyrics_bigrams <- z_bigrams_united %>%
group_by(group)%>%
top_n(10,n)%>%
ungroup()%>%
arrange(group,n)
#z_top_lyrics_bigrams
#
z_top_lyrics_bigrams %>%
arrange(n)%>%
ggplot(aes(x=reorder(bigram,n),y=n,fill=group)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ group, scales = "free") +
coord_flip()
z_AFINN <- get_sentiments("afinn")
z_bing <-get_sentiments("bing")
z_bing
## # A tibble: 6,788 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 2-faces negative
## 3 a+ positive
## 4 abnormal negative
## 5 abolish negative
## 6 abominable negative
## 7 abominably negative
## 8 abominate negative
## 9 abomination negative
## 10 abort negative
## # … with 6,778 more rows
z_sentiment_bigrams <-z_bigrams_count %>%
select(group, word1, word2) %>%
count(group,word1, word2, sort = TRUE)%>%
inner_join(z_AFINN, by = c(word2 = "word")) %>%
mutate(sentiment_tag=ifelse(score>0, "positive", "negative"))
z_sentiment_bigrams
## # A tibble: 6,011 x 6
## group word1 word2 n score sentiment_tag
## <dbl> <chr> <chr> <int> <int> <chr>
## 1 2000 you want 7 1 positive
## 2 2000 act like 6 2 positive
## 3 2000 me like 6 2 positive
## 4 2000 Oh yeah 6 1 positive
## 5 2000 yeah yeah 6 1 positive
## 6 1970 a dream 5 1 positive
## 7 1970 my love 5 3 positive
## 8 1980 a dream 5 1 positive
## 9 1980 dont want 5 1 positive
## 10 1980 in love 5 3 positive
## # … with 6,001 more rows
z_sentiment_plot_data <- z_sentiment_bigrams %>%
group_by(group,sentiment_tag) %>%
summarise(count=n())
z_sentiment_plot_data
## # A tibble: 10 x 3
## # Groups: group [5]
## group sentiment_tag count
## <dbl> <chr> <int>
## 1 1960 negative 275
## 2 1960 positive 427
## 3 1970 negative 464
## 4 1970 positive 577
## 5 1980 negative 462
## 6 1980 positive 689
## 7 1990 negative 253
## 8 1990 positive 372
## 9 2000 negative 1333
## 10 2000 positive 1159
# 最後把圖畫出來
z_sentiment_plot_data %>%
ggplot()+
geom_line(aes(x=group,y=count,colour=sentiment_tag))+
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="年代", y="數量")
z_negative <- c("not", "no", "never", "without")
z_negative_words <- z_bigrams_count %>%
filter(word1 %in% z_negative)%>%
inner_join(z_AFINN, by = c(word2 = "word")) %>%
count(group,word1,word2, score, sort = TRUE)
#mutate(score = ifelse(sentiment=="positive",1, -1))
z_negative_words
## # A tibble: 130 x 5
## group word1 word2 score n
## <dbl> <chr> <chr> <int> <int>
## 1 2000 no no -1 5
## 2 1980 no no -1 3
## 3 1990 no no -1 3
## 4 2000 no doubt -1 3
## 5 1960 no loving 2 2
## 6 1960 no no -1 2
## 7 1970 never die -3 2
## 8 1980 not good 3 2
## 9 2000 never hurt -2 2
## 10 2000 no wrong -2 2
## # … with 120 more rows
# 查看 前面出現否定詞 和 後面的所有詞彙
z_negative_all_words <- z_bigrams_count%>%
filter(word1 %in% z_negative) %>%
count(group,word1, word2, sort = TRUE)
z_negative_all_words
## # A tibble: 721 x 4
## group word1 word2 n
## <dbl> <chr> <chr> <int>
## 1 2000 no more 6
## 2 2000 not a 6
## 3 1980 never gonna 5
## 4 2000 no no 5
## 5 1970 no one 4
## 6 1980 no more 4
## 7 1980 no one 4
## 8 2000 never be 4
## 9 1960 never let 3
## 10 1960 never take 3
## # … with 711 more rows
# 如果在情緒詞前出現的是否定詞的話,則將他的情緒對調
z_negative_bigrams <-z_negative_words %>%
mutate(sentiment=ifelse(word1 %in% z_negative, -1*score, score)) %>%
mutate(sentiment_tag=ifelse(score>0, "positive", "negative"))
z_negative_bigrams
## # A tibble: 130 x 7
## group word1 word2 score n sentiment sentiment_tag
## <dbl> <chr> <chr> <int> <int> <dbl> <chr>
## 1 2000 no no -1 5 1 negative
## 2 1980 no no -1 3 1 negative
## 3 1990 no no -1 3 1 negative
## 4 2000 no doubt -1 3 1 negative
## 5 1960 no loving 2 2 -2 positive
## 6 1960 no no -1 2 1 negative
## 7 1970 never die -3 2 3 negative
## 8 1980 not good 3 2 -3 positive
## 9 2000 never hurt -2 2 2 negative
## 10 2000 no wrong -2 2 2 negative
## # … with 120 more rows
# 計算我們資料集中 每年代的情緒值
z_negated_sentiment_plot_data <- z_negative_bigrams %>%
group_by(group,sentiment_tag) %>%
summarise(count=n())
z_negated_sentiment_plot_data
## # A tibble: 10 x 3
## # Groups: group [5]
## group sentiment_tag count
## <dbl> <chr> <int>
## 1 1960 negative 13
## 2 1960 positive 5
## 3 1970 negative 14
## 4 1970 positive 14
## 5 1980 negative 13
## 6 1980 positive 11
## 7 1990 negative 11
## 8 1990 positive 6
## 9 2000 negative 28
## 10 2000 positive 15
# 最後把圖畫出來
z_negated_sentiment_plot_data %>%
ggplot()+
geom_line(aes(x=group,y=count,colour=sentiment_tag))+
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="年代", y="數量")
z_all_sentiments <- bind_rows(
z_sentiment_plot_data %>% mutate(sentiment_tag=paste(sentiment_tag, "_original", sep = "")),
z_negated_sentiment_plot_data %>% mutate(sentiment_tag=paste(sentiment_tag, "_negated", sep = "")))
z_all_sentiments
## # A tibble: 20 x 3
## # Groups: group [5]
## group sentiment_tag count
## <dbl> <chr> <int>
## 1 1960 negative_original 275
## 2 1960 positive_original 427
## 3 1970 negative_original 464
## 4 1970 positive_original 577
## 5 1980 negative_original 462
## 6 1980 positive_original 689
## 7 1990 negative_original 253
## 8 1990 positive_original 372
## 9 2000 negative_original 1333
## 10 2000 positive_original 1159
## 11 1960 negative_negated 13
## 12 1960 positive_negated 5
## 13 1970 negative_negated 14
## 14 1970 positive_negated 14
## 15 1980 negative_negated 13
## 16 1980 positive_negated 11
## 17 1990 negative_negated 11
## 18 1990 positive_negated 6
## 19 2000 negative_negated 28
## 20 2000 positive_negated 15
z_all_sentiments_after <- bind_rows(z_sentiment_plot_data ,z_negated_sentiment_plot_data )%>%
group_by(group,sentiment_tag)%>%
summarise(count=sum(count))
z_sentiment_plot_data
## # A tibble: 10 x 3
## # Groups: group [5]
## group sentiment_tag count
## <dbl> <chr> <int>
## 1 1960 negative 275
## 2 1960 positive 427
## 3 1970 negative 464
## 4 1970 positive 577
## 5 1980 negative 462
## 6 1980 positive 689
## 7 1990 negative 253
## 8 1990 positive 372
## 9 2000 negative 1333
## 10 2000 positive 1159
z_negated_sentiment_plot_data
## # A tibble: 10 x 3
## # Groups: group [5]
## group sentiment_tag count
## <dbl> <chr> <int>
## 1 1960 negative 13
## 2 1960 positive 5
## 3 1970 negative 14
## 4 1970 positive 14
## 5 1980 negative 13
## 6 1980 positive 11
## 7 1990 negative 11
## 8 1990 positive 6
## 9 2000 negative 28
## 10 2000 positive 15
z_all_sentiments_after
## # A tibble: 10 x 3
## # Groups: group [5]
## group sentiment_tag count
## <dbl> <chr> <int>
## 1 1960 negative 288
## 2 1960 positive 432
## 3 1970 negative 478
## 4 1970 positive 591
## 5 1980 negative 475
## 6 1980 positive 700
## 7 1990 negative 264
## 8 1990 positive 378
## 9 2000 negative 1361
## 10 2000 positive 1174
z_all_sentiments_after %>%
ggplot()+
geom_line(aes(x=group,y=count,colour=sentiment_tag))+
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="年代", y="數量")
z_all_sentiments %>%
filter(sentiment_tag %in% c("positive_original", "positive_negated")) %>%
ggplot()+
geom_line(aes(x=group,y=count,colour=sentiment_tag))+
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="年代", y="數量")
z_all_sentiments %>%
filter(sentiment_tag %in% c("negative_original", "negative_negated")) %>%
ggplot()+
geom_line(aes(x=group,y=count,colour=sentiment_tag)) +
theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
labs(x="年代", y="數量")