Topic Modeling
# Load the textdata package and the AG News dataset
library(textdata)
news <- dataset_ag_news(split="test")
news
## # A tibble: 7,600 × 3
## class title description
## <chr> <chr> <chr>
## 1 Business Fears for T N pension after talks "Unions re…
## 2 Sci/Tech The Race is On: Second Private Team Sets Launch Date fo… "SPACE.com…
## 3 Sci/Tech Ky. Company Wins Grant to Study Peptides (AP) "AP - A co…
## 4 Sci/Tech Prediction Unit Helps Forecast Wildfires (AP) "AP - It's…
## 5 Sci/Tech Calif. Aims to Limit Farm-Related Smog (AP) "AP - Sout…
## 6 Sci/Tech Open Letter Against British Copyright Indoctrination in… "The Briti…
## 7 Sci/Tech Loosing the War on Terrorism "\\\\\"Sve…
## 8 Sci/Tech FOAFKey: FOAF, PGP, Key Distribution, and Bloom Filters "\\\\FOAF/…
## 9 Sci/Tech E-mail scam targets police chief "Wiltshire…
## 10 Sci/Tech Card fraud unit nets 36,000 cards "In its fi…
## # ℹ 7,590 more rows
table(news$class)
##
## Business Sci/Tech Sports World
## 1900 1900 1900 1900
# Install and load the tm package
install.packages("tm")
##
## The downloaded binary packages are in
## /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(tm)
## Loading required package: NLP
# Convert the descriptions to a VCorpus object
docs <- VCorpus(VectorSource(news$description))
docs
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 7600
# Examine the format of the data
lapply(docs, content)[1:3]
## $`1`
## [1] "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."
##
## $`2`
## [1] "SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket."
##
## $`3`
## [1] "AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins."
# Convert everything to lowercase
docs <- tm_map(docs, content_transformer(tolower))
# Define a function that removes URLs
myRemove <- function(x, pattern)
{return(gsub(pattern, "", x))}
# Remove URL patterns (http, https, ftp) and 'www' patterns
docs <- tm_map(docs, content_transformer(myRemove), "(f|ht)tp\\S+\\s*")
docs <- tm_map(docs, content_transformer(myRemove), "www\\.+\\S+")
# Define the list of English stopwords, extended with additional custom words
mystopwords <- c(stopwords("english"),
c("first", "second", "one", "two", "three", "four", "another",
"last", "least", "just", "will", "week", "weeks", "quot",
"ago", "day", "days", "night", "nights", "month", "months",
"years", "year", "next", "now", "today", "yesterday",
"may", "new", "york", "according", "back", "say", "says",
"said", "can", "make", "made", "reuters", "monday", "tuesday",
"wednesday", "thursday", "friday", "saturday", "sunday"))
# Remove the stopwords from the documents
docs <- tm_map(docs, removeWords, mystopwords)
# Define a function that replaces a given pattern with a space
toSpace <- function(x, pattern)
{return(gsub(pattern, " ", x))}
# Replace specific symbols in the text with spaces
docs <- tm_map(docs, content_transformer(toSpace), ":")
docs <- tm_map(docs, content_transformer(toSpace), ";")
docs <- tm_map(docs, content_transformer(toSpace), "/")
docs <- tm_map(docs, content_transformer(toSpace), "\\.")
docs <- tm_map(docs, content_transformer(toSpace), "\\\\")
# Remove punctuation and numbers, and trim extra whitespace
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, content_transformer(trimws))
# Check the transformation results
lapply(docs, content)[1:3]
## $`1`
## [1] "unions representing workers turner newall disappointed talks stricken parent firm federal mogul"
##
## $`2`
## [1] "space com toronto canada team rocketeers competing million ansari x prize contest privately funded suborbital space flight officially announced launch date manned rocket"
##
## $`3`
## [1] "ap company founded chemistry researcher university louisville won grant develop method producing better peptides short chains amino acids building blocks proteins"
# Create the document-term matrix (DTM) and tag row names with the news category
dtm <- DocumentTermMatrix(docs)
rownames(dtm) <- paste0(rownames(dtm), "-", news$class)
inspect(dtm)
## <<DocumentTermMatrix (documents: 7600, terms: 20319)>>
## Non-/sparse entries: 126245/154298155
## Sparsity : 100%
## Maximal term length: 29
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs company corp government inc million oil president time united
## 1755-Sci/Tech 0 0 0 0 0 0 0 0 0
## 2755-Sci/Tech 0 0 0 0 0 0 0 0 0
## 3110-Sci/Tech 0 0 0 0 0 0 0 0 0
## 5130-Business 1 0 0 0 0 0 0 0 0
## 6-Sci/Tech 0 0 0 0 0 0 0 0 0
## 6307-Sci/Tech 0 0 0 0 0 0 0 0 0
## 6595-Sci/Tech 0 0 0 0 2 0 0 0 0
## 7-Sci/Tech 1 0 0 0 0 0 0 0 0
## 7000-Sci/Tech 0 0 0 0 1 0 0 0 0
## 8-Sci/Tech 0 0 0 0 0 0 0 0 0
## Terms
## Docs world
## 1755-Sci/Tech 0
## 2755-Sci/Tech 0
## 3110-Sci/Tech 0
## 5130-Business 0
## 6-Sci/Tech 0
## 6307-Sci/Tech 0
## 6595-Sci/Tech 0
## 7-Sci/Tech 0
## 7000-Sci/Tech 0
## 8-Sci/Tech 0
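# Load the slam package to summarize term frequencies over the sparse DTM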
library(slam)
summary(col_sums(dtm))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 2.00 6.42 4.00 448.00
# Print the five most frequent terms, sorted in descending order
col_sums(dtm)[order(col_sums(dtm), decreasing=TRUE)][1:5]
## inc company world president million
## 448 425 387 338 319
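Beyond raw frequencies, tm's findAssocs() lists terms whose document co-occurrence is correlated with a given term; a minimal sketch (the 0.2 correlation threshold is an arbitrary choice for illustration):
# terms correlated with "oil" at a correlation of at least 0.2
findAssocs(dtm, terms="oil", corlimit=0.2)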
# Find high-frequency terms that appear at least 250 times
findFreqTerms(dtm, lowfreq = 250)
## [1] "company" "corp" "government" "group" "inc"
## [6] "million" "oil" "people" "president" "time"
## [11] "united" "world"
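LDA cannot be fit if any document lost all of its terms during preprocessing, so it is worth confirming that no DTM row is empty before modeling; a quick check (not part of the original output) using slam::row_sums() on the dtm above:
# count documents whose every term was removed during preprocessing
empty <- row_sums(dtm) == 0
sum(empty)
# any such rows could be dropped before fitting LDA, e.g. dtm <- dtm[!empty, ]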
# Install and load the topicmodels package
install.packages("topicmodels")
##
## The downloaded binary packages are in
## /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(topicmodels)
# Fit the LDA model: four topics (k=4), using Gibbs sampling
news.lda <- LDA(dtm, k=4, method="Gibbs",
control=list(seed=123, burnin=1000, iter=1000, thin=100))
class(news.lda)
## [1] "LDA_Gibbs"
## attr(,"package")
## [1] "topicmodels"
# Check the topics assigned to the first five documents
topics(news.lda)[1:5]
## 1-Business 2-Sci/Tech 3-Sci/Tech 4-Sci/Tech 5-Sci/Tech
## 3 4 2 4 3
# Check the number of documents assigned to each topic
table(topics(news.lda))
##
## 1 2 3 4
## 2109 2031 1808 1652
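Since the AG News categories are known, the dominant topic of each document can be cross-tabulated against the original class to see how closely the four topics track the four categories; a quick check, not part of the original output:
# rows: true news category, columns: topic with the highest assignment
table(news$class, topics(news.lda))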
# Print the 10 most important words in each topic
terms(news.lda, 10)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "president" "world" "inc" "microsoft"
## [2,] "united" "time" "company" "announced"
## [3,] "people" "game" "million" "software"
## [4,] "government" "season" "corp" "internet"
## [5,] "iraq" "team" "oil" "plans"
## [6,] "state" "win" "percent" "security"
## [7,] "afp" "league" "prices" "news"
## [8,] "minister" "top" "stocks" "service"
## [9,] "officials" "victory" "group" "technology"
## [10,] "states" "home" "international" "computer"
# Examine the structure of the LDA model object
str(news.lda, max.level=2, nchar.max=50)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:130442] 2 3 3 1 3 3 1 3 1 3 ...
## ..@ alpha : num 12.5
## ..@ call : language LDA(x = dtm, k = 4, method = "Gibbs| __truncated__
## ..@ Dim : int [1:2] 7600 20319
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 4
## ..@ terms : chr [1:20319] "aakash" "aapl" "aaron" "aarons" ...
## ..@ documents : chr [1:7600] "1-Business" "2-Sci/Tech" "3-Sci/Tech" "4-Sci/Tech" ...
## ..@ beta : num [1:4, 1:20319] -10.4 -12.7 -12.8 -12.7 -10.4 ...
## ..@ gamma : num [1:7600, 1:4] 0.25 0.201 0.221 0.217 0.261 ...
## ..@ wordassignments:List of 5
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -1049375
## ..@ iter : int 100
## ..@ logLiks : num(0)
## ..@ n : int 130442
# Inspect the per-topic word distribution (beta matrix); print the first five terms
news.lda@beta[, 1:5]
## [,1] [,2] [,3] [,4] [,5]
## [1,] -10.37187 -10.371868 -9.725241 -10.37187 -12.76976
## [2,] -12.74548 -12.745483 -12.745483 -12.74548 -12.74548
## [3,] -12.76047 -9.326485 -12.760472 -12.76047 -9.71595
## [4,] -12.74575 -10.347850 -9.701223 -12.74575 -12.74575
# Exponentiate the beta values to print them as probabilities
exp(news.lda@beta[, 1:5])
## [,1] [,2] [,3] [,4] [,5]
## [1,] 3.130077e-05 3.130077e-05 5.975602e-05 3.130077e-05 2.845525e-06
## [2,] 2.915460e-06 2.915460e-06 2.915460e-06 2.915460e-06 2.915460e-06
## [3,] 2.872086e-06 8.903466e-05 2.872086e-06 2.872086e-06 6.031380e-05
## [4,] 2.914696e-06 3.206165e-05 6.120861e-05 2.914696e-06 2.914696e-06
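Each row of beta is a log probability distribution over the full vocabulary, so exponentiating and summing across all terms should give 1 for every topic; a quick sanity check (not part of the original output):
# each topic's word probabilities should sum to (approximately) 1
rowSums(exp(news.lda@beta))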
# Use the tidytext package to convert the LDA model to tidy format
library(tidytext)
news.term <- tidy(news.lda, matrix="beta")
news.term
## # A tibble: 81,276 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 aakash 0.0000313
## 2 2 aakash 0.00000292
## 3 3 aakash 0.00000287
## 4 4 aakash 0.00000291
## 5 1 aapl 0.0000313
## 6 2 aapl 0.00000292
## 7 3 aapl 0.0000890
## 8 4 aapl 0.0000321
## 9 1 aaron 0.0000598
## 10 2 aaron 0.00000292
## # ℹ 81,266 more rows
# Use the dplyr package to extract the top five words for each topic
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
news.term.top <- news.term %>%
group_by(topic) %>%
slice_max(order_by=beta, n=5) %>%
ungroup() %>%
arrange(topic, -beta)
news.term.top
## # A tibble: 20 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 president 0.00959
## 2 1 united 0.00780
## 3 1 people 0.00774
## 4 1 government 0.00734
## 5 1 iraq 0.00621
## 6 2 world 0.0112
## 7 2 time 0.00817
## 8 2 game 0.00726
## 9 2 season 0.00662
## 10 2 team 0.00639
## 11 3 inc 0.0129
## 12 3 company 0.0121
## 13 3 million 0.00916
## 14 3 corp 0.00839
## 15 3 oil 0.00790
## 16 4 microsoft 0.00685
## 17 4 announced 0.00665
## 18 4 software 0.00665
## 19 4 internet 0.00601
## 20 4 plans 0.00542
# Plot the five highest-probability words for each topic
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(news.term.top,
aes(reorder_within(x=term, by=beta, within=topic),
beta, fill=factor(topic))) +
geom_col(show.legend=FALSE) +
facet_wrap(~ paste("Topic", topic), scales="free") +
scale_x_reordered() +
coord_flip() +
theme_minimal() +
labs(x=NULL, y="Word-Topic Probability (Beta)",
title="News") +
theme(plot.title=element_text(face="bold"),
strip.text=element_text(face="bold"))

# Check the topic distribution matrix (gamma) for the first five documents
news.lda@gamma[1:5,]
## [,1] [,2] [,3] [,4]
## [1,] 0.2500000 0.2177419 0.3145161 0.2177419
## [2,] 0.2013889 0.2152778 0.2152778 0.3680556
## [3,] 0.2214286 0.3357143 0.2214286 0.2214286
## [4,] 0.2171053 0.2302632 0.2302632 0.3223684
## [5,] 0.2611940 0.2313433 0.2910448 0.2164179
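Analogously, each row of gamma is a probability distribution over the four topics, so the rows should sum to 1; a quick sanity check (not part of the original output):
# each document's topic probabilities should sum to 1
rowSums(news.lda@gamma[1:5, ])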
# Convert the document-topic distribution of the LDA model to tidy format
news.doc <- tidy(news.lda, matrix="gamma")
news.doc
## # A tibble: 30,400 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 1-Business 1 0.25
## 2 2-Sci/Tech 1 0.201
## 3 3-Sci/Tech 1 0.221
## 4 4-Sci/Tech 1 0.217
## 5 5-Sci/Tech 1 0.261
## 6 6-Sci/Tech 1 0.194
## 7 7-Sci/Tech 1 0.173
## 8 8-Sci/Tech 1 0.150
## 9 9-Sci/Tech 1 0.233
## 10 10-Sci/Tech 1 0.254
## # ℹ 30,390 more rows
# Load the tidyr package and split the document column into id and category
library(tidyr)
news.doc <- news.doc %>%
separate(document, c("id", "category"), sep="-", convert=TRUE)
news.doc
## # A tibble: 30,400 × 4
## id category topic gamma
## <int> <chr> <int> <dbl>
## 1 1 Business 1 0.25
## 2 2 Sci/Tech 1 0.201
## 3 3 Sci/Tech 1 0.221
## 4 4 Sci/Tech 1 0.217
## 5 5 Sci/Tech 1 0.261
## 6 6 Sci/Tech 1 0.194
## 7 7 Sci/Tech 1 0.173
## 8 8 Sci/Tech 1 0.150
## 9 9 Sci/Tech 1 0.233
## 10 10 Sci/Tech 1 0.254
## # ℹ 30,390 more rows
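With id and category separated, one natural follow-up is to keep only the most probable topic per document and cross-tabulate it against the category; a minimal sketch (news.class is a new name introduced here for illustration):
# keep the single highest-gamma topic for each document
news.class <- news.doc %>%
group_by(id, category) %>%
slice_max(order_by=gamma, n=1, with_ties=FALSE) %>%
ungroup()
# rows: news category, columns: most probable topic
table(news.class$category, news.class$topic)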
# Plot the distribution of topic-assignment probabilities by category
library(ggplot2)
ggplot(news.doc, aes(factor(topic), gamma, fill=category)) +
geom_boxplot(color="gray50", show.legend=FALSE,
outlier.shape=21, outlier.color="black", outlier.fill="gray") +
facet_wrap(~ factor(category), scales="free") + # split the plot by category
scale_fill_brewer(palette="Dark2") +
theme_bw() +
labs(x="Topic", y="Document-Topic Probability (Gamma)",
title="News") +
theme(strip.background=element_rect(fill="aliceblue"),
strip.text=element_text(face="bold", color="steelblue"),
plot.title=element_text(face="bold"),
panel.grid.major.x=element_blank(),
panel.grid.minor.x=element_blank())

Sentiment Analysis
# Load the tidytext package and inspect the sentiments dataset
library(tidytext)
sentiments
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
install.packages("textdata")
##
## The downloaded binary packages are in
## /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(textdata)
# Load each sentiment lexicon (afinn, nrc, loughran)
get_sentiments(lexicon="afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
get_sentiments(lexicon="nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
get_sentiments(lexicon="loughran")
## # A tibble: 4,150 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ℹ 4,140 more rows
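The lexicons differ considerably in size and scoring scheme; a quick way to compare their vocabularies (a minimal sketch, not part of the original output):
# number of entries in each sentiment lexicon
sapply(c("bing", "afinn", "nrc", "loughran"),
function(x) nrow(get_sentiments(x)))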
# Load the required packages
library(dplyr)
library(tibble)
library(purrr)
library(readr)
install.packages("lubridate")
##
## The downloaded binary packages are in
## /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Download the tweet data and save it to a local file
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00438/Health-News-Tweets.zip"
local.copy <- tempfile()
download.file(url, destfile=local.copy, mode="wb")
Sys.setlocale("LC_TIME", "en_US.UTF-8") #로케일 설정
## [1] "en_US.UTF-8"
# Read the tweet files, combine them, and convert to a tibble
health.twitter <-
map(unzip(zipfile=local.copy,
files=c("Health-Tweets/bbchealth.txt",
"Health-Tweets/cnnhealth.txt",
"Health-Tweets/foxnewshealth.txt",
"Health-Tweets/NBChealth.txt")),
read_delim, delim="|", quote="",
col_types=list(col_character(), col_character(), col_character()),
col_names=c("id", "datetime", "tweet")) %>%
map2(c("bbc", "cnn", "foxnews", "nbc"), ~cbind(.x, source=.y)) %>%
reduce(bind_rows) %>%
as_tibble() %>%
mutate(datetime=ymd_hms(strptime(datetime, "%a %b %d %H:%M:%S +0000 %Y")))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
unlink(local.copy)
Sys.setlocale() # reset the locale to the system default
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
# Check the number of tweets by source
health.twitter %>%
count(source)
## # A tibble: 4 × 2
## source n
## <chr> <int>
## 1 bbc 3929
## 2 cnn 4061
## 3 foxnews 2000
## 4 nbc 4215
# Perform text preprocessing
library(tidytext)
library(stringr)
health.words <- health.twitter %>%
select(-id) %>%
mutate(tweet=str_replace_all(tweet, pattern="(f|ht)tp\\S+\\s*", replacement="")) %>%
mutate(tweet=str_replace_all(tweet, pattern="\\d+", replacement="")) %>%
mutate(tweet=str_replace_all(tweet, pattern="\\bRT", replacement="")) %>%
mutate(tweet=str_replace_all(tweet, pattern="@\\S+", replacement="")) %>%
mutate(tweet=str_replace_all(tweet, pattern="&", replacement="")) %>%
unnest_tokens(word, tweet) # tokenize and extract words
health.words
## # A tibble: 124,559 × 3
## datetime source word
## <dttm> <chr> <chr>
## 1 2015-04-09 01:31:50 bbc breast
## 2 2015-04-09 01:31:50 bbc cancer
## 3 2015-04-09 01:31:50 bbc risk
## 4 2015-04-09 01:31:50 bbc test
## 5 2015-04-09 01:31:50 bbc devised
## 6 2015-04-08 23:30:18 bbc gp
## 7 2015-04-08 23:30:18 bbc workload
## 8 2015-04-08 23:30:18 bbc harming
## 9 2015-04-08 23:30:18 bbc care
## 10 2015-04-08 23:30:18 bbc bma
## # ℹ 124,549 more rows
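For a plain word-frequency look at the tweets (separate from the lexicon join below, which keeps only sentiment words), common English stop words could be removed with tidytext's stop_words table; a minimal sketch:
# optional: drop standard English stop words before counting word frequencies
health.words %>%
anti_join(stop_words, by="word") %>%
count(word, sort=TRUE)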
# Sentiment analysis: extract the top 10 words for each sentiment
health.sentiment <- health.words %>%
inner_join(get_sentiments("bing"), by="word") %>%
count(word, sentiment, sort=TRUE) %>%
group_by(sentiment) %>%
slice_max(order_by=n, n=10) %>%
ungroup() %>%
mutate(nsign=ifelse(sentiment=="negative", -n, n))
health.sentiment
## # A tibble: 20 × 4
## word sentiment n nsign
## <chr> <chr> <int> <int>
## 1 cancer negative 701 -701
## 2 risk negative 317 -317
## 3 death negative 193 -193
## 4 outbreak negative 188 -188
## 5 virus negative 147 -147
## 6 fat negative 103 -103
## 7 loss negative 103 -103
## 8 deadly negative 102 -102
## 9 warning negative 96 -96
## 10 pain negative 92 -92
## 11 work positive 210 210
## 12 healthy positive 158 158
## 13 patient positive 155 155
## 14 good positive 139 139
## 15 like positive 139 139
## 16 better positive 135 135
## 17 support positive 102 102
## 18 free positive 95 95
## 19 love positive 90 90
## 20 best positive 87 87
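The bing lexicon only labels words positive or negative; as an alternative view, the AFINN lexicon's numeric scores can be summed into a net sentiment per source. A sketch, not part of the original analysis:
# net AFINN sentiment score and number of matched words by news source
health.words %>%
inner_join(get_sentiments("afinn"), by="word") %>%
group_by(source) %>%
summarise(net_score=sum(value), words=n())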
# Create the plot in [Figure 7-9]
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## The following object is masked from 'package:purrr':
##
## discard
ggplot(health.sentiment,
aes(x=reorder(word, nsign), y=nsign,
fill=factor(sentiment, levels=c("positive", "negative")))) +
geom_col(color="lightslategray", width=0.8) +
geom_text(aes(label=n), size=3, color="black",
hjust=ifelse(health.sentiment$nsign < 0, 1.1, -0.1)) +
scale_fill_manual(values=c("cornflowerblue", "tomato")) +
scale_y_continuous(breaks=pretty(health.sentiment$nsign),
labels=abs(pretty(health.sentiment$nsign))) +
coord_flip() +
labs(x=NULL, y="Count",
title="Health News Tweets") +
theme_minimal() +
theme(legend.position="bottom",
legend.title=element_blank(),
plot.title=element_text(face="bold"),
axis.text=element_text(face="bold", size=10))

# Re-run the analysis after excluding medical terms
health.sentiment <- health.words %>%
inner_join(get_sentiments("bing"), by="word") %>%
filter(!(word %in% c("patient", "cancer", "virus"))) %>%
count(word, sentiment, sort=TRUE) %>%
group_by(sentiment) %>%
slice_max(order_by=n, n=10) %>%
ungroup() %>%
mutate(nsign=ifelse(sentiment=="negative", -n, n))
# Create the plot in [Figure 7-10]
library(ggplot2)
ggplot(health.sentiment,
aes(x=reorder(word, n), y=n,
fill=factor(sentiment, levels=c("positive", "negative")))) +
geom_col(color="lightslategray", width=0.6, show.legend = FALSE) +
geom_text(aes(label=n), size=3, color="black", hjust=1.2) +
scale_fill_manual(values=c("lightsteelblue1", "lightsalmon1")) +
facet_wrap(~ factor(sentiment, levels=c("positive", "negative")),
ncol=2, scales="free") +
coord_flip() +
labs(x=NULL, y="Count",
title="Health News Tweets") +
theme_light() +
theme(plot.title=element_text(face="bold"),
axis.line=element_line(color="gray"),
axis.text=element_text(face="bold", size=10))

# Create the word cloud in [Figure 7-11]
install.packages("wordcloud")
##
## The downloaded binary packages are in
## /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
set.seed(123)
health.words %>%
inner_join(get_sentiments("bing"), by="word") %>%
filter(!(word %in% c("patient", "cancer", "virus"))) %>%
count(word, sentiment, sort=TRUE) %>%
ungroup() %>%
acast(formula=word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("tomato", "cornflowerblue"), title.size=2,
title.colors=c("red", "blue"), title.bg.colors=c("wheat"),
scale=c(4, 0.3), max.words=200, match.colors=TRUE)

# [Figure 7-12] Health news tweets (top 10 positive/negative words by source)
health.sentiment <- health.words %>%
inner_join(get_sentiments("bing"), by="word") %>%
filter(!(word %in% c("patient", "cancer", "virus"))) %>%
count(word, sentiment, source, sort=TRUE) %>%
ungroup() %>%
group_by(source, sentiment) %>%
slice_max(order_by = n, n=10) %>%
ungroup()
health.sentiment
## # A tibble: 86 × 4
## word sentiment source n
## <chr> <chr> <chr> <int>
## 1 risk negative bbc 89
## 2 death negative bbc 72
## 3 warning negative bbc 54
## 4 crisis negative bbc 35
## 5 abuse negative bbc 29
## 6 outbreak negative bbc 26
## 7 threat negative bbc 26
## 8 fat negative bbc 22
## 9 poor negative bbc 22
## 10 strike negative bbc 22
## # ℹ 76 more rows
# Draw the plot
library(ggplot2)
ggplot(health.sentiment,
aes(reorder_within(x=word, by=n, within=source), n, fill=source)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ factor(source,
labels=c("BBC", "CNN", "Fox News", "NBC")) + sentiment,
ncol=2, scales="free") +
scale_x_reordered() +
coord_flip() +
labs(x=NULL, y="Count",
title="Health News Tweets") +
theme_light() +
theme(strip.background=element_blank(),
strip.text=element_text(color="goldenrod4", face="bold"),
plot.title=element_text(face="bold"),
axis.line=element_line(color="gray"),
axis.text=element_text(face="bold", size=10),
panel.grid.minor=element_blank())

# Analyze the trend over time
library(lubridate)
health.sentiment <- health.words %>%
inner_join(get_sentiments("bing"), by="word") %>%
filter(!(word %in% c("patient", "cancer", "virus"))) %>%
mutate(time=floor_date(x=datetime, unit="month")) %>%
count(sentiment, time) %>%
group_by(sentiment) %>%
slice(2:(n()-1)) %>%
ungroup()
health.sentiment
## # A tibble: 62 × 3
## sentiment time n
## <chr> <dttm> <int>
## 1 negative 2012-09-01 00:00:00 86
## 2 negative 2012-10-01 00:00:00 94
## 3 negative 2012-11-01 00:00:00 54
## 4 negative 2012-12-01 00:00:00 65
## 5 negative 2013-01-01 00:00:00 135
## 6 negative 2013-02-01 00:00:00 188
## 7 negative 2013-03-01 00:00:00 183
## 8 negative 2013-04-01 00:00:00 197
## 9 negative 2013-05-01 00:00:00 212
## 10 negative 2013-06-01 00:00:00 176
## # ℹ 52 more rows
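The monthly counts can also be spread into positive/negative columns to compute a net sentiment per month; a minimal sketch with tidyr::pivot_wider() (tidyr was loaded earlier):
# monthly net sentiment: positive minus negative word counts
health.sentiment %>%
pivot_wider(names_from=sentiment, values_from=n, values_fill=0) %>%
mutate(net=positive - negative)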
# Create the plot in [Figure 7-13]
Sys.setlocale("LC_TIME", "en_US.UTF-8")
## [1] "en_US.UTF-8"
library(ggplot2)
ggplot(health.sentiment, aes(x=time, y=n, fill=sentiment, color=sentiment)) +
geom_area(position="identity", alpha=0.3) +
geom_line(size=1.5) +
scale_fill_manual(labels=c("Negative", "Positive"),
values=c("orangered", "deepskyblue2")) +
scale_color_manual(labels=c("Negative", "Positive"),
values=c("orangered", "deepskyblue2")) +
scale_x_datetime(date_labels="%b %Y", date_breaks="6 months") +
labs(x=NULL, y="Count",
title="Health News Twetts") +
theme_minimal() +
theme(plot.title=element_text(face="bold"),
axis.text=element_text(face="bold"),
legend.position="bottom",
legend.title=element_blank())
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Sys.setlocale()
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
# [Figure 7-14] Plot the trend in positive/negative word frequency by source
Sys.setlocale("LC_TIME", "en_US.UTF-8")
## [1] "en_US.UTF-8"
library(ggplot2)
health.words %>%
inner_join(get_sentiments("bing"), by="word") %>%
filter(!(word %in% c("patient", "cancer", "virus"))) %>%
mutate(time=floor_date(datetime, unit="month")) %>%
count(source, sentiment, time) %>%
group_by(source, sentiment) %>%
slice(2:(n()-1)) %>%
ungroup() %>%
ggplot(aes(x=time, y=n, fill=sentiment, color=sentiment)) +
geom_area(position="identity", alpha=0.3) +
geom_line(size=1.5) +
facet_wrap(~ factor(source,
labels=c("BBC", "CNN", "Fox News", "NBC")),
nrow=4, scales="free") +
scale_fill_manual(labels=c("Negative", "Positive"),
values=c("coral", "cornflowerblue")) +
scale_color_manual(labels=c("Negative", "Positive"),
values=c("coral", "cornflowerblue")) +
scale_x_datetime(date_labels="%b %Y", date_breaks="2 months") +
labs(x=NULL, y="Count",
title="Health News Tweets") +
theme(plot.title=element_text(face="bold"),
axis.text.x=element_text(size=8),
legend.position="bottom",
legend.title=element_blank())

Sys.setlocale()
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"