Topic Modeling

# Load the textdata package and the AG News dataset
library(textdata)
news <- dataset_ag_news(split="test")
news
## # A tibble: 7,600 × 3
##    class    title                                                    description
##    <chr>    <chr>                                                    <chr>      
##  1 Business Fears for T N pension after talks                        "Unions re…
##  2 Sci/Tech The Race is On: Second Private Team Sets Launch Date fo… "SPACE.com…
##  3 Sci/Tech Ky. Company Wins Grant to Study Peptides (AP)            "AP - A co…
##  4 Sci/Tech Prediction Unit Helps Forecast Wildfires (AP)            "AP - It's…
##  5 Sci/Tech Calif. Aims to Limit Farm-Related Smog (AP)              "AP - Sout…
##  6 Sci/Tech Open Letter Against British Copyright Indoctrination in… "The Briti…
##  7 Sci/Tech Loosing the War on Terrorism                             "\\\\\"Sve…
##  8 Sci/Tech FOAFKey: FOAF, PGP, Key Distribution, and Bloom Filters  "\\\\FOAF/…
##  9 Sci/Tech E-mail scam targets police chief                         "Wiltshire…
## 10 Sci/Tech Card fraud unit nets 36,000 cards                        "In its fi…
## # ℹ 7,590 more rows
table(news$class)
## 
## Business Sci/Tech   Sports    World 
##     1900     1900     1900     1900
# Install and load the tm package
install.packages("tm")
## 
## The downloaded binary packages are in
##  /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(tm)
## Loading required package: NLP
# Convert to a VCorpus object
docs <- VCorpus(VectorSource(news$description))
docs
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 7600
# Inspect the format of the data
lapply(docs, content)[1:3]
## $`1`
## [1] "Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."
## 
## $`2`
## [1] "SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket."
## 
## $`3`
## [1] "AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins."
# Convert all text to lowercase
docs <- tm_map(docs, content_transformer(tolower))

# Define a function that removes a pattern (used here for URLs)
myRemove <- function(x, pattern)
  {return(gsub(pattern, "", x))}

# Remove URL patterns (http, https, ftp) and 'www' patterns
docs <- tm_map(docs, content_transformer(myRemove), "(f|ht)tp\\S+\\s*")
docs <- tm_map(docs, content_transformer(myRemove), "www\\.+\\S+")

# Define the stopword list: standard English stopwords plus custom additions
mystopwords <- c(stopwords("english"),
                 c("first", "second", "one", "two", "three", "four", "another",
                   "last", "least", "just", "will", "week", "weeks", "quot",
                   "ago", "day", "days", "night", "nights", "month", "months",
                   "years", "year", "next", "now", "today", "yesterday",
                   "may", "new", "york", "according", "back", "say", "says",
                   "said", "can", "make", "made", "reuters", "monday", "tuesday",
                   "wednesday", "thursday", "friday", "saturday", "sunday"))

# Remove the stopwords from the documents
docs <- tm_map(docs, removeWords, mystopwords)

# Define a function that replaces a pattern with a space
toSpace <- function(x, pattern)
  {return(gsub(pattern, " ", x))}

# Replace specific symbols in the text with spaces
docs <- tm_map(docs, content_transformer(toSpace), ":")
docs <- tm_map(docs, content_transformer(toSpace), ";")
docs <- tm_map(docs, content_transformer(toSpace), "/")
docs <- tm_map(docs, content_transformer(toSpace), "\\.")
docs <- tm_map(docs, content_transformer(toSpace), "\\\\")

# Remove punctuation and numbers; clean up extra whitespace
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, content_transformer(trimws))

# Check the results of the transformations
lapply(docs, content)[1:3]
## $`1`
## [1] "unions representing workers turner newall disappointed talks stricken parent firm federal mogul"
## 
## $`2`
## [1] "space com toronto canada team rocketeers competing million ansari x prize contest privately funded suborbital space flight officially announced launch date manned rocket"
## 
## $`3`
## [1] "ap company founded chemistry researcher university louisville won grant develop method producing better peptides short chains amino acids building blocks proteins"
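If heavier normalization were wanted, stemming could be added at this point; a minimal sketch, assuming the SnowballC package is available (docs.stemmed is a hypothetical name, and this step is not used in the rest of the analysis):
# Reduce words to their stems (requires SnowballC); optional, not applied below
docs.stemmed <- tm_map(docs, stemDocument)
lapply(docs.stemmed, content)[1:3]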
# Create the document-term matrix (DTM); tag row names with the news class
dtm <- DocumentTermMatrix(docs)
rownames(dtm) <- paste0(rownames(dtm), "-", news$class)
inspect(dtm)
## <<DocumentTermMatrix (documents: 7600, terms: 20319)>>
## Non-/sparse entries: 126245/154298155
## Sparsity           : 100%
## Maximal term length: 29
## Weighting          : term frequency (tf)
## Sample             :
##                Terms
## Docs            company corp government inc million oil president time united
##   1755-Sci/Tech       0    0          0   0       0   0         0    0      0
##   2755-Sci/Tech       0    0          0   0       0   0         0    0      0
##   3110-Sci/Tech       0    0          0   0       0   0         0    0      0
##   5130-Business       1    0          0   0       0   0         0    0      0
##   6-Sci/Tech          0    0          0   0       0   0         0    0      0
##   6307-Sci/Tech       0    0          0   0       0   0         0    0      0
##   6595-Sci/Tech       0    0          0   0       2   0         0    0      0
##   7-Sci/Tech          1    0          0   0       0   0         0    0      0
##   7000-Sci/Tech       0    0          0   0       1   0         0    0      0
##   8-Sci/Tech          0    0          0   0       0   0         0    0      0
##                Terms
## Docs            world
##   1755-Sci/Tech     0
##   2755-Sci/Tech     0
##   3110-Sci/Tech     0
##   5130-Business     0
##   6-Sci/Tech        0
##   6307-Sci/Tech     0
##   6595-Sci/Tech     0
##   7-Sci/Tech        0
##   7000-Sci/Tech     0
##   8-Sci/Tech        0
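The matrix is almost entirely zeros; if its size became a problem, very sparse terms could be dropped with removeSparseTerms(). A sketch, not applied here (dtm.small is a hypothetical name and 0.99 is an illustrative threshold that keeps terms appearing in at least roughly 1% of documents):
# Drop terms that are sparser than the given threshold
dtm.small <- removeSparseTerms(dtm, sparse=0.99)
dtm.small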
library(slam) # col_sums() computes column sums of the sparse DTM
summary(col_sums(dtm))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    2.00    6.42    4.00  448.00
# Sort the five most frequent terms in descending order and print them
col_sums(dtm)[order(col_sums(dtm), decreasing=TRUE)][1:5]
##       inc   company     world president   million 
##       448       425       387       338       319
# Find high-frequency terms that appear at least 250 times
findFreqTerms(dtm, lowfreq = 250)
##  [1] "company"    "corp"       "government" "group"      "inc"       
##  [6] "million"    "oil"        "people"     "president"  "time"      
## [11] "united"     "world"
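For quick exploration, findAssocs() can also list terms whose occurrence is correlated with a given term; a sketch with an arbitrary correlation cutoff of 0.15 (not part of the original analysis):
# Terms correlated with "oil" at a correlation of at least 0.15
findAssocs(dtm, terms="oil", corlimit=0.15)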
# Install and load the topicmodels package
install.packages("topicmodels")
## 
## The downloaded binary packages are in
##  /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(topicmodels)

# Fit an LDA model with 4 topics (k=4) using Gibbs sampling
news.lda <- LDA(dtm, k=4, method="Gibbs",
                control=list(seed=123, burnin=1000, iter=1000, thin=100))
class(news.lda)
## [1] "LDA_Gibbs"
## attr(,"package")
## [1] "topicmodels"
# Check the topics assigned to the first 5 documents
topics(news.lda)[1:5]
## 1-Business 2-Sci/Tech 3-Sci/Tech 4-Sci/Tech 5-Sci/Tech 
##          3          4          2          4          3
# Count the number of documents assigned to each topic
table(topics(news.lda))
## 
##    1    2    3    4 
## 2109 2031 1808 1652
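Since the true AG News categories are known, the topic assignments can optionally be cross-tabulated against them to see how the four topics line up with the four classes; this check is not in the original workflow:
# Cross-tabulate the original category against the assigned topic
table(news$class, topics(news.lda))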
# Print the 10 most important terms for each topic
terms(news.lda, 10)
##       Topic 1      Topic 2   Topic 3         Topic 4     
##  [1,] "president"  "world"   "inc"           "microsoft" 
##  [2,] "united"     "time"    "company"       "announced" 
##  [3,] "people"     "game"    "million"       "software"  
##  [4,] "government" "season"  "corp"          "internet"  
##  [5,] "iraq"       "team"    "oil"           "plans"     
##  [6,] "state"      "win"     "percent"       "security"  
##  [7,] "afp"        "league"  "prices"        "news"      
##  [8,] "minister"   "top"     "stocks"        "service"   
##  [9,] "officials"  "victory" "group"         "technology"
## [10,] "states"     "home"    "international" "computer"
# Examine the structure of the LDA model object
str(news.lda, max.level=2, nchar.max=50)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
##   ..@ seedwords      : NULL
##   ..@ z              : int [1:130442] 2 3 3 1 3 3 1 3 1 3 ...
##   ..@ alpha          : num 12.5
##   ..@ call           : language LDA(x = dtm, k = 4, method = "Gibbs| __truncated__
##   ..@ Dim            : int [1:2] 7600 20319
##   ..@ control        :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
##   ..@ k              : int 4
##   ..@ terms          : chr [1:20319] "aakash" "aapl" "aaron" "aarons" ...
##   ..@ documents      : chr [1:7600] "1-Business" "2-Sci/Tech" "3-Sci/Tech" "4-Sci/Tech" ...
##   ..@ beta           : num [1:4, 1:20319] -10.4 -12.7 -12.8 -12.7 -10.4 ...
##   ..@ gamma          : num [1:7600, 1:4] 0.25 0.201 0.221 0.217 0.261 ...
##   ..@ wordassignments:List of 5
##   .. ..- attr(*, "class")= chr "simple_triplet_matrix"
##   ..@ loglikelihood  : num -1049375
##   ..@ iter           : int 100
##   ..@ logLiks        : num(0) 
##   ..@ n              : int 130442
# Check the per-topic word distribution (beta matrix); show the first 5 terms
news.lda@beta[, 1:5]
##           [,1]       [,2]       [,3]      [,4]      [,5]
## [1,] -10.37187 -10.371868  -9.725241 -10.37187 -12.76976
## [2,] -12.74548 -12.745483 -12.745483 -12.74548 -12.74548
## [3,] -12.76047  -9.326485 -12.760472 -12.76047  -9.71595
## [4,] -12.74575 -10.347850  -9.701223 -12.74575 -12.74575
# Exponentiate the beta values to obtain probabilities
exp(news.lda@beta[, 1:5])
##              [,1]         [,2]         [,3]         [,4]         [,5]
## [1,] 3.130077e-05 3.130077e-05 5.975602e-05 3.130077e-05 2.845525e-06
## [2,] 2.915460e-06 2.915460e-06 2.915460e-06 2.915460e-06 2.915460e-06
## [3,] 2.872086e-06 8.903466e-05 2.872086e-06 2.872086e-06 6.031380e-05
## [4,] 2.914696e-06 3.206165e-05 6.120861e-05 2.914696e-06 2.914696e-06
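As a quick sanity check (not in the original code), each row of exp(beta) is a probability distribution over the vocabulary, so it should sum to 1:
# Each topic's word probabilities should sum to 1
rowSums(exp(news.lda@beta))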
# Use the tidytext package to convert the LDA model to tidy format
library(tidytext)
news.term <- tidy(news.lda, matrix="beta")
news.term
## # A tibble: 81,276 × 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 aakash 0.0000313 
##  2     2 aakash 0.00000292
##  3     3 aakash 0.00000287
##  4     4 aakash 0.00000291
##  5     1 aapl   0.0000313 
##  6     2 aapl   0.00000292
##  7     3 aapl   0.0000890 
##  8     4 aapl   0.0000321 
##  9     1 aaron  0.0000598 
## 10     2 aaron  0.00000292
## # ℹ 81,266 more rows
# Use dplyr to extract the top 5 terms for each topic
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
news.term.top <- news.term %>%
  group_by(topic) %>%
  slice_max(order_by=beta, n=5) %>%
  ungroup() %>%
  arrange(topic, -beta)
news.term.top
## # A tibble: 20 × 3
##    topic term          beta
##    <int> <chr>        <dbl>
##  1     1 president  0.00959
##  2     1 united     0.00780
##  3     1 people     0.00774
##  4     1 government 0.00734
##  5     1 iraq       0.00621
##  6     2 world      0.0112 
##  7     2 time       0.00817
##  8     2 game       0.00726
##  9     2 season     0.00662
## 10     2 team       0.00639
## 11     3 inc        0.0129 
## 12     3 company    0.0121 
## 13     3 million    0.00916
## 14     3 corp       0.00839
## 15     3 oil        0.00790
## 16     4 microsoft  0.00685
## 17     4 announced  0.00665
## 18     4 software   0.00665
## 19     4 internet   0.00601
## 20     4 plans      0.00542
# Plot the top five highest-probability words for each topic
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
ggplot(news.term.top,
       aes(reorder_within(x=term, by=beta, within=topic),
           beta, fill=factor(topic))) +
  geom_col(show.legend=FALSE) +
  facet_wrap(~ paste("Topic", topic), scales="free") +
  scale_x_reordered() +
  coord_flip() +
  theme_minimal() +
  labs(x=NULL, y="Word-Topic Probability (Beta)",
       title="News") +
  theme(plot.title=element_text(face="bold"),
        strip.text=element_text(face="bold"))

# Check the document-topic probability matrix (gamma) for the first 5 documents
news.lda@gamma[1:5,]
##           [,1]      [,2]      [,3]      [,4]
## [1,] 0.2500000 0.2177419 0.3145161 0.2177419
## [2,] 0.2013889 0.2152778 0.2152778 0.3680556
## [3,] 0.2214286 0.3357143 0.2214286 0.2214286
## [4,] 0.2171053 0.2302632 0.2302632 0.3223684
## [5,] 0.2611940 0.2313433 0.2910448 0.2164179
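Likewise, each document's topic probabilities in gamma should sum to 1; an optional check:
# Each document's topic probabilities should sum to 1
rowSums(news.lda@gamma[1:5, ])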
# Convert the model's document-topic distribution to tidy format
news.doc <- tidy(news.lda, matrix="gamma")
news.doc
## # A tibble: 30,400 × 3
##    document    topic gamma
##    <chr>       <int> <dbl>
##  1 1-Business      1 0.25 
##  2 2-Sci/Tech      1 0.201
##  3 3-Sci/Tech      1 0.221
##  4 4-Sci/Tech      1 0.217
##  5 5-Sci/Tech      1 0.261
##  6 6-Sci/Tech      1 0.194
##  7 7-Sci/Tech      1 0.173
##  8 8-Sci/Tech      1 0.150
##  9 9-Sci/Tech      1 0.233
## 10 10-Sci/Tech     1 0.254
## # ℹ 30,390 more rows
# Load tidyr and split the document column into id and category
library(tidyr)
news.doc <- news.doc %>%
  separate(document, c("id", "category"), sep="-", convert=TRUE)
news.doc
## # A tibble: 30,400 × 4
##       id category topic gamma
##    <int> <chr>    <int> <dbl>
##  1     1 Business     1 0.25 
##  2     2 Sci/Tech     1 0.201
##  3     3 Sci/Tech     1 0.221
##  4     4 Sci/Tech     1 0.217
##  5     5 Sci/Tech     1 0.261
##  6     6 Sci/Tech     1 0.194
##  7     7 Sci/Tech     1 0.173
##  8     8 Sci/Tech     1 0.150
##  9     9 Sci/Tech     1 0.233
## 10    10 Sci/Tech     1 0.254
## # ℹ 30,390 more rows
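To summarize numerically what the boxplots below show, the mean gamma per category and topic could be computed; an optional sketch (mean_gamma is a hypothetical column name):
# Average document-topic probability for each category-topic pair
news.doc %>%
  group_by(category, topic) %>%
  summarise(mean_gamma=mean(gamma), .groups="drop") %>%
  arrange(category, desc(mean_gamma))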
# Plot the distribution of topic-assignment probabilities (gamma) by category
library(ggplot2)
ggplot(news.doc, aes(factor(topic), gamma, fill=category)) +
  geom_boxplot(color="gray50", show.legend=FALSE,
               outlier.shape=21, outlier.color="black", outlier.fill="gray") +
  facet_wrap(~ factor(category), scales="free") + # one panel per category
  scale_fill_brewer(palette="Dark2") +
  theme_bw() +
  labs(x="Topic", y="Document-Topic Probability (Gamma)",
       title="News") +
  theme(strip.background=element_rect(fill="aliceblue"),
        strip.text=element_text(face="bold", color="steelblue"),
        plot.title=element_text(face="bold"),
        panel.grid.major.x=element_blank(),
        panel.grid.minor.x=element_blank())

Sentiment Analysis

# Load the tidytext package and inspect the sentiments dataset
library(tidytext)
sentiments
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows
install.packages("textdata")
## 
## The downloaded binary packages are in
##  /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(textdata)

# Load each sentiment lexicon (afinn, nrc, loughran)
get_sentiments(lexicon="afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
get_sentiments(lexicon="nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows
get_sentiments(lexicon="loughran")
## # A tibble: 4,150 × 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # ℹ 4,140 more rows
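The lexicons differ in structure: afinn assigns numeric scores, while nrc and loughran assign categorical labels. As a quick optional check, the label categories in the nrc lexicon can be tallied:
# Number of words carrying each nrc sentiment label
library(dplyr)
get_sentiments(lexicon="nrc") %>%
  count(sentiment, sort=TRUE)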
# Load the required packages
library(dplyr)
library(tibble)
library(purrr)
library(readr)
install.packages("lubridate")
## 
## The downloaded binary packages are in
##  /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Download the tweet data and save it to a local temporary file
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00438/Health-News-Tweets.zip"
local.copy <- tempfile()
download.file(url, destfile=local.copy, mode="wb")
Sys.setlocale("LC_TIME", "en_US.UTF-8") # set the time display locale to English
## [1] "en_US.UTF-8"
# Read the tweet files, combine them, and convert to a tibble
health.twitter <-
  map(unzip(zipfile=local.copy,
            files=c("Health-Tweets/bbchealth.txt",
                    "Health-Tweets/cnnhealth.txt",
                    "Health-Tweets/foxnewshealth.txt",
                    "Health-Tweets/NBChealth.txt")),
      read_delim, delim="|", quote="",
      col_types=list(col_character(), col_character(), col_character()),
      col_names=c("id", "datetime", "tweet")) %>%
  map2(c("bbc", "cnn", "foxnews", "nbc"), ~cbind(.x, source=.y)) %>%
  reduce(bind_rows) %>%
  as_tibble() %>%
  mutate(datetime=ymd_hms(strptime(datetime, "%a %b %d %H:%M:%S +0000 %Y")))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
unlink(local.copy) # delete the downloaded temporary file
Sys.setlocale() # restore the original locale
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
# Count the number of tweets by source
health.twitter %>%
  count(source)
## # A tibble: 4 × 2
##   source      n
##   <chr>   <int>
## 1 bbc      3929
## 2 cnn      4061
## 3 foxnews  2000
## 4 nbc      4215
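It can also be useful to check the time span each source covers; an optional sketch (from and to are hypothetical column names, and NAs produced by the parsing warning above are dropped):
# Earliest and latest tweet per source
health.twitter %>%
  group_by(source) %>%
  summarise(from=min(datetime, na.rm=TRUE), to=max(datetime, na.rm=TRUE))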
# Perform text preprocessing
library(tidytext)
library(stringr)
health.words <- health.twitter %>%
  select(-id) %>%
  mutate(tweet=str_replace_all(tweet, pattern="(f|ht)tp\\S+\\s*", replacement="")) %>%
  mutate(tweet=str_replace_all(tweet, pattern="\\d+", replacement="")) %>%
  mutate(tweet=str_replace_all(tweet, pattern="\\bRT", replacement="")) %>%
  mutate(tweet=str_replace_all(tweet, pattern="@\\S+", replacement="")) %>%
  mutate(tweet=str_replace_all(tweet, pattern="&amp", replacement="")) %>%
  unnest_tokens(word, tweet) # tokenize tweets into words

health.words
## # A tibble: 124,559 × 3
##    datetime            source word    
##    <dttm>              <chr>  <chr>   
##  1 2015-04-09 01:31:50 bbc    breast  
##  2 2015-04-09 01:31:50 bbc    cancer  
##  3 2015-04-09 01:31:50 bbc    risk    
##  4 2015-04-09 01:31:50 bbc    test    
##  5 2015-04-09 01:31:50 bbc    devised 
##  6 2015-04-08 23:30:18 bbc    gp      
##  7 2015-04-08 23:30:18 bbc    workload
##  8 2015-04-08 23:30:18 bbc    harming 
##  9 2015-04-08 23:30:18 bbc    care    
## 10 2015-04-08 23:30:18 bbc    bma     
## # ℹ 124,549 more rows
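Before joining with a sentiment lexicon, the most frequent words can be inspected; an optional sketch that drops standard English stopwords using the stop_words table bundled with tidytext:
# Most frequent words after removing common English stopwords
health.words %>%
  anti_join(stop_words, by="word") %>%
  count(word, sort=TRUE)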
# Sentiment analysis: extract the top 10 words for each sentiment
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  count(word, sentiment, sort=TRUE) %>%
  group_by(sentiment) %>%
  slice_max(order_by=n, n=10) %>%
  ungroup() %>%
  mutate(nsign=ifelse(sentiment=="negative", -n, n))

health.sentiment  
## # A tibble: 20 × 4
##    word     sentiment     n nsign
##    <chr>    <chr>     <int> <int>
##  1 cancer   negative    701  -701
##  2 risk     negative    317  -317
##  3 death    negative    193  -193
##  4 outbreak negative    188  -188
##  5 virus    negative    147  -147
##  6 fat      negative    103  -103
##  7 loss     negative    103  -103
##  8 deadly   negative    102  -102
##  9 warning  negative     96   -96
## 10 pain     negative     92   -92
## 11 work     positive    210   210
## 12 healthy  positive    158   158
## 13 patient  positive    155   155
## 14 good     positive    139   139
## 15 like     positive    139   139
## 16 better   positive    135   135
## 17 support  positive    102   102
## 18 free     positive     95    95
## 19 love     positive     90    90
## 20 best     positive     87    87
# Create the plot for [Figure 7-9]
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following object is masked from 'package:purrr':
## 
##     discard
ggplot(health.sentiment,
       aes(x=reorder(word, nsign), y=nsign,
           fill=factor(sentiment, levels=c("positive", "negative")))) +
  geom_col(color="lightslategray", width=0.8) +
  geom_text(aes(label=n), size=3, color="black",
            hjust=ifelse(health.sentiment$nsign < 0, 1.1, -0.1)) +
  scale_fill_manual(values=c("cornflowerblue", "tomato")) +
  scale_y_continuous(breaks=pretty(health.sentiment$nsign),
                     labels=abs(pretty(health.sentiment$nsign))) +
  coord_flip() +
    labs(x=NULL, y="Count",
         title="Health News Tweets") +
    theme_minimal() +
    theme(legend.position="bottom",
          legend.title=element_blank(),
          plot.title=element_text(face="bold"),
          axis.text=element_text(face="bold", size=10))
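
An alternative to counting Bing labels is to score words with the numeric AFINN lexicon loaded earlier; a sketch of a per-source mean score (mean_value and words are hypothetical column names; this is not used in the figures that follow):
# Average AFINN score and number of scored words per source
health.words %>%
  inner_join(get_sentiments("afinn"), by="word") %>%
  group_by(source) %>%
  summarise(mean_value=mean(value), words=n())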

# Repeat the analysis after excluding medical terms
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  count(word, sentiment, sort=TRUE) %>%
  group_by(sentiment) %>%
  slice_max(order_by=n, n=10) %>%
  ungroup() %>%
  mutate(nsign=ifelse(sentiment=="negative", -n, n))

# Create the plot for [Figure 7-10]
library(ggplot2)
ggplot(health.sentiment,
       aes(x=reorder(word, n), y=n,
           fill=factor(sentiment, levels=c("positive", "negative")))) +
  geom_col(color="lightslategray", width=0.6, show.legend = FALSE) +
  geom_text(aes(label=n), size=3, color="black", hjust=1.2) +
  scale_fill_manual(values=c("lightsteelblue1", "lightsalmon1")) +
  facet_wrap(~ factor(sentiment, levels=c("positive", "negative")),
             ncol=2, scales="free") +
  coord_flip() +
  labs(x=NULL, y="Count",
       title="Health News Tweets") +
  theme_light() +
  theme(plot.title=element_text(face="bold"),
        axis.line=element_line(color="gray"),
        axis.text=element_text(face="bold", size=10))

# Create the word cloud for [Figure 7-11]
install.packages("wordcloud")
## 
## The downloaded binary packages are in
##  /var/folders/wn/dnhm0_tn4qs9lnqb0r3mjy380000gn/T//RtmpQ5EGTL/downloaded_packages
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
set.seed(123)
health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  count(word, sentiment, sort=TRUE) %>%
  ungroup() %>%
  acast(formula=word ~ sentiment, value.var="n", fill=0) %>%
  comparison.cloud(colors=c("tomato", "cornflowerblue"), title.size=2,
                   title.colors=c("red", "blue"), title.bg.colors=c("wheat"),
                   scale=c(4, 0.3), max.words=200, match.colors=TRUE)

# [Figure 7-12] Health news tweets (top 10 positive/negative words by outlet)

health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  count(word, sentiment, source, sort=TRUE) %>%
  ungroup() %>%
  group_by(source, sentiment) %>%
  slice_max(order_by = n, n=10) %>%
  ungroup()
health.sentiment
## # A tibble: 86 × 4
##    word     sentiment source     n
##    <chr>    <chr>     <chr>  <int>
##  1 risk     negative  bbc       89
##  2 death    negative  bbc       72
##  3 warning  negative  bbc       54
##  4 crisis   negative  bbc       35
##  5 abuse    negative  bbc       29
##  6 outbreak negative  bbc       26
##  7 threat   negative  bbc       26
##  8 fat      negative  bbc       22
##  9 poor     negative  bbc       22
## 10 strike   negative  bbc       22
## # ℹ 76 more rows
# Draw the plot
library(ggplot2)
ggplot(health.sentiment,
       aes(reorder_within(x=word, by=n, within=source), n, fill=source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ factor(source,
                      labels=c("BBC", "CNN", "Fox News", "NBC")) + sentiment,
             ncol=2, scales="free") +
  scale_x_reordered() +
  coord_flip() +
  labs(x=NULL, y="Count",
       title="Health News Tweets") +
  theme_light() +
  theme(strip.background=element_blank(),
        strip.text=element_text(color="goldenrod4", face="bold"),
        plot.title=element_text(face="bold"),
        axis.line=element_line(color="gray"),
        axis.text=element_text(face="bold", size=10),
        panel.grid.minor=element_blank())

# Analyze the trend over time
library(lubridate)  
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  mutate(time=floor_date(x=datetime, unit="month")) %>%
  count(sentiment, time) %>%
  group_by(sentiment) %>%
  slice(2:(n()-1)) %>%
  ungroup()         
           
health.sentiment    
## # A tibble: 62 × 3
##    sentiment time                    n
##    <chr>     <dttm>              <int>
##  1 negative  2012-09-01 00:00:00    86
##  2 negative  2012-10-01 00:00:00    94
##  3 negative  2012-11-01 00:00:00    54
##  4 negative  2012-12-01 00:00:00    65
##  5 negative  2013-01-01 00:00:00   135
##  6 negative  2013-02-01 00:00:00   188
##  7 negative  2013-03-01 00:00:00   183
##  8 negative  2013-04-01 00:00:00   197
##  9 negative  2013-05-01 00:00:00   212
## 10 negative  2013-06-01 00:00:00   176
## # ℹ 52 more rows
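From these monthly counts, a net sentiment series (positive minus negative) could also be derived by widening the table; an optional sketch using tidyr's pivot_wider (net is a hypothetical column name):
# Spread sentiment into columns and compute the monthly net sentiment
library(tidyr)
health.sentiment %>%
  pivot_wider(names_from=sentiment, values_from=n, values_fill=0) %>%
  mutate(net=positive - negative)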
# Create the plot for [Figure 7-13]
           
Sys.setlocale("LC_TIME", "en_US.UTF-8")
## [1] "en_US.UTF-8"
library(ggplot2)
ggplot(health.sentiment, aes(x=time, y=n, fill=sentiment, color=sentiment)) +
  geom_area(position="identity", alpha=0.3) +
  geom_line(size=1.5) +
  scale_fill_manual(labels=c("Negative", "Positive"),
                    values=c("orangered", "deepskyblue2")) +
  scale_color_manual(labels=c("Negative", "Positive"),
                     values=c("orangered", "deepskyblue2")) +
  scale_x_datetime(date_labels="%b %Y", date_breaks="6 months") +
  labs(x=NULL, y="Count",
       title="Health News Tweets") +
  theme_minimal() +
  theme(plot.title=element_text(face="bold"),
        axis.text=element_text(face="bold"),
        legend.position="bottom",
        legend.title=element_blank())
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Sys.setlocale()
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
# [Figure 7-14] Plot the trend of positive/negative word counts over time by outlet

Sys.setlocale("LC_TIME", "en_US.UTF-8")
## [1] "en_US.UTF-8"
library(ggplot2)
health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  mutate(time=floor_date(datetime, unit="month")) %>%
  count(source, sentiment, time) %>%
  group_by(source, sentiment) %>%
  slice(2:(n()-1)) %>%
  ungroup() %>%
  ggplot(aes(x=time, y=n, fill=sentiment, color=sentiment)) +
  geom_area(position="identity", alpha=0.3) +
  geom_line(size=1.5) +
  facet_wrap(~ factor(source,
                      labels=c("BBC", "CNN", "Fox News", "NBC")),
             nrow=4, scales="free") +
  scale_fill_manual(labels=c("Negative", "Positive"),
                    values=c("coral", "cornflowerblue")) +
  scale_color_manual(labels=c("Negative", "Positive"),
                     values=c("coral", "cornflowerblue")) +
  scale_x_datetime(date_labels="%b %Y", date_breaks="2 months") +
  labs(x=NULL, y="Count",
       title="Health News Tweets") +
  theme(plot.title=element_text(face="bold"),
        axis.text.x=element_text(size=8),
        legend.position="bottom",
        legend.title=element_blank())

Sys.setlocale()
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"