Title

This is an R HTML document. When you click the Knit HTML button, a web page is generated that includes both the written content and the output of any R code chunks embedded in the document. You can embed an R code chunk like this:

# Attach tidytext and display the sentiment lexicon that ships with it
library("tidytext")
tidytext::sentiments

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

# Install textdata only when it is missing. The CRAN mirror is passed
# explicitly because a non-interactive session (such as knitting) has no
# default mirror, and a bare install.packages() call then stops with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("textdata", quietly = TRUE)) {
  install.packages("textdata", repos = "https://cloud.r-project.org")
}

library(textdata)

# Load each sentiment lexicon in turn (afinn, nrc, loughran).
# afinn pairs every word with a numeric value.
get_sentiments("afinn")

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

# nrc lexicon: one row per word/sentiment pair
get_sentiments("nrc")

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

# loughran lexicon: one row per word/sentiment pair
get_sentiments("loughran")

## # A tibble: 4,150 × 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # ℹ 4,140 more rows

# 필요 패키지 로드
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tibble)
library(purrr)
library(readr)
# Install lubridate only when it is missing. The CRAN mirror is passed
# explicitly because a non-interactive session (such as knitting) has no
# default mirror, and a bare install.packages() call then stops with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate", repos = "https://cloud.r-project.org")
}

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Download the health-news tweet archive to a temporary file
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00438/Health-News-Tweets.zip"
local.copy <- tempfile()
download.file(url, destfile=local.copy, mode="wb")

# Parsing "%a" / "%b" (day and month names) depends on LC_TIME, so remember
# the current setting before switching to an English locale.
old.lc.time <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "en_US.UTF-8")

## [1] "en_US.UTF-8"

# Extract into a temporary directory rather than the working directory so
# the unpacked files can be removed once they have been read.
extract.dir <- tempfile("health-tweets-")

# Read the four pipe-delimited tweet files, tag each one with its source,
# and stack them into a single tibble. The timestamps carry a literal
# "+0000" offset, so they are parsed directly as UTC.
health.twitter <-
  unzip(zipfile=local.copy,
        files=c("Health-Tweets/bbchealth.txt",
                "Health-Tweets/cnnhealth.txt",
                "Health-Tweets/foxnewshealth.txt",
                "Health-Tweets/NBChealth.txt"),
        exdir=extract.dir) %>%
  map2(c("bbc", "cnn", "foxnews", "nbc"),
       ~ mutate(read_delim(.x, delim="|", quote="",
                           col_types=list(col_character(), col_character(),
                                          col_character()),
                           col_names=c("id", "datetime", "tweet")),
                source=.y)) %>%
  bind_rows() %>%
  as_tibble() %>%
  mutate(datetime=as.POSIXct(datetime,
                             format="%a %b %d %H:%M:%S +0000 %Y",
                             tz="UTC"))

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Remove the downloaded archive and the extracted files
unlink(local.copy)
unlink(extract.dir, recursive=TRUE)
# Put LC_TIME back to the value it had before parsing
invisible(Sys.setlocale("LC_TIME", old.lc.time))

# Number of tweets contributed by each source
count(health.twitter, source)

## # A tibble: 4 × 2
##   source      n
##   <chr>   <int>
## 1 bbc      3929
## 2 cnn      4061
## 3 foxnews  2000
## 4 nbc      4215

# Clean the tweet text, then split it into one word per row
library(tidytext)
library(stringr)
health.words <- health.twitter %>%
  select(-id) %>%
  mutate(
    tweet=str_replace_all(tweet, "(f|ht)tp\\S+\\s*", ""),  # http/ftp links
    tweet=str_replace_all(tweet, "\\d+", ""),               # runs of digits
    tweet=str_replace_all(tweet, "\\bRT", ""),              # "RT" at a word start
    tweet=str_replace_all(tweet, "@\\S+", ""),              # @mentions
    tweet=str_replace_all(tweet, "&amp", "")                # literal "&amp"
  ) %>%
  unnest_tokens(output=word, input=tweet) # tokenize into words

health.words

## # A tibble: 124,559 × 3
##    datetime            source word    
##    <dttm>              <chr>  <chr>   
##  1 2015-04-09 01:31:50 bbc    breast  
##  2 2015-04-09 01:31:50 bbc    cancer  
##  3 2015-04-09 01:31:50 bbc    risk    
##  4 2015-04-09 01:31:50 bbc    test    
##  5 2015-04-09 01:31:50 bbc    devised 
##  6 2015-04-08 23:30:18 bbc    gp      
##  7 2015-04-08 23:30:18 bbc    workload
##  8 2015-04-08 23:30:18 bbc    harming 
##  9 2015-04-08 23:30:18 bbc    care    
## 10 2015-04-08 23:30:18 bbc    bma     
## # ℹ 124,549 more rows

# Match words against the bing lexicon and keep the 10 most frequent words
# of each sentiment. nsign is the count with a minus sign for negative
# words, so the two sentiments can be drawn in opposite directions.
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  count(word, sentiment, sort=TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n=10) %>%
  ungroup() %>%
  mutate(nsign=if_else(sentiment == "negative", -n, n))

health.sentiment

## # A tibble: 20 × 4
##    word     sentiment     n nsign
##    <chr>    <chr>     <int> <int>
##  1 cancer   negative    701  -701
##  2 risk     negative    317  -317
##  3 death    negative    193  -193
##  4 outbreak negative    188  -188
##  5 virus    negative    147  -147
##  6 fat      negative    103  -103
##  7 loss     negative    103  -103
##  8 deadly   negative    102  -102
##  9 warning  negative     96   -96
## 10 pain     negative     92   -92
## 11 work     positive    210   210
## 12 healthy  positive    158   158
## 13 patient  positive    155   155
## 14 good     positive    139   139
## 15 like     positive    139   139
## 16 better   positive    135   135
## 17 support  positive    102   102
## 18 free     positive     95    95
## 19 love     positive     90    90
## 20 best     positive     87    87

# [그림 7-9] 그래프 생성  
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

## The following object is masked from 'package:purrr':
## 
##     discard

ggplot(health.sentiment,
       aes(x=reorder(word, nsign), y=nsign,
           fill=factor(sentiment, levels=c("positive", "negative")))) +
  geom_col(width=0.8, color="lightslategray") +
  # Count labels sit just past the end of each bar: left of negative bars,
  # right of positive ones
  geom_text(aes(label=n, hjust=ifelse(nsign < 0, 1.1, -0.1)),
            size=3, color="black") +
  scale_fill_manual(values=c("cornflowerblue", "tomato")) +
  # Tick labels show magnitudes, so the sign is dropped
  scale_y_continuous(breaks=pretty(health.sentiment$nsign),
                     labels=abs(pretty(health.sentiment$nsign))) +
  coord_flip() +
  labs(title="Health News Tweets",
       x=NULL, y="Count") +
  theme_minimal() +
  theme(plot.title=element_text(face="bold"),
        axis.text=element_text(face="bold", size=10),
        legend.title=element_blank(),
        legend.position="bottom")

# Repeat the analysis with the medical terms excluded
health.sentiment <- health.words %>%
  filter(!word %in% c("patient", "cancer", "virus")) %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  count(word, sentiment, sort=TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n=10) %>%
  ungroup() %>%
  mutate(nsign=if_else(sentiment == "negative", -n, n))




# [Figure 7-10] Most frequent words, one panel per sentiment
library(ggplot2)
ggplot(health.sentiment,
       aes(x=reorder(word, n), y=n,
           fill=factor(sentiment, levels=c("positive", "negative")))) +
  geom_col(width=0.6, color="lightslategray", show.legend=FALSE) +
  # hjust > 1 places the count label inside the end of its bar
  geom_text(aes(label=n), hjust=1.2, size=3, color="black") +
  facet_wrap(~ factor(sentiment, levels=c("positive", "negative")),
             ncol=2, scales="free") +
  scale_fill_manual(values=c("lightsteelblue1", "lightsalmon1")) +
  coord_flip() +
  labs(title="Health News Tweets",
       x=NULL, y="Count") +
  theme_light() +
  theme(axis.text=element_text(face="bold", size=10),
        axis.line=element_line(color="gray"),
        plot.title=element_text(face="bold"))

# [Figure 7-11] Word cloud
# The code below loads and uses the wordcloud package, so that is the package
# to install (the original call named "wordcloud2", which is never loaded).
# Install only when missing, with an explicit CRAN mirror, because a
# non-interactive session has no default mirror and install.packages() would
# otherwise stop with "trying to use CRAN without setting a mirror".
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud", repos = "https://cloud.r-project.org")
}

library(wordcloud)

## Loading required package: RColorBrewer

library(reshape2)
# Fix the random seed before drawing so the result is repeatable
set.seed(123)
health.words %>%
  filter(!word %in% c("patient", "cancer", "virus")) %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  count(word, sentiment, sort=TRUE) %>%
  # Reshape to a word x sentiment count matrix; absent pairs become 0
  acast(word ~ sentiment, value.var="n", fill=0) %>%
  comparison.cloud(scale=c(4, 0.3), max.words=200,
                   colors=c("tomato", "cornflowerblue"), match.colors=TRUE,
                   title.size=2, title.colors=c("red", "blue"),
                   title.bg.colors=c("wheat"))

# [Figure 7-12] Health-news tweets: top 10 positive/negative words per source

# Count lexicon words per source and sentiment, then keep the 10 most
# frequent in each source/sentiment group (ties are kept by slice_max)
health.sentiment <- health.words %>%
  filter(!word %in% c("patient", "cancer", "virus")) %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  count(word, sentiment, source, sort=TRUE) %>%
  group_by(source, sentiment) %>%
  slice_max(n, n=10) %>%
  ungroup()
health.sentiment

## # A tibble: 86 × 4
##    word     sentiment source     n
##    <chr>    <chr>     <chr>  <int>
##  1 risk     negative  bbc       89
##  2 death    negative  bbc       72
##  3 warning  negative  bbc       54
##  4 crisis   negative  bbc       35
##  5 abuse    negative  bbc       29
##  6 outbreak negative  bbc       26
##  7 threat   negative  bbc       26
##  8 fat      negative  bbc       22
##  9 poor     negative  bbc       22
## 10 strike   negative  bbc       22
## # ℹ 76 more rows

# Draw the chart: one panel per source/sentiment combination
library(ggplot2)
ggplot(health.sentiment,
       aes(x=reorder_within(word, n, source), y=n, fill=source)) +
  geom_col(show.legend=FALSE) +
  facet_wrap(~ factor(source,
                      labels=c("BBC", "CNN", "Fox News", "NBC")) + sentiment,
             ncol=2, scales="free") +
  # Strip the suffix that reorder_within() appends to the axis labels
  scale_x_reordered() +
  coord_flip() +
  labs(title="Health News Tweets",
       x=NULL, y="Count") +
  theme_light() +
  theme(plot.title=element_text(face="bold"),
        strip.text=element_text(color="goldenrod4", face="bold"),
        strip.background=element_blank(),
        axis.text=element_text(face="bold", size=10),
        axis.line=element_line(color="gray"),
        panel.grid.minor=element_blank())

# Trend over time: monthly counts of positive and negative words
library(lubridate)
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  mutate(time=floor_date(x=datetime, unit="month")) %>%
  count(sentiment, time) %>%
  group_by(sentiment) %>%
  # Drop the first and last month of each series. A negative index is used
  # instead of 2:(n()-1) because that range runs backwards when a group has
  # fewer than three rows and would then keep rows instead of dropping them.
  slice(-c(1, n())) %>%
  ungroup()

health.sentiment

## # A tibble: 62 × 3
##    sentiment time                    n
##    <chr>     <dttm>              <int>
##  1 negative  2012-09-01 00:00:00    86
##  2 negative  2012-10-01 00:00:00    94
##  3 negative  2012-11-01 00:00:00    54
##  4 negative  2012-12-01 00:00:00    65
##  5 negative  2013-01-01 00:00:00   135
##  6 negative  2013-02-01 00:00:00   188
##  7 negative  2013-03-01 00:00:00   183
##  8 negative  2013-04-01 00:00:00   197
##  9 negative  2013-05-01 00:00:00   212
## 10 negative  2013-06-01 00:00:00   176
## # ℹ 52 more rows

# [Figure 7-13] Monthly trend of positive and negative word counts

# The "%b" month labels on the x axis follow LC_TIME, so remember the current
# setting and switch to an English locale while the plot is drawn.
old.lc.time <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "en_US.UTF-8")

## [1] "en_US.UTF-8"

library(ggplot2)
ggplot(health.sentiment, aes(x=time, y=n, fill=sentiment, color=sentiment)) +
  geom_area(position="identity", alpha=0.3) +
  # `linewidth` replaces the `size` aesthetic for lines, which has been
  # deprecated since ggplot2 3.4.0
  geom_line(linewidth=1.5) +
  scale_fill_manual(labels=c("Negative", "Positive"),
                    values=c("orangered", "deepskyblue2")) +
  scale_color_manual(labels=c("Negative", "Positive"),
                     values=c("orangered", "deepskyblue2")) +
  scale_x_datetime(date_labels="%b %Y", date_breaks="6 months") +
  labs(x=NULL, y="Count",
       title="Health News Tweets") +
  theme_minimal() +
  theme(plot.title=element_text(face="bold"),
        axis.text=element_text(face="bold"),
        legend.position="bottom",
        legend.title=element_blank())

# Put LC_TIME back to the value it had before plotting
invisible(Sys.setlocale("LC_TIME", old.lc.time))

# [Figure 7-14] Monthly trend of positive/negative word counts per source

# The "%b" month labels on the x axis follow LC_TIME, so remember the current
# setting and switch to an English locale while the plot is drawn.
old.lc.time <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "en_US.UTF-8")

## [1] "en_US.UTF-8"

library(ggplot2)
health.words %>%
  inner_join(get_sentiments("bing"), by="word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  mutate(time=floor_date(datetime, unit="month")) %>%
  count(source, sentiment, time) %>%
  group_by(source, sentiment) %>%
  # Drop the first and last month of each series. A negative index is used
  # instead of 2:(n()-1) because that range runs backwards when a group has
  # fewer than three rows and would then keep rows instead of dropping them.
  slice(-c(1, n())) %>%
  ungroup() %>%
  ggplot(aes(x=time, y=n, fill=sentiment, color=sentiment)) +
  geom_area(position="identity", alpha=0.3) +
  # `linewidth` replaces the `size` aesthetic for lines, which has been
  # deprecated since ggplot2 3.4.0
  geom_line(linewidth=1.5) +
  facet_wrap(~ factor(source,
                      labels=c("BBC", "CNN", "Fox News", "NBC")),
             nrow=4, scales="free") +
  scale_fill_manual(labels=c("Negative", "Positive"),
                    values=c("coral", "cornflowerblue")) +
  scale_color_manual(labels=c("Negative", "Positive"),
                     values=c("coral", "cornflowerblue")) +
  scale_x_datetime(date_labels="%b %Y", date_breaks="2 months") +
  labs(x=NULL, y="Count",
       title="Health News Tweets") +
  theme(plot.title=element_text(face="bold"),
        axis.text.x=element_text(size=8),
        legend.position="bottom",
        legend.title=element_blank())

# Put LC_TIME back to the value it had before plotting
invisible(Sys.setlocale("LC_TIME", old.lc.time))