This is an R HTML document. When you click the Knit HTML button, a web page will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Attach tidytext and print the `sentiments` data set to inspect it.
library(tidytext)
sentiments
## # A tibble: 6,786 × 2 ## word sentiment ## <chr> <chr> ## 1 2-faces negative ## 2 abnormal negative ## 3 abolish negative ## 4 abominable negative ## 5 abominably negative ## 6 abominate negative ## 7 abomination negative ## 8 abort negative ## 9 aborted negative ## 10 aborts negative ## # ℹ 6,776 more rows
# Install textdata only when it is missing. An explicit CRAN mirror is passed
# because a non-interactive knit has no default repository configured, which
# makes a bare install.packages() call fail with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("textdata", quietly = TRUE)) {
  install.packages("textdata", repos = "https://cloud.r-project.org")
}
# Load each sentiment lexicon (afinn, nrc, loughran), starting with AFINN.
library(textdata)
get_sentiments("afinn")
## # A tibble: 2,477 × 2 ## word value ## <chr> <dbl> ## 1 abandon -2 ## 2 abandoned -2 ## 3 abandons -2 ## 4 abducted -2 ## 5 abduction -2 ## 6 abductions -2 ## 7 abhor -3 ## 8 abhorred -3 ## 9 abhorrent -3 ## 10 abhors -3 ## # ℹ 2,467 more rows
# Print the NRC lexicon returned by get_sentiments().
get_sentiments(lexicon="nrc")
## # A tibble: 13,872 × 2 ## word sentiment ## <chr> <chr> ## 1 abacus trust ## 2 abandon fear ## 3 abandon negative ## 4 abandon sadness ## 5 abandoned anger ## 6 abandoned fear ## 7 abandoned negative ## 8 abandoned sadness ## 9 abandonment anger ## 10 abandonment fear ## # ℹ 13,862 more rows
# Print the Loughran lexicon returned by get_sentiments().
get_sentiments(lexicon="loughran")
## # A tibble: 4,150 × 2 ## word sentiment ## <chr> <chr> ## 1 abandon negative ## 2 abandoned negative ## 3 abandoning negative ## 4 abandonment negative ## 5 abandonments negative ## 6 abandons negative ## 7 abdicated negative ## 8 abdicates negative ## 9 abdicating negative ## 10 abdication negative ## # ℹ 4,140 more rows
# Attach the packages required by the analysis.
library(dplyr)
library(tibble)
library(purrr)
library(readr)

# Install lubridate only when it is missing. The explicit `repos` avoids the
# "trying to use CRAN without setting a mirror" error raised when the document
# is knitted non-interactively.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate", repos = "https://cloud.r-project.org")
}
library(lubridate)
# Download the tweet archive and keep it in a temporary local file.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00438/Health-News-Tweets.zip"
local.copy <- tempfile()
download.file(url = url, destfile = local.copy, mode = "wb")

# Set the time locale; the timestamp format used when the tweets are parsed
# contains %a and %b (abbreviated day and month names).
Sys.setlocale(category = "LC_TIME", locale = "en_US.UTF-8")
## [1] "en_US.UTF-8"
# Read the four outlet files from the archive and combine them into one tibble
# with columns id, datetime, tweet and source.
#
# The archive is extracted into a throwaway directory rather than the working
# directory, so no Health-Tweets/ folder is left behind after knitting.
tweet.files <- c(
  bbc     = "Health-Tweets/bbchealth.txt",
  cnn     = "Health-Tweets/cnnhealth.txt",
  foxnews = "Health-Tweets/foxnewshealth.txt",
  nbc     = "Health-Tweets/NBChealth.txt"
)
extract.dir <- tempfile(pattern = "health-tweets-")
invisible(unzip(zipfile = local.copy, files = tweet.files, exdir = extract.dir))

health.twitter <- map2(
  # Paths are built explicitly so each file is paired with its own label
  # regardless of the order in which unzip() reports the extracted files.
  file.path(extract.dir, tweet.files),
  names(tweet.files),
  function(path, outlet) {
    read_delim(
      path,
      delim = "|",
      quote = "",
      col_types = list(col_character(), col_character(), col_character()),
      col_names = c("id", "datetime", "tweet"),
      lazy = FALSE  # read eagerly: the extracted files are deleted below
    ) %>%
      mutate(source = outlet)
  }
) %>%
  bind_rows() %>%
  as_tibble() %>%
  # Parse directly as UTC (the timestamps carry a literal +0000 offset) instead
  # of going through a local-time POSIXlt and a character round-trip.
  # %a and %b depend on the LC_TIME locale being English.
  mutate(
    datetime = as.POSIXct(
      strptime(datetime, "%a %b %d %H:%M:%S +0000 %Y", tz = "UTC")
    )
  )

# Remove the extracted files now that the data is in memory.
unlink(extract.dir, recursive = TRUE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details, ## e.g.: ## dat <- vroom(...) ## problems(dat)
# Delete the downloaded archive.
unlink(local.copy)

# Reset the locale: with these arguments (the defaults of a bare
# Sys.setlocale() call) every category goes back to the environment default.
Sys.setlocale(category = "LC_ALL", locale = "")
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
# Number of tweets per source.
count(health.twitter, source)
## # A tibble: 4 × 2 ## source n ## <chr> <int> ## 1 bbc 3929 ## 2 cnn 4061 ## 3 foxnews 2000 ## 4 nbc 4215
# Text preprocessing: clean each tweet, then split it into one word per row.
library(tidytext)
library(stringr)

health.words <- health.twitter %>%
  select(-id) %>%
  mutate(
    tweet = tweet %>%
      # URLs (http..., https..., ftp...) plus trailing whitespace
      str_replace_all(pattern = "(f|ht)tp\\S+\\s*", replacement = "") %>%
      # digit runs
      str_replace_all(pattern = "\\d+", replacement = "") %>%
      # retweet marker
      str_replace_all(pattern = "\\bRT", replacement = "") %>%
      # @mentions
      str_replace_all(pattern = "@\\S+", replacement = "") %>%
      # Ampersands, including the HTML entity "&amp;". Stripping only "&"
      # would leave "amp;" behind, which tokenises to a junk word "amp".
      str_replace_all(pattern = "&(amp;)?", replacement = "")
  ) %>%
  unnest_tokens(word, tweet)  # tokenise into words

health.words
## # A tibble: 124,559 × 3 ## datetime source word ## <dttm> <chr> <chr> ## 1 2015-04-09 01:31:50 bbc breast ## 2 2015-04-09 01:31:50 bbc cancer ## 3 2015-04-09 01:31:50 bbc risk ## 4 2015-04-09 01:31:50 bbc test ## 5 2015-04-09 01:31:50 bbc devised ## 6 2015-04-08 23:30:18 bbc gp ## 7 2015-04-08 23:30:18 bbc workload ## 8 2015-04-08 23:30:18 bbc harming ## 9 2015-04-08 23:30:18 bbc care ## 10 2015-04-08 23:30:18 bbc bma ## # ℹ 124,549 more rows
# Sentiment analysis: match words against the bing lexicon and keep the ten
# most frequent words per sentiment (ties at the cut-off are kept).
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Signed count: negative words get a negative value so that they can be
  # drawn on the opposite side of zero.
  mutate(nsign = ifelse(sentiment == "negative", -n, n))

health.sentiment
## # A tibble: 20 × 4 ## word sentiment n nsign ## <chr> <chr> <int> <int> ## 1 cancer negative 701 -701 ## 2 risk negative 317 -317 ## 3 death negative 193 -193 ## 4 outbreak negative 188 -188 ## 5 virus negative 147 -147 ## 6 fat negative 103 -103 ## 7 loss negative 103 -103 ## 8 deadly negative 102 -102 ## 9 warning negative 96 -96 ## 10 pain negative 92 -92 ## 11 work positive 210 210 ## 12 healthy positive 158 158 ## 13 patient positive 155 155 ## 14 good positive 139 139 ## 15 like positive 139 139 ## 16 better positive 135 135 ## 17 support positive 102 102 ## 18 free positive 95 95 ## 19 love positive 90 90 ## 20 best positive 87 87
# [Figure 7-9] Diverging bar chart of the top positive and negative words.
library(ggplot2)
library(scales)

ggplot(
  data = health.sentiment,
  mapping = aes(
    x = reorder(word, nsign),
    y = nsign,
    fill = factor(sentiment, levels = c("positive", "negative"))
  )
) +
  geom_col(color = "lightslategray", width = 0.8) +
  # Place each label just outside its bar: to the left of negative bars and to
  # the right of positive ones.
  geom_text(
    aes(label = n, hjust = ifelse(nsign < 0, 1.1, -0.1)),
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = c("cornflowerblue", "tomato")) +
  # Show absolute counts on the axis even though negative bars use -n.
  scale_y_continuous(
    breaks = pretty(health.sentiment$nsign),
    labels = abs(pretty(health.sentiment$nsign))
  ) +
  coord_flip() +
  labs(x = NULL, y = "Count", title = "Health News Tweets") +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(face = "bold"),
    axis.text = element_text(face = "bold", size = 10)
  )
# Repeat the analysis after excluding medical terms.
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(nsign = ifelse(sentiment == "negative", -n, n))

# [Figure 7-10] One panel per sentiment, each with its own axes.
library(ggplot2)

ggplot(
  data = health.sentiment,
  mapping = aes(
    x = reorder(word, n),
    y = n,
    fill = factor(sentiment, levels = c("positive", "negative"))
  )
) +
  geom_col(color = "lightslategray", width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = n), size = 3, color = "black", hjust = 1.2) +
  scale_fill_manual(values = c("lightsteelblue1", "lightsalmon1")) +
  facet_wrap(
    ~ factor(sentiment, levels = c("positive", "negative")),
    ncol = 2,
    scales = "free"
  ) +
  coord_flip() +
  labs(x = NULL, y = "Count", title = "Health News Tweets") +
  theme_light() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.line = element_line(color = "gray"),
    axis.text = element_text(face = "bold", size = 10)
  )
# [Figure 7-11] Word cloud
# The document attaches `wordcloud` and calls comparison.cloud(), so that is
# the package installed here (the original installed `wordcloud2`, which is
# never loaded). It is installed only when missing, and with an explicit CRAN
# mirror, because a non-interactive knit has no default repository and would
# fail with "trying to use CRAN without setting a mirror".
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud", repos = "https://cloud.r-project.org")
}
library(wordcloud)
library(reshape2)

# Fix the RNG seed before drawing the cloud.
set.seed(123)

health.words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() %>%
  # Reshape to a word-by-sentiment matrix of counts; absent pairs become 0.
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("tomato", "cornflowerblue"),
    title.size = 2,
    title.colors = c("red", "blue"),
    title.bg.colors = c("wheat"),
    scale = c(4, 0.3),
    max.words = 200,
    match.colors = TRUE
  )
# [Figure 7-12] Health news tweets: top 10 positive/negative words per outlet.
# slice_max() keeps ties at the cut-off, so a group may contribute more than
# ten rows.
health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  count(word, sentiment, source, sort = TRUE) %>%
  ungroup() %>%
  group_by(source, sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup()

health.sentiment
## # A tibble: 86 × 4 ## word sentiment source n ## <chr> <chr> <chr> <int> ## 1 risk negative bbc 89 ## 2 death negative bbc 72 ## 3 warning negative bbc 54 ## 4 crisis negative bbc 35 ## 5 abuse negative bbc 29 ## 6 outbreak negative bbc 26 ## 7 threat negative bbc 26 ## 8 fat negative bbc 22 ## 9 poor negative bbc 22 ## 10 strike negative bbc 22 ## # ℹ 76 more rows
# Draw the chart: one panel per outlet and sentiment, with words ordered by
# count inside each outlet.
library(ggplot2)

ggplot(
  data = health.sentiment,
  mapping = aes(
    x = reorder_within(x = word, by = n, within = source),
    y = n,
    fill = source
  )
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(
    ~ factor(source, labels = c("BBC", "CNN", "Fox News", "NBC")) + sentiment,
    ncol = 2,
    scales = "free"
  ) +
  scale_x_reordered() +
  coord_flip() +
  labs(x = NULL, y = "Count", title = "Health News Tweets") +
  theme_light() +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(color = "goldenrod4", face = "bold"),
    plot.title = element_text(face = "bold"),
    axis.line = element_line(color = "gray"),
    axis.text = element_text(face = "bold", size = 10),
    panel.grid.minor = element_blank()
  )
# Trend over time: monthly count of positive and negative words.
library(lubridate)

health.sentiment <- health.words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  mutate(time = floor_date(x = datetime, unit = "month")) %>%
  count(sentiment, time) %>%
  group_by(sentiment) %>%
  # Drop the first and last month of each sentiment (presumably because they
  # are only partially covered -- confirm). Written as a filter instead of
  # slice(2:(n() - 1)), because that range counts downward (2:1, 2:0) and
  # returns the wrong rows when a group has fewer than three months.
  filter(row_number() > 1, row_number() < n()) %>%
  ungroup()

health.sentiment
## # A tibble: 62 × 3 ## sentiment time n ## <chr> <dttm> <int> ## 1 negative 2012-09-01 00:00:00 86 ## 2 negative 2012-10-01 00:00:00 94 ## 3 negative 2012-11-01 00:00:00 54 ## 4 negative 2012-12-01 00:00:00 65 ## 5 negative 2013-01-01 00:00:00 135 ## 6 negative 2013-02-01 00:00:00 188 ## 7 negative 2013-03-01 00:00:00 183 ## 8 negative 2013-04-01 00:00:00 197 ## 9 negative 2013-05-01 00:00:00 212 ## 10 negative 2013-06-01 00:00:00 176 ## # ℹ 52 more rows
# [Figure 7-13] Draw the chart.
# Set the time locale first; the plot's axis labels use %b (month names).
Sys.setlocale(category = "LC_TIME", locale = "en_US.UTF-8")
## [1] "en_US.UTF-8"
# Overlaid area + line chart of monthly positive and negative word counts.
library(ggplot2)

ggplot(
  data = health.sentiment,
  mapping = aes(x = time, y = n, fill = sentiment, color = sentiment)
) +
  geom_area(position = "identity", alpha = 0.3) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(linewidth = 1.5) +
  scale_fill_manual(
    labels = c("Negative", "Positive"),
    values = c("orangered", "deepskyblue2")
  ) +
  scale_color_manual(
    labels = c("Negative", "Positive"),
    values = c("orangered", "deepskyblue2")
  ) +
  scale_x_datetime(date_labels = "%b %Y", date_breaks = "6 months") +
  # Title typo fixed ("Twetts" -> "Tweets") to match the other figures.
  labs(x = NULL, y = "Count", title = "Health News Tweets") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text = element_text(face = "bold"),
    legend.position = "bottom",
    legend.title = element_blank()
  )
# Reset every locale category to the environment default; these arguments are
# the defaults of a bare Sys.setlocale() call.
Sys.setlocale(category = "LC_ALL", locale = "")
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
# [Figure 7-14] Trend of positive/negative word frequency per outlet.
# Set the time locale first; the plot's axis labels use %b (month names).
Sys.setlocale(category = "LC_TIME", locale = "en_US.UTF-8")
## [1] "en_US.UTF-8"
# Monthly positive/negative word counts, one panel per outlet.
library(ggplot2)

health.words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(!(word %in% c("patient", "cancer", "virus"))) %>%
  mutate(time = floor_date(datetime, unit = "month")) %>%
  count(source, sentiment, time) %>%
  group_by(source, sentiment) %>%
  # Drop the first and last month of each outlet/sentiment series (presumably
  # partial months -- confirm). A filter is used instead of
  # slice(2:(n() - 1)), because that range counts downward and returns the
  # wrong rows when a series has fewer than three months.
  filter(row_number() > 1, row_number() < n()) %>%
  ungroup() %>%
  ggplot(aes(x = time, y = n, fill = sentiment, color = sentiment)) +
  geom_area(position = "identity", alpha = 0.3) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(linewidth = 1.5) +
  facet_wrap(
    ~ factor(source, labels = c("BBC", "CNN", "Fox News", "NBC")),
    nrow = 4,
    scales = "free"
  ) +
  scale_fill_manual(
    labels = c("Negative", "Positive"),
    values = c("coral", "cornflowerblue")
  ) +
  scale_color_manual(
    labels = c("Negative", "Positive"),
    values = c("coral", "cornflowerblue")
  ) +
  scale_x_datetime(date_labels = "%b %Y", date_breaks = "2 months") +
  labs(x = NULL, y = "Count", title = "Health News Tweets") +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(size = 8),
    legend.position = "bottom",
    legend.title = element_blank()
  )
# Reset every locale category to the environment default; these arguments are
# the defaults of a bare Sys.setlocale() call.
Sys.setlocale(category = "LC_ALL", locale = "")
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
You can also embed plots, for example:
# Example of an embedded plot: draw the `cars` data set with base graphics.
plot(cars)