1. 형태소 분석

1.1 형태소 분석기 설치 RHINO::

# Attach the packages used throughout these notes: devtools provides
# install_github(), and KoNLP (which relies on rJava) provides the Korean
# morphological analysis functions used below.
library(devtools)
library(rJava)
library(KoNLP)

# Install the RHINO morphological analyzer from its GitHub repository.
devtools::install_github(repo = "SukjaeChoi/RHINO")

1.2 KoNLP

1.2.1 KoNLP::extractNoun()으로 명사 추출하기

# Load the sample news text, write it out as CSV, and read the CSV back as
# raw lines so that each line can be analyzed separately.
# NOTE(review): read.any() is not a base R function; presumably it comes from
# a package loaded elsewhere -- confirm.
txt <- read.any("data/sample_news.txt")
write.csv(txt, "data/sample_news.csv")
txt <- readLines("data/sample_news.csv")

# Run the noun extractor on every line; the result is a list with one
# element per input line.
noun <- lapply(txt, extractNoun)
noun
head(txt)

# Flatten the per-line results into a single character vector.
noun <- unlist(noun)
noun

# Strip digits from every token, then drop the tokens that ended up empty.
noun <- gsub("[[:digit:]]", "", noun)
# Use a logical mask instead of -which(): when no token is empty, which()
# returns integer(0) and x[-integer(0)] would discard every element.
noun <- noun[nzchar(noun)]

1.2.2 KoNLP::SimplePos09()

library(rvest)

# Download the news article page (served as EUC-KR) and pull out the text of
# the node with id "articleBodyContents".
url <- "https://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid=001&aid=0010245141&isYeonhapFlash=Y&rc=N"
page <- read_html(url, encoding = "euc-kr")
article <- page %>%
  html_nodes("#articleBodyContents") %>%
  html_text()
article

# Tag the text with SimplePos09(); note that this overwrites `article` with
# the tagging result.
article <- SimplePos09(head(article))

# Keep only the tagged entries that contain "P".
# NOTE(review): "P" presumably marks predicates in the SimplePos09 tag set,
# but the pattern also matches any other occurrence of the letter -- confirm.
grep(pattern = "P", x = article, value = TRUE)

1.3 RHINO::getMorph

  • mac에서는 동작 안됨.
# Attach RHINO and call its initializer before requesting any analysis.
library(RHINO)
initRhino()

# Apply getMorph() with the "noun" option to each line of `txt`, collecting
# one result per line in a list.
noun2 <- lapply(txt, function(line) getMorph(line, "noun"))

1.4 txt 형태의 데이터를 데이터프레임 형태로 바꾸기

# Read the CSV as raw lines and wrap them in a one-column data frame.
txt <- readLines("data/sample_news.csv")
# Keep the lines as character strings rather than factors on R < 4.0.
txt.df <- as.data.frame(txt, stringsAsFactors = FALSE)

# Drop the first line (the header row that write.csv() emits by default).
# drop = FALSE is required: without it, subsetting a one-column data frame
# collapses the result to a plain vector instead of a data frame.
txt.df <- txt.df[-1, , drop = FALSE]
View(txt.df)

1.5 네이버 기사 크롤링 후 형태소 분석

library(rvest)

# Download the news article page (served as EUC-KR) and pull out the text of
# the node with id "articleBodyContents".
url <- "https://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid=001&aid=0010245141&isYeonhapFlash=Y&rc=N"
page <- read_html(url, encoding = "euc-kr")
article <- page %>%
  html_nodes("#articleBodyContents") %>%
  html_text()
article

# Extract nouns from the article text, then flatten the list result into a
# single character vector. The original called unlist(x), but no `x` is
# defined here; the list to flatten is `noun`.
noun <- lapply(article, extractNoun)
noun <- unlist(noun)

1.6 형태소 분석 관련 함수 sort(), table()

1.6.1 기본 사용법

# Count how often each noun occurs and print the counts, most frequent first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort(table(noun), decreasing = TRUE)

# Flatten `noun` (a no-op if it is already a vector) and build the frequency
# table used by the following steps.
nounVec <- unlist(noun)

nounFreq <- table(nounVec)

# Show the 20 most frequent nouns.
head(sort(nounFreq, decreasing = TRUE), 20)

1.6.2 고빈도 단어 추출 후 barplot 작성

# Take the 20 most frequent nouns once and derive both the labels and the
# counts from that single result.
topFreq <- head(sort(nounFreq, decreasing = TRUE), 20)
word <- names(topFreq)
freq <- as.vector(topFreq)

# Total number of noun occurrences, used as the denominator for percentages.
# The variable name `sum` is kept as in the original script; calls to sum()
# still resolve to the base function because R skips non-function bindings
# when looking up a function.
sum <- sum(nounFreq)
percent <- round(freq / sum * 100, digits = 2)
mainTxt <- "고빈도 단어"

# Keep the original 0-5% axis when it fits, but extend the upper limit when a
# word exceeds 5% so that its bar is not cut off.
yMax <- max(5, ceiling(max(percent)))

# Draw the bar chart; barplot() returns the bar midpoints, stored in `bq`.
bq <- barplot(
  percent,
  main = mainTxt,
  las = 2,
  ylim = c(0, yMax),
  ylab = "%",
  names.arg = word,
  col = "black"
)