# 1. 형태소 분석
# 1.1 형태소 분석기 설치 RHINO::
# Load the packages used by the morphological-analysis examples.
library(devtools)
library(rJava)
library(KoNLP)
# Install RHINO from GitHub only when it is not already available, so that
# re-running the script does not trigger a network install every time.
if (!requireNamespace("RHINO", quietly = TRUE)) {
  install_github("SukjaeChoi/RHINO")
}
# 1.2 KoNLP
# 1.2.1 KoNLP::extractNoun()으로 명사 추출하기
# Read the sample article, round-trip it through a CSV file, and extract the
# nouns of every line with extractNoun().
# NOTE(review): read.any() is not defined in the visible part of this file;
# presumably a helper loaded elsewhere -- confirm.
txt <- read.any("data/sample_news.txt")
write.csv(txt, "data/sample_news.csv")
# readLines() returns the raw lines of the CSV that was just written.
txt <- readLines("data/sample_news.csv")
noun <- lapply(txt, extractNoun)
noun
head(txt)
# Flatten the per-line results into a single vector.
noun <- unlist(noun)
noun
# Strip digits from every token.
noun <- gsub("[[:digit:]]", "", noun)
# Drop tokens that are now empty. nzchar() replaces noun[-which(noun == "")]:
# when no token is empty, which() returns integer(0) and the negative index
# would silently drop every element instead of none.
noun <- noun[nzchar(noun)]
# 1.2.2 KoNLP::SimplePos09()
# Download a Naver news article and tag its body text with SimplePos09().
url <- "https://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid=001&aid=0010245141&isYeonhapFlash=Y&rc=N"
library(rvest)
page <- read_html(url, encoding = "euc-kr")
# Pull the text of the node with id "articleBodyContents".
article <- page %>%
  html_nodes("#articleBodyContents") %>%
  html_text()
article
article <- SimplePos09(head(article))
# Keep only the tagged entries whose text contains "P"
# (presumably the predicate tag of the SimplePos09 tag set -- confirm).
grep(pattern = "P", x = article, value = TRUE)
# 1.3 RHINO::getMorph
# - mac에서는 동작 안됨.
# (The note above says this section does not work on macOS.)
library(RHINO)
initRhino()
# Apply getMorph() with type "noun" to every element of txt.
noun2 <- lapply(txt, getMorph, "noun")
# 1.4 txt형태의 데이터를 데이터프레임 형태로 바꾸기
# Read the CSV lines and hold them in a one-column data frame.
txt <- readLines("data/sample_news.csv")
txt.df <- as.data.frame(txt, stringsAsFactors = FALSE)
# Remove the first row. drop = FALSE is required: with a single column,
# txt.df[-1, ] would collapse the result to a plain vector instead of keeping
# it a data frame.
txt.df <- txt.df[-1, , drop = FALSE]
View(txt.df)
# 1.5 네이버 기사 크롤링 후 형태소 분석
# Download a Naver news article and extract the nouns from its body text.
url <- "https://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid=001&aid=0010245141&isYeonhapFlash=Y&rc=N"
library(rvest)
page <- read_html(url, encoding = "euc-kr")
# Pull the text of the node with id "articleBodyContents".
article <- page %>%
  html_nodes("#articleBodyContents") %>%
  html_text()
article
noun <- lapply(article, extractNoun)
# Flatten the list of nouns. The original called unlist(x), but no `x` is
# defined in this script; `noun` is the list produced on the previous line.
noun <- unlist(noun)
# 1.6 형태소 분석 관련 함수 sort(), table()
# 1.6.1 기본 사용법
# Count how often each noun occurs and list the counts in descending order.
sort(table(noun), decreasing = TRUE)
nounVec <- unlist(noun)
nounFreq <- table(nounVec)
# Show the 20 most frequent nouns.
head(sort(nounFreq, decreasing = TRUE), 20)
# 1.6.2 고빈도 단어 추출 후 barplot작성
# Plot the 20 most frequent nouns as percentages of all noun occurrences.
# Sort and truncate once, then derive both the labels and the counts from it.
top_freq <- head(sort(nounFreq, decreasing = TRUE), 20)
word <- names(top_freq)
freq <- as.vector(top_freq)
# NOTE(review): the variable `sum` shadows the name of base::sum(); the name is
# kept because code outside this section may read it -- confirm and rename.
sum <- sum(nounFreq)
percent <- round(freq / sum * 100, digits = 2)
mainTxt <- "고빈도 단어"
# The y-axis covers at least 0-5 % and grows when a word exceeds 5 %, so tall
# bars are no longer cut off by a fixed upper limit.
bq <- barplot(
  percent,
  main = mainTxt,
  las = 2,
  ylim = c(0, max(5, ceiling(percent))),
  ylab = "%",
  names.arg = word,
  col = "black"
)