# NLP+Asso

# Point the Java-backed packages loaded below (rJava, KoNLP) at a local JDK.
# The JDK path is specific to one machine, so JAVA_HOME is only overridden
# when that directory actually exists. Otherwise the current value is kept and
# a warning is raised, instead of silently installing a JAVA_HOME that points
# nowhere.
Sys.getenv("JAVA_HOME")
jdk_home <- "C:/Program Files/Java/jdk-11.0.3"
if (dir.exists(jdk_home)) {
  Sys.setenv(JAVA_HOME = jdk_home)
} else {
  warning("JDK directory not found, JAVA_HOME left unchanged: ", jdk_home,
          call. = FALSE)
}
Sys.getenv("JAVA_HOME")

library(KoNLP)
library(rJava)
library(tm)

library(stringr)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)

# library(KoSpacing)

# Load the input file, one vector element per line.
# The file was originally saved in the Windows "ANSI" code page; this is
# assumed to be CP949 (Korean), matching the Encoding() note below. Confirm
# this if the file has been re-saved. Declaring the encoding on the connection
# makes readLines() re-encode each line into the session's native encoding, so
# the Hangul character class used below matches in non-CP949 sessions too.
txt <- file("taxi_spot.txt", encoding = "CP949")
# Other encodings tried for this file: encoding="UTF-8", encoding = "UCS-2LE"
text <- readLines(txt)
close(txt)
text
# Encoding(text) = "CP949"
# Stop-word handling and spacing cleanup
text <- gsub("\\[(.*?)\\]", " ", text)   # drop bracketed text, brackets included
# text <- gsub("현재", "",text)           # remove one specific word
text <- gsub("[^가-힣]", " ", text)       # replace every non-Hangul character with a space
# text <- gsub("[^가-힣]", "",text)       # alternative: delete non-Hangul characters outright
# text <- spacing(text)                  # spacing correction
text <- str_trim(text)                   # strip leading and trailing whitespace

# Dictionary setup
useNIADic()
# useNIADic(which_dic = c("woorimalsam", "insighter"), category_dic_nms = "", backup = T)
# useSejongDic()
#word <- data.frame(c("", "혁신"))      # add user-defined words
#buildDictionary(ext_dic = c("sejong", "woorimalsam"), user_dic=word, replace_usr_dic = T)
#buildDictionary(ext_dic = c("woorimalsam"), user_dic=word, replace_usr_dic = T)


# library(KoSpacing)

# Noun extraction and word filtering
# lapply() always yields exactly one list element per input line. sapply()
# would simplify the result to a vector or matrix whenever every line happens
# to produce the same number of nouns, which breaks the per-line nouns[[i]]
# lookups done later in the association analysis.
nouns <- lapply(text, extractNoun)                  # KoNLP noun extraction, one entry per line
unlist_nouns <- unlist(nouns)                       # flatten to a single character vector
# Keep nouns that are 2 to 6 characters long; which() drops NA lengths.
unlist_nouns_n <- unlist_nouns[
  which(nchar(unlist_nouns) >= 2 & nchar(unlist_nouns) <= 6)
]
wordcount <- table(unlist_nouns_n)
wordcount <- sort(wordcount, decreasing = TRUE)     # most frequent nouns first
head(wordcount, 100)

# Word cloud 2 - draw only the words whose frequency falls inside a band
wordcount_1 <- data.frame(wordcount)
summary(wordcount_1)
# Keep rows with 3 <= Freq <= 2000
wordcount_2 <- wordcount_1[
  which(wordcount_1$Freq >= 3 & wordcount_1$Freq <= 2000), , drop = FALSE
]
# wordcount_2 <- subset(wordcount_1, Freq>=30 & Freq<=2000 & Freq!=808)   # stricter band that also skips Freq 808
wordcloud2(
  data = wordcount_2,
  size = 1.5,
  fontFamily = 'Segoe UI',
  fontWeight = 'bold',
  shape = 'pentagon'
)

# Association analysis: build a line-by-keyword presence matrix
str(nouns)
library(arules)

# Names of the most frequent nouns, at most 30. wordcount was sorted in
# decreasing order when it was built, so its leading entries are the top
# words. The count is capped at length(wordcount) so a small corpus does not
# produce NA keywords.
keyword_taxi <- dimnames(wordcount[seq_len(min(30L, length(wordcount)))])
keywords <- unlist(keyword_taxi, use.names = FALSE)

# One row per element of nouns, one column per keyword: 1 when the keyword
# occurs among that line's nouns, 0 otherwise. The matrix is built in a single
# step for every line, rather than by rbind() inside a loop over a hard-coded
# number of lines.
contents <- matrix(
  as.numeric(unlist(lapply(nouns, function(doc) keywords %in% doc))),
  nrow = length(nouns), ncol = length(keywords), byrow = TRUE,
  dimnames = list(NULL, keywords)
)
head(contents)

# Mine association rules from the keyword presence matrix
trans <- as.matrix(contents)   # str(comments$contents)
rules1 <- apriori(
  trans,
  parameter = list(supp = 0.005, conf = 0.6, target = "rules")
)

# Order the mined rules by lift and list them
rules1.sorted <- sort(rules1, by = "lift")
inspect(rules1.sorted)


library(arulesViz)
# Visualise the lift-sorted rules with several arulesViz plot methods
plot(rules1.sorted, method = "scatterplot")
plot(rules1.sorted, method = "graph", control = list(type = "items", alpha = 0.8))
plot(rules1.sorted, method = "grouped")
# NOTE(review): newer arulesViz releases appear to deprecate `interactive` in
# favour of the `engine` argument - confirm against the installed version.
plot(rules1.sorted, method = "graph", interactive = TRUE)


library(corrplot)
# corrplot(cor(contents), method="number", diag=F)

# Correlation between the keyword columns, computed once and reused by both
# plots below.
contents_cor <- cor(contents)

# NOTE(review): sig.level / insig / pch.cex presumably only take effect when a
# p-value matrix (p.mat) is supplied - confirm against the corrplot docs.
corrplot(contents_cor, method = "number", order = "hclust", addrect = 2,
         diag = FALSE, tl.col = "black", tl.cex = 1.1,
         sig.level = 0.05, pch.cex = 0.9, insig = "pch")

corrplot.mixed(contents_cor, upper = "ellipse", lower = "number")

# NLP+Asso

# updragon

# 2019 8 5