# Point rJava/KoNLP at the local JDK 11 install.
# The path is machine-specific, so JAVA_HOME is only overridden when that
# directory actually exists; otherwise the inherited value is kept instead of
# being replaced by a path that does not exist on this machine.
Sys.getenv("JAVA_HOME") # value inherited from the environment
jdk_home <- "C:/Program Files/Java/jdk-11.0.3"
if (dir.exists(jdk_home)) {
  Sys.setenv(JAVA_HOME = jdk_home)
} else {
  warning(
    "JDK directory not found, keeping existing JAVA_HOME: ", jdk_home,
    call. = FALSE
  )
}
Sys.getenv("JAVA_HOME") # value in effect for the libraries loaded below
library(KoNLP)
library(rJava)
library(tm)
library(stringr)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
# library(KoSpacing)
# Load the input file
# The file is noted as being saved in ANSI form (presumably CP949, the code
# page the earlier attempts below refer to). Declaring the encoding on the
# connection makes readLines() re-encode each line into the session's native
# encoding, so the Hangul regexes further down still match when the native
# encoding is UTF-8 rather than CP949.
# Earlier attempts, kept for reference: encoding = "UTF-8",
# encoding = "UCS-2LE", Encoding(text) = "CP949"
txt <- file("taxi_spot.txt", encoding = "CP949")
text <- readLines(txt)
close(txt)
text
# Stop-word handling and spacing cleanup, applied inside-out:
#   1. replace every bracketed span (brackets included) with a space
#   2. replace every character that is not a Hangul syllable with a space
#   3. strip leading and trailing whitespace from each line
# Optional steps, left disabled:
#   text <- gsub("현재", "", text)      # remove one specific word
#   text <- gsub("[^가-힣]", "", text)  # delete non-Hangul instead of blanking
#   text <- spacing(text)               # spacing correction
text <- str_trim(
  gsub(
    pattern = "[^가-힣]",
    replacement = " ",
    x = gsub(pattern = "\\[(.*?)\\]", replacement = " ", x = text)
  )
)
# Add the dictionary (the NIA dictionary, going by the function name) before
# the noun-extraction step that follows.
useNIADic()
# Alternative dictionary setups, kept for reference (all disabled):
# useNIADic(which_dic = c("woorimalsam", "insighter"), category_dic_nms = "", backup = T)
# useSejongDic()
#word <- data.frame(c("", "혁신")) # add user-defined words
#buildDictionary(ext_dic = c("sejong", "woorimalsam"), user_dic=word, replace_usr_dic = T)
#buildDictionary(ext_dic = c("woorimalsam"), user_dic=word, replace_usr_dic = T)
# library(KoSpacing)
# Noun extraction and word selection
# lapply() always returns one list element per input line. sapply() may
# simplify its result to a vector or matrix when every line yields the same
# number of nouns, which would break the per-line nouns[[i]] indexing used
# later in this script.
nouns <- lapply(text, extractNoun) # KoNLP noun extraction, one entry per line
unlist_nouns <- unlist(nouns) # flatten into a single character vector
# Keep nouns that are 2 to 6 characters long. which() discards NA lengths,
# matching what Filter() did in the element-by-element version.
noun_len <- nchar(unlist_nouns)
unlist_nouns_n <- unlist_nouns[which(noun_len >= 2 & noun_len <= 6)]
# Frequency table of the retained nouns, most frequent first.
wordcount <- table(unlist_nouns_n)
wordcount <- sort(wordcount, decreasing = TRUE)
head(wordcount, 100)
# Word cloud 2 - draw only the words inside a frequency band
wordcount_1 <- data.frame(wordcount)
summary(wordcount_1)
# Keep rows whose frequency lies between 3 and 2000, both ends included.
# Stricter variant, disabled: Freq >= 30 & Freq <= 2000 & Freq != 808
wordcount_2 <- wordcount_1[wordcount_1$Freq >= 3 & wordcount_1$Freq <= 2000, ]
wordcloud2(
  data = wordcount_2,
  size = 1.5,
  fontFamily = 'Segoe UI',
  fontWeight = 'bold',
  shape = 'pentagon'
)
# Association analysis
str(nouns)
library(arules)
# Top 30 nouns by frequency (wordcount is already sorted in decreasing order).
# head() stops at the end of the table, so fewer than 30 distinct nouns no
# longer indexes past it. The value stays wrapped in a list so that
# unlist(keyword_taxi) keeps working.
keyword_taxi <- list(head(names(wordcount), 30L))
keywords <- unlist(keyword_taxi, use.names = FALSE)
# Line-by-keyword indicator matrix: cell [i, j] is 1 when keyword j occurs
# among the nouns of line i, otherwise 0. The matrix is preallocated and the
# loop covers every element of `nouns`, rather than a hard-coded line count
# that must be edited whenever the input file changes.
contents <- matrix(
  0,
  nrow = length(nouns),
  ncol = length(keywords),
  dimnames = list(NULL, keywords)
)
for (i in seq_along(nouns)) {
  contents[i, ] <- as.numeric(keywords %in% nouns[[i]])
}
head(contents)
trans <- as.matrix(contents)
# Mine rules with support >= 0.005 and confidence >= 0.6, then order by lift.
rules1 <- apriori(
  trans,
  parameter = list(supp = 0.005, conf = 0.6, target = "rules")
)
rules1.sorted <- sort(rules1, by = "lift")
inspect(rules1.sorted)
library(arulesViz)
# Draw the lift-sorted rules in several layouts.
plot(rules1.sorted, method = "scatterplot")
plot(
  rules1.sorted,
  method = "graph",
  control = list(type = "items", alpha = 0.8)
)
plot(rules1.sorted, method = "grouped")
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE(review): recent arulesViz releases appear to deprecate `interactive`
# in favour of `engine` - confirm against the installed version.
plot(rules1.sorted, method = "graph", interactive = TRUE)
library(corrplot)
# Correlation between the keyword indicator columns of `contents`, computed
# once and shared by both plots instead of being recomputed per call.
keyword_cor <- cor(contents)
# corrplot(keyword_cor, method = "number", diag = FALSE)
# NOTE(review): sig.level, pch.cex and insig are passed without a p.mat;
# check whether they have any effect here.
corrplot(
  keyword_cor,
  method = "number",
  order = "hclust",
  addrect = 2,
  diag = FALSE,
  tl.col = "black",
  tl.cex = 1.1,
  sig.level = 0.05,
  pch.cex = 0.9,
  insig = "pch"
)
corrplot.mixed(keyword_cor, upper = "ellipse", lower = "number")