Article


Dankook University Press (단대신문) No. 1436, in-depth feature
The Light and Shadow of Anonymity over the Bamboo Forest (대나무숲에 드리운 익명성의 빛과 그림자)
Link: http://dknews.dankook.ac.kr/news/articleView.html?idxno=15191


Web Crawling


library(Rfacebook)


fb_oauth <- fbOAuth(app_id = "facebook_app_id", app_secret = "facebook_app_secret",
                    extended_permissions = FALSE)
getUsers("me", token = fb_oauth)



start_date <- '2017/01/01'
end_date <- '2017/12/31'
scrape_days <- seq(from = as.Date(start_date), to = as.Date(end_date), by = 'days')
posts <- c()



for (scrape_day in scrape_days)
{
  daypost <- c()
  tryCatch({daypost <- getPage(page = "dkubamboo",
                               token = fb_oauth,
                               since = as.Date(scrape_day, origin = "1970-01-01"),
                               until = as.Date(scrape_day, origin = "1970-01-01") + 1
  )},
  error = function(e){}
  )
  posts <- rbind(posts, daypost)
}



# getPage() returns a data frame; keep only the post text for the word analysis
write(posts$message, "_Duplicate_1231 bamboo.txt")
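
A quick sanity check on the crawl helps catch empty days or an expired token before moving on. A minimal sketch, assuming the posts data frame built above (column names such as message and created_time follow Rfacebook's getPage output):

nrow(posts)                 # total number of posts collected
head(posts$message, 3)      # peek at the first few post texts
range(posts$created_time)   # confirm the scrape covers the whole year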






Word Count


library(KoNLP)



txt <- readLines("_Duplicate_1231 bamboo.txt")
order <- sapply(txt, extractNoun, USE.NAMES = F)
txt2 <- unlist(order)
txt3 <- Filter(function(x){nchar(x) >= 2}, txt2)



txt3 <- gsub("\\d+", "", txt3)
txt3 <- gsub("\\(", "", txt3)
txt3 <- gsub("\\)", "", txt3)
txt3 <- gsub("@", "", txt3)
txt3 <- gsub("#", "", txt3)
txt3 <- gsub("\\+", "", txt3)
txt3 <- gsub("\\?", "", txt3)
txt3 <- gsub(":", "", txt3)
txt3 <- gsub("\\^", "", txt3)
txt3 <- gsub("\\*", "", txt3)
txt3 <- gsub("~", "", txt3)
txt3 <- gsub("[A-Za-z]", "", txt3)
txt3 <- gsub("[^가-힣]", "", txt3)
txt3 <- gsub("&", "", txt3)
txt3 <- gsub("!", "", txt3)
txt3 <- gsub("/", "", txt3)
txt3 <- gsub("-", "", txt3)
txt3 <- gsub("–", "", txt3)
txt3 <- gsub("\\.", "", txt3)


txt3 <- gsub("해서", "", txt3)
txt3 <- gsub("하게", "", txt3)
txt3 <- gsub("하시", "", txt3)
txt3 <- gsub("오전", "", txt3)
txt3 <- gsub("오후", "", txt3)
txt3 <- gsub("대숲", "", txt3)
txt3 <- gsub("이후", "", txt3)
txt3 <- gsub("단국대학교", "", txt3)
txt3 <- gsub("단국대", "", txt3)
txt3 <- gsub("대나무숲", "", txt3)



head(unlist(txt3), 20)
write(unlist(txt3), "1231 전처리.txt")



txt4 <- read.table("1231 전처리.txt")
nrow(txt4)
wordcount <- table(txt4)
ppap <- head(sort(wordcount, decreasing = T), 1000)
write.table(ppap, "1231 단어집계 순위.txt")
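
Before drawing a word cloud, it is worth eyeballing the frequency distribution directly. A minimal sketch, assuming the wordcount table built above:

top20 <- head(sort(wordcount, decreasing = TRUE), 20)
barplot(top20, las = 2, main = "Top 20 nouns")   # las = 2 rotates the Korean labels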






WordCloud

*Python



#!/usr/bin/env python
"""
Masked wordcloud
================
Using a mask you can generate wordclouds in arbitrary shapes.
"""



from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os



from wordcloud import WordCloud, STOPWORDS



# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()



# Read the whole text.
text = open(path.join(d, 'alice.txt')).read()



# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))



stopwords = set(STOPWORDS)
stopwords.add("said")



wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask, stopwords=stopwords, contour_width=3, contour_color='steelblue')



# generate word cloud
wc.generate(text)



# store to file
wc.to_file(path.join(d, "alice.png"))



# show
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()
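
The script above renders the stock alice.txt example. To draw the actual bamboo-forest cloud from the Korean frequency table saved in the Word Count step without leaving R, the wordcloud package (a different library from the Python one above, and without mask support) can consume the same word/count pairs. A minimal sketch, assuming "1231 단어집계 순위.txt" is the two-column table written by write.table earlier and that the graphics device uses a font with Hangul glyphs:

library(wordcloud)
library(RColorBrewer)

freq <- read.table("1231 단어집계 순위.txt")   # assumed layout: first column word, second column count
wordcloud(words = as.character(freq[[1]]), freq = freq[[2]],
          max.words = 300, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))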






LDA



library(lda)
library(stringr)
library(tm)
library(KoNLP)
library(topicmodels)
library(LDAvis)
library(servr)
library(LDAvisData)



# mytext<- VCorpus(DirSource())



# LDAvisData can be installed from GitHub via
# devtools::install_github("cpsievert/LDAvisData")
# reviews <- mytext



# read in some stopwords:
library(tm)
stop_words <- stopwords("SMART")


#load file
reviews <- readLines("1231 전처리.txt")
reviews <- sapply(reviews, extractNoun, USE.NAMES = F)
reviews <- unlist(reviews)



# pre-processing:
reviews <- gsub("'", "", reviews)               # remove apostrophes
reviews <- gsub("[[:punct:]]", " ", reviews)    # replace punctuation with space
reviews <- gsub("[[:cntrl:]]", " ", reviews)    # replace control characters with space
reviews <- gsub("^[[:space:]]+", "", reviews)   # remove whitespace at beginning of documents
reviews <- gsub("[[:space:]]+$", "", reviews)   # remove whitespace at end of documents
reviews <- tolower(reviews)                     # force to lowercase



# tokenize on space and output as a list:
doc.list <- strsplit(reviews, "[[:space:]]+")



# compute the table of terms:
term.table <- table(unlist(doc.list))
term.table <- sort(term.table, decreasing = TRUE)



# remove terms that are stop words or occur fewer than 5 times:
del <- names(term.table) %in% stop_words | term.table < 5
term.table <- term.table[!del]
vocab <- names(term.table)



# now put the documents into the format required by the lda package:
get.terms <- function(x) {
  index <- match(x, vocab)
  index <- index[!is.na(index)]
  rbind(as.integer(index - 1), as.integer(rep(1, length(index))))
}
documents <- lapply(doc.list, get.terms)



# Compute some statistics related to the data set:
D <- length(documents)                                     # number of documents
W <- length(vocab)                                         # number of terms in the vocab
doc.length <- sapply(documents, function(x) sum(x[2, ]))   # number of tokens per document
N <- sum(doc.length)                                       # total number of tokens in the data
term.frequency <- as.integer(term.table)                   # frequency of each term in the corpus



# MCMC and model tuning parameters:
K <- 5         # number of topics
G <- 5000      # number of Gibbs sampling iterations
alpha <- 0.02  # Dirichlet prior on the document-topic distributions
eta <- 0.02    # Dirichlet prior on the topic-term distributions



# Fit the model:
library(lda)
set.seed(357)
t1 <- Sys.time()
fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab, num.iterations = G, alpha = alpha, eta = eta, initial = NULL, burnin = 0, compute.log.likelihood = TRUE)

t2 <- Sys.time()
t2 - t1 # about 24 minutes on laptop
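 
Once the sampler finishes, the lda package's top.topic.words() gives a quick text-only view of each topic before building the interactive LDAvis page. A minimal check using the fit object above:

top.topic.words(fit$topics, num.words = 10, by.score = TRUE)   # ten highest-scoring terms per topic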



theta <- t(apply(fit$document_sums + alpha, 2, function(x) x / sum(x)))
phi <- t(apply(t(fit$topics) + eta, 2, function(x) x / sum(x)))



MovieReviews <- list(phi = phi, theta = theta, doc.length = doc.length, vocab = vocab, term.frequency = term.frequency)



options(encoding = 'UTF-8')



library(LDAvis)


# create the JSON object to feed the visualization:
json <- createJSON(phi = MovieReviews$phi,
                   theta = MovieReviews$theta,
                   doc.length = MovieReviews$doc.length,
                   vocab = MovieReviews$vocab,
                   term.frequency = MovieReviews$term.frequency, encoding = 'UTF-8')



serVis(json, out.dir = 'vis', open.browser = TRUE)




Reference

[1] [TIP] Crawling Facebook page data in R (R에서 페이스북 페이지 정보 크롤링 하기), AirPAGE
https://airpage.org/xe/language_data/22905
[2] cpsievert, A topic model for movie reviews, GitHub
https://github.com/cpsievert/LDAvis/blob/master/docs/reviews/reviews.md
[3] amueller, word_cloud/examples/masked.py, GitHub
https://github.com/amueller/word_cloud/blob/master/examples/masked.py

