# Before running, choose File > Save with Encoding and select UTF-8.
# Set the working directory so that it points to the folder that contains the blogs, news and tweets files (English data).
# Remove every object returned by ls() so the script starts from a clean
# environment.
rm(list = ls())

# The data files are looked up under "final/en_US" below the current working
# directory; make that folder the working directory so the files can be
# opened by their bare names later on.
data_path <- paste0(getwd(), "/final/en_US")
setwd(data_path)
# Packages used in this project:
library(knitr) # for knit html
library(ggplot2) # for ggplot2 plotting
library(NLP) # for natural language processing
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringr) # package for handling string in R
library(qdap) # count word
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install Rtools 3.1 from http://cran.r-project.org/bin/windows/Rtools/ and then run find_rtools().
##
## Attaching package: 'qdap'
##
## The following object is masked from 'package:base':
##
## Filter
library(stringi) # use to count lines fast
library(pryr) # to see file size with command object_size
library(wordcloud) # for visualization
# Load the three English data sets. Each file is opened through a binary-mode
# ("rb") connection, read completely with readLines() (strings are marked as
# UTF-8) and the connection is closed again as soon as the file has been read.

# Twitter data; warn = FALSE silences the readLines() warnings for this file.
con_twitts <- file("en_US.twitter.txt", open = "rb")
twitts <- readLines(con_twitts, encoding = "UTF-8", warn = FALSE)
close(con_twitts)

# News data.
con_news <- file("en_US.news.txt", open = "rb")
news <- readLines(con_news, encoding = "UTF-8")
close(con_news)

# Blog data.
con_blogs <- file("en_US.blogs.txt", open = "rb")
blogs <- readLines(con_blogs, encoding = "UTF-8")
close(con_blogs)
# Number of lines in each data set. stri_stats_general() returns a named
# vector of statistics; only its "Lines" entry is printed here. The lines
# starting with "##" are the output recorded from a previous run.
stri_stats_general(twitts)["Lines"]
## Lines
## 2360148
stri_stats_general(news)["Lines"]
## Lines
## 1010242
stri_stats_general(blogs)["Lines"]
## Lines
## 899288

# In-memory size of each object (disabled):
#object_size(twitts)
#object_size(news)
#object_size(blogs)
# Approximate word count of each data set. gregexpr() locates every run of
# non-word characters ("\\W+") in a line; the number of matches per line is
# used as a proxy for its number of words and the per-line counts are summed.
# NOTE: a line without any match still contributes 1, because gregexpr() then
# returns the single value -1.
# vapply() replaces sapply() so the per-line counts are always an integer
# vector, even for empty input (sapply() would return an empty list and sum()
# would fail).
sum(vapply(gregexpr("\\W+", twitts), length, integer(1)))
## [1] 30433240
sum(vapply(gregexpr("\\W+", news), length, integer(1)))
## [1] 35710862
sum(vapply(gregexpr("\\W+", blogs), length, integer(1)))
## [1] 38222278
# Random sampling:
# The full data sets are not needed for the exploration below, so 2000 lines
# are drawn at random (without replacement, the default of sample()) from each
# of them. The seed makes the draw reproducible.
set.seed(1233)
ran_twitts <- sample(twitts, size = 2000)
ran_news <- sample(news, size = 2000)
ran_blogs <- sample(blogs, size = 2000)

# `all` is the combined sample. paste() works element by element, so entry i
# of `all` is the i-th sampled tweet, news line and blog line joined by single
# spaces.
all <- paste(ran_twitts, ran_news, ran_blogs, sep = " ")

# Summary statistics of the combined sample; "Lines" should be 2000.
stri_stats_general(all)
## Lines LinesNEmpty Chars CharsNWhite
## 2000 2000 1011841 835481
#sum(sapply(gregexpr("\\W+", all), length))
library(tm)
##
## Attaching package: 'tm'
##
## The following object is masked from 'package:pryr':
##
## inspect
##
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
# Tokenization: build a tm corpus with one document per element of `all`.
# The text is English, so the corpus language is declared as "en" (it was
# declared as "lat" before).
my_corp <- Corpus(VectorSource(all), readerControl = list(language = "en"))

# Clean the corpus. The order of the steps matters:
#   1. lower-case first, so the lower-case word lists below can match;
#   2. remove stop words and profanity BEFORE punctuation is stripped and
#      BEFORE stemming, because entries such as "don't" or "because" no longer
#      match once the text contains "dont" or "becaus";
#   3. strip whitespace last, to collapse the gaps left by removed words.
# tolower() is not a tm transformation, so it is wrapped in
# content_transformer(); the documents then stay text documents and no
# PlainTextDocument / Corpus(VectorSource(...)) re-wrapping is needed.
# NOTE: outputs recorded further down in this file ("##" lines) come from an
# earlier run and may differ slightly after these fixes.
my_corp <- tm_map(my_corp, content_transformer(tolower))
my_corp <- tm_map(my_corp, removeWords, stopwords("english"))

# Profanity filter. removeWords() expects the words themselves, not the path
# of the file that lists them, so the list is read from profane.txt first.
# NOTE(review): assumes profane.txt holds one plain word or phrase per line;
# entries containing regex metacharacters would need escaping - confirm.
profane_path <- file.path(getwd(), "profane.txt")
if (file.exists(profane_path)) {
  profane_words <- readLines(profane_path, encoding = "UTF-8", warn = FALSE)
  profane_words <- tolower(gsub("^\\s+|\\s+$", "", profane_words))
  profane_words <- unique(profane_words[nzchar(profane_words)])
  if (length(profane_words) > 0) {
    my_corp <- tm_map(my_corp, removeWords, profane_words)
  }
} else {
  # The filter is best effort: a missing list must not stop the analysis.
  warning("Profanity list not found, skipping profanity filter: ",
          profane_path, call. = FALSE)
}

my_corp <- tm_map(my_corp, removePunctuation)
my_corp <- tm_map(my_corp, removeNumbers)
my_corp <- tm_map(my_corp, stemDocument)
my_corp <- tm_map(my_corp, stripWhitespace)

# Cleaned corpus used by the steps that follow.
corp_clean <- my_corp
# Document-term matrix of the cleaned corpus: one row per document, one
# column per term.
dtm <- DocumentTermMatrix(corp_clean)

# Dense copy of the matrix so that colSums() can total the count of each term.
my_dtm <- as.matrix(dtm)

# Frequency table of the most frequent terms, ready for plotting.
# head() returns at most 60 entries, so no NA rows appear when the corpus has
# fewer than 60 distinct terms. The intermediate vector is deliberately not
# called `order`, which would mask base::order().
top_terms <- head(sort(colSums(my_dtm), decreasing = TRUE), 60)
df <- data.frame(
  word_names = names(top_terms),
  frequency = as.vector(top_terms),
  # Keep the words as character on every R version (factor before R 4.0).
  stringsAsFactors = FALSE
)

# First rows of the frequency table.
head(df)
## word_names frequency
## 1 said 588
## 2 will 554
## 3 one 497
## 4 just 481
## 5 can 411
## 6 like 398
# Sort the frequency table from the most to the least frequent word.
df_order <- df[order(-df$frequency), ]

# Horizontal bar chart of the word frequencies.
# - reorder() ranks the bars by frequency; ggplot2 would otherwise arrange the
#   words alphabetically, whatever the row order of the data frame.
# - coord_flip() draws the x aesthetic (the words) on the vertical axis, but
#   labs() still addresses the aesthetics: x labels the words and y labels
#   the frequencies (the two labels were swapped before).
g <- ggplot(
  df_order,
  aes(x = reorder(word_names, frequency), y = frequency)
) +
  geom_bar(stat = "identity", colour = "yellow", fill = "pink") +
  labs(x = "word names", y = "frequency of each word") +
  ggtitle("histogram of word frequencies") +
  coord_flip() +
  theme_bw() +
  geom_text(aes(label = frequency), size = 3)
print(g)
# Word cloud of the most frequent terms.
# Total count of every term in the document-term matrix, largest first.
v <- sort(colSums(my_dtm), decreasing = TRUE)
words <- names(v)
d <- data.frame(word = words, freq = v)

# Draw at most 150 words, most frequent first (random.order = FALSE), using
# five colours from the "Set1" palette.
wordcloud(
  d$word,
  d$freq,
  max.words = 150,
  colors = brewer.pal(5, "Set1"),
  random.order = FALSE
)