Capstone report

Before running, choose File > Save with Encoding and select UTF-8.

1 load in data set

Set the working directory to the folder that contains the English blogs, news and tweets data.

# Point the session at the English corpus folder (<current dir>/final/en_US).
# The working directory is switched because the file reads further down use
# paths relative to this folder.
# Deliberately no rm(list = ls()) here: it would silently delete everything in
# the caller's workspace, and every object used below is created by this
# script anyway.
data_path <- file.path(getwd(), "final", "en_US")
if (!isTRUE(file.info(data_path)$isdir)) {
  # Fail early with a readable message instead of setwd()'s generic error.
  stop("Corpus folder not found: ", data_path,
       " - start the script from the directory that contains 'final/'.",
       call. = FALSE)
}
setwd(data_path)
# packages used in this project:
library(knitr) # for knit html
library(ggplot2) # for ggplot2 plotting
library(NLP) # for natural language processing

## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(stringr) # package for handling string in R
library(qdap) # count word

## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 3.1 from http://cran.r-project.org/bin/windows/Rtools/ and then run find_rtools().
## 
## Attaching package: 'qdap'
## 
## The following object is masked from 'package:base':
## 
##     Filter

library(stringi) # use to count lines fast 
library(pryr) # to see file size with command object_size
library(wordcloud) # for visualization 
# Read every line of the three English corpus files into character vectors.
# Each file is opened in binary mode ("rb"), read as UTF-8 and closed again.

# Twitter data (readLines warnings are silenced for this file only)
con_twitts <- file("en_US.twitter.txt", open = "rb")
twitts <- readLines(con_twitts, encoding = "UTF-8", warn = FALSE)
close(con_twitts)

# News data
con_news <- file("en_US.news.txt", open = "rb")
news <- readLines(con_news, encoding = "UTF-8")
close(con_news)

# Blog data
con_blogs <- file("en_US.blogs.txt", open = "rb")
blogs <- readLines(con_blogs, encoding = "UTF-8")
close(con_blogs)

2 basic summary of data

# Number of lines in the Twitter data.
# stri_stats_general() returns a named vector of counts; only its "Lines"
# entry is shown here.
stri_stats_general(twitts)["Lines"]

##   Lines 
## 2360148

# Number of lines in the news data ("Lines" entry of the stringi statistics).
stri_stats_general(news)["Lines"]

##   Lines 
## 1010242

# Number of lines in the blog data ("Lines" entry of the stringi statistics).
stri_stats_general(blogs)["Lines"]

##  Lines 
## 899288

#object_size(twitts)
#object_size(news)
#object_size(blogs)

# Approximate word count for the Twitter data: for every line, count the
# matches of "\\W+" (runs of non-word characters) and add the counts up.
sum(vapply(gregexpr("\\W+", twitts), length, integer(1)))

## [1] 30433240

# Approximate word count for the news data (matches of "\\W+" per line, summed).
sum(vapply(gregexpr("\\W+", news), length, integer(1)))

## [1] 35710862

# Approximate word count for the blog data (matches of "\\W+" per line, summed).
sum(vapply(gregexpr("\\W+", blogs), length, integer(1)))

## [1] 38222278

3 Preprocessing data:

# Random sampling.
# The full data set is not needed for exploration, so 2000 lines are drawn at
# random (without replacement) from each source. The fixed seed makes the draw
# reproducible.
set.seed(1233)
ran_twitts <- sample(twitts, size = 2000)
ran_news <- sample(news, size = 2000)
ran_blogs <- sample(blogs, size = 2000)
# paste() joins the three samples element by element, so entry i of `all` is
# tweet i, news line i and blog line i separated by single spaces.
all <- paste(ran_twitts, ran_news, ran_blogs, sep = " ")
# Line and character statistics of the combined sample (expect 2000 lines).
stri_stats_general(all)

##       Lines LinesNEmpty       Chars CharsNWhite 
##        2000        2000     1011841      835481

#sum(sapply(gregexpr("\\W+", all), length))
library(tm)

## 
## Attaching package: 'tm'
## 
## The following object is masked from 'package:pryr':
## 
##     inspect
## 
## The following objects are masked from 'package:qdap':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix

# Build a corpus from the sampled text, clean it and derive the
# document-term matrix `dtm`.
#
# Points that are easy to get wrong here:
#  * the corpus language is English ("en");
#  * tolower() is wrapped in content_transformer() so the documents stay
#    text documents and DocumentTermMatrix() can consume the corpus directly;
#  * stop words and profanity are removed BEFORE punctuation stripping and
#    stemming - afterwards forms such as "don't" or stemmed stop words would
#    no longer match the word lists;
#  * removeWords() needs the words themselves, so the profanity list is read
#    from profane.txt rather than passing the file path.
my_corp <- Corpus(VectorSource(all), readerControl = list(language = "en"))

# Normalise case first so the (lower-case) word lists match.
my_corp <- tm_map(my_corp, content_transformer(tolower))
my_corp <- tm_map(my_corp, removeWords, stopwords("english"))

# Profanity filter: one word per line in profane.txt in the working directory.
profane_path <- file.path(getwd(), "profane.txt")
if (file.exists(profane_path)) {
  profane_words <- readLines(profane_path, encoding = "UTF-8", warn = FALSE)
  profane_words <- tolower(gsub("^\\s+|\\s+$", "", profane_words))
  # Drop blank lines: an empty alternative would match everywhere.
  profane_words <- profane_words[nzchar(profane_words)]
} else {
  warning("Profanity list not found, skipping filter: ", profane_path,
          call. = FALSE)
  profane_words <- character(0)
}
if (length(profane_words) > 0) {
  my_corp <- tm_map(my_corp, removeWords, profane_words)
}

my_corp <- tm_map(my_corp, removePunctuation)
my_corp <- tm_map(my_corp, removeNumbers)
my_corp <- tm_map(my_corp, stemDocument)
# Collapse the gaps left behind by the removals above.
my_corp <- tm_map(my_corp, stripWhitespace)

corp_clean <- my_corp
# Generate the document-term matrix (documents in rows, terms in columns).
dtm <- DocumentTermMatrix(corp_clean)

4 visualization of data:

# Build a frequency table of the most frequent terms, ready for plotting.
my_dtm <- as.matrix(dtm)
# Total count per term, largest first. head() is used instead of [1:60] so a
# vocabulary with fewer than 60 terms does not produce NA rows. The vector is
# named top_freq (not `order`) to avoid shadowing base::order().
top_freq <- head(sort(colSums(my_dtm), decreasing = TRUE), 60)
# One row per term; default row names are 1..n. stringsAsFactors is set
# explicitly so the column type does not depend on the R version.
df <- data.frame(
  word_names = names(top_freq),
  frequency = unname(top_freq),
  stringsAsFactors = FALSE
)
head(df)

##   word_names frequency
## 1       said       588
## 2       will       554
## 3        one       497
## 4       just       481
## 5        can       411
## 6       like       398

# Sort the table by decreasing frequency and keep it as df_order.
df_order <- df[order(-df$frequency), ]
# Horizontal bar chart of the word frequencies.
#  * ggplot orders a discrete axis by factor level / alphabetically, not by
#    row order, so the bars are ordered explicitly with reorder().
#  * labs() binds labels to aesthetics, not screen positions: x is the word
#    and y is the frequency, even though coord_flip() draws them swapped.
#  * The chart shows one bar per word, i.e. a bar chart, not a histogram.
g <- ggplot(df_order, aes(x = reorder(word_names, frequency), y = frequency)) +
  geom_bar(stat = "identity", colour = "yellow", fill = "pink") +
  labs(x = "word names", y = "frequency of each word") +
  ggtitle("bar chart of word frequencies") +
  coord_flip() +
  theme_bw() +
  geom_text(aes(label = frequency), size = 3)

print(g)

# Word cloud of the terms in the document-term matrix (at most 150 words,
# placed in decreasing order of frequency rather than at random).
v <- sort(colSums(my_dtm), decreasing = TRUE)
words <- names(v)
d <- data.frame(word = words, freq = v)
wordcloud(
  words = d$word,
  freq = d$freq,
  max.words = 150,
  random.order = FALSE,
  colors = brewer.pal(5, "Set1")
)

Capstone report

Shuang

Wednesday, March 18, 2015

1 load in data set

2 basic summary of data

3 Preprocessing data:

4 visualization of data: