Summary

This document analyzes three English data sets: US blogs, US Twitter, and US news. The analysis covers word counts, the most frequently used words, and word clouds for the blog, Twitter, and news text respectively. After the basic line-length analysis, I extracted the first 8000 lines of the US blog file to estimate word frequencies and build the word cloud figure.

Download and Read Data

Download data

# url of data
daturl<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# download file
download.file(daturl, "txtdata.zip")

# unzip file
unzip("txtdata.zip", exdir = ".")

Read data

ustwitter<-readLines("./final/en_US/en_US.twitter.txt")
usblog<-readLines("./final/en_US/en_US.blogs.txt")
usnews<-readLines("./final/en_US/en_US.news.txt")
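
readLines() can warn about embedded nul characters or an incomplete final line on these raw files. If that happens, a hedged variant of the same calls passes an explicit encoding and skipNul, for example:

# optional variant: read with UTF-8 encoding and drop embedded nul characters,
# which avoids the warnings the raw files can trigger
usnews<-readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)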

Exploratory Data Analysis

Summary of line information for the three data files

## explore the length of each line in the ustwitter data
n<-length(ustwitter)
line_n_t<-rep(0, n)
for(i in 1:n){
  m<-nchar(ustwitter[i])
  line_n_t[i]<-m
}

## explore the length of each line in the usblog data
n<-length(usblog)
line_n_b<-rep(0, n)
for(i in 1:n){
  m<-nchar(usblog[i])
  line_n_b[i]<-m
}

## explore the length of each line in the usnews data
n<-length(usnews)
line_n_n<-rep(0, n)
for(i in 1:n){
  m<-nchar(usnews[i])
  line_n_n[i]<-m
}
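
Because nchar() is already vectorized over a character vector, the three loops above can also be written as one call per file; this equivalent form produces the same line_n_t, line_n_b, and line_n_n vectors.

## vectorized equivalent of the three loops above
line_n_t<-nchar(ustwitter)
line_n_b<-nchar(usblog)
line_n_n<-nchar(usnews)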

## summary is printed in table line_sum
line_sum<-data.frame(rbind(summary(line_n_t)[c(1,3,4,6)],
                           summary(line_n_b)[c(1,3,4,6)],
                           summary(line_n_n)[c(1,3,4,6)]))
row.names(line_sum) = c("twitter", "blog", "news")
line_sum$line_count=c(length(ustwitter), length(usblog), length(usnews))

line_sum
##         Min. Median  Mean  Max. line_count
## twitter    2     64  68.8   213     2360148
## blog       1    157 231.7 40840      899288
## news       2    186 203.0  5760       77259

Of the three files, the Twitter file has by far the most lines, but each line is short (median of 64 characters). The blog file has the most characters per line and therefore carries the largest amount of information, measured by character length. The news file has the fewest lines of the three.

Clean data (using the top 8000 lines of the blog file)

library(tm)

# read as Corpus and briefly inspect data
usb_c<-VCorpus(VectorSource(usblog[1:8000]))
#inspect(usb_c)

# remove punctuation 
usb_c<-tm_map(usb_c,removePunctuation)
usb_c<-tm_map(usb_c, PlainTextDocument)

# remove numbers
usb_c<-tm_map(usb_c, removeNumbers)
usb_c<-tm_map(usb_c, PlainTextDocument)

# convert to lowercase; tolower() returns a plain character vector,
# so the documents are re-wrapped as PlainTextDocument afterwards
usb_c<-tm_map(usb_c, tolower)
usb_c<-tm_map(usb_c, PlainTextDocument)

# remove stopwords in English
usb_c<-tm_map(usb_c, removeWords, stopwords("english"))
usb_c<-tm_map(usb_c, PlainTextDocument)

# stemming: StemDocument raised an error in tm version 0.7-6, so it is commented out here
#usb_st<-tm_map(usb_c, StemDocument)

#usb_st<-tm_map(usb_st, PlainTextDocument)

# complete the stemmed words back to full words using a dictionary
#usb_stc<-tm_map(usb_st, stemCompletion, dictionary = DocsCopy, lazy=TRUE)
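
Since StemDocument raised an error in this version of tm, stemming is left out of the cleaning pipeline above. One hedged alternative (a sketch, assuming the SnowballC package is installed) is to request stemming when the document-term matrix is built instead of at the corpus stage; it is shown commented out so the counts reported below stay unstemmed.

# sketch only, not run: stemming = TRUE asks DocumentTermMatrix to stem each
# token via SnowballC::wordStem (assumes the SnowballC package is installed)
#blog_stemmed<-DocumentTermMatrix(usb_c, control = list(stemming = TRUE))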

Explore data in the blog file

usb_c<-tm_map(usb_c, stripWhitespace)
usb_c<-tm_map(usb_c, PlainTextDocument)

# read the cleaned corpus usb_c into a document-term matrix
blog<-DocumentTermMatrix(usb_c)

# get the frequency of each word
freq <- colSums(as.matrix(blog))
#head(freq)
#tail(freq)

sum(freq)
## [1] 169245
# order words by decreasing frequency (reusing the freq vector computed above)
freq<- sort(freq, decreasing=TRUE)
word_f<- data.frame(word=names(freq), freq=freq) 
head(word_f, n=15) # top 15 most frequent words
##          word freq
## one       one 1084
## will     will 1006
## just     just  929
## can       can  912
## like     like  880
## time     time  809
## get       get  629
## know     know  574
## now       now  541
## new       new  519
## also     also  480
## people people  474
## make     make  465
## even     even  464
## good     good  460

In these 8000 lines, the total word count after cleaning is 169,245.
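
tm also provides findFreqTerms() as a quick cross-check of the table above. The call below (a sketch on the same document-term matrix, left commented out because its printed output is not reproduced here) would list every term occurring at least 300 times; the figure in the next section uses the slightly stricter cutoff of more than 300.

# cross-check of the frequency table: all terms with count >= 300
#findFreqTerms(blog, lowfreq = 300)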

Word frequency figure

library(ggplot2)
# keep the words that appear more than 300 times
wf300<-subset(word_f, freq>300)

ggplot(wf300, aes(x = reorder(word, -freq), y = freq)) + 
  geom_bar(stat = "identity") +
  theme(axis.text.x=element_text(angle=90, hjust=1))

The most frequent words include “one”, “will”, “just”, “can”, “like”, and “time”; all of these are very common English words.
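
The summary also mentions a word cloud for the blog sample. A minimal sketch of that figure, assuming the wordcloud and RColorBrewer packages are installed, reuses the word_f frequency table; the min.freq and max.words cutoffs are illustrative choices.

library(wordcloud)
library(RColorBrewer)

# word cloud of the blog sample: draw at most 100 words that occur at least
# 50 times, with more frequent words placed larger and nearer the centre
set.seed(1234)
wordcloud(words = word_f$word, freq = word_f$freq, min.freq = 50,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))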