This document analyzes three data sets: US blogs, US Twitter, and US news. The analysis covers word counts, the most frequently used words, and word clouds for the text of each file. After a basic analysis of line lengths, I extracted the first 8000 lines of the US blogs file to estimate word frequencies and build a word cloud figure.
# url of data
daturl<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# download file
download.file(daturl, "txtdata.zip")
# unzip file
unzip("txtdata.zip", exdir = ".")
# read the three US English text files
ustwitter<-readLines("./final/en_US/en_US.twitter.txt")
usblog<-readLines("./final/en_US/en_US.blogs.txt")
usnews<-readLines("./final/en_US/en_US.news.txt")
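# Note (assumption): readLines can stop early if a file contains embedded nul or
# control characters, which makes a file look truncated on some platforms. A common
# workaround is to read through a binary connection with skipNul = TRUE, e.g.:
# con <- file("./final/en_US/en_US.news.txt", open = "rb")
# usnews <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
# close(con)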
## explore the length of each line in each data set
## (nchar is vectorized, so no explicit loop is needed)
line_n_t<-nchar(ustwitter)  # twitter
line_n_b<-nchar(usblog)     # blog
line_n_n<-nchar(usnews)     # news
## summarise line-length statistics and line counts in the table line_sum
line_sum<-data.frame(rbind(summary(line_n_t)[c(1,3,4,6)],
summary(line_n_b)[c(1,3,4,6)],
summary(line_n_n)[c(1,3,4,6)]))
row.names(line_sum) = c("twitter", "blog", "news")
line_sum$line_count=c(length(ustwitter), length(usblog), length(usnews))
line_sum
##         Min. Median  Mean  Max. line_count
## twitter    2     64  68.8   213    2360148
## blog       1    157 231.7 40840     899288
## news       2    186 203.0  5760      77259
Among the three files, the Twitter file has by far the most lines, but each line is very short. The blog file has the longest lines (the most characters per line) and carries the largest amount of information, measured by total character count. The news file has the fewest lines.
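As a quick check of the information-content claim, the total number of characters per file can be computed from the line-length vectors above (a minimal sketch; output not reproduced here):
# total characters in each file
data.frame(file = c("twitter", "blog", "news"),
           total_chars = c(sum(line_n_t), sum(line_n_b), sum(line_n_n)))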
library(tm)
# read the first 8000 blog lines as a corpus and briefly inspect the data
usb_c<-VCorpus(VectorSource(usblog[1:8000]))
#inspect(usb_c)
# remove punctuation
usb_c<-tm_map(usb_c, removePunctuation)
# remove numbers
usb_c<-tm_map(usb_c, removeNumbers)
# convert to lowercase; content_transformer() keeps the corpus structure intact,
# so the repeated PlainTextDocument conversions are no longer needed
usb_c<-tm_map(usb_c, content_transformer(tolower))
# remove English stopwords
usb_c<-tm_map(usb_c, removeWords, stopwords("english"))
# stemming is skipped here; it errored under tm 0.7-6
# (tm's stemming function is stemDocument, which requires the SnowballC package)
#usb_st<-tm_map(usb_c, stemDocument)
# stem completion would also need a copy of the un-stemmed corpus as its dictionary
#usb_stc<-tm_map(usb_st, stemCompletion, dictionary = DocsCopy, lazy=TRUE)
# strip extra whitespace
usb_c<-tm_map(usb_c, stripWhitespace)
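# The same cleaning steps would apply to samples of the twitter and news files.
# A small helper (an illustrative sketch, not used for the results below; the
# names clean_corpus and usn_c are assumptions) could wrap them:
clean_corpus <- function(x) {
  x <- tm_map(x, removePunctuation)
  x <- tm_map(x, removeNumbers)
  x <- tm_map(x, content_transformer(tolower))
  x <- tm_map(x, removeWords, stopwords("english"))
  tm_map(x, stripWhitespace)
}
# e.g. usn_c <- clean_corpus(VCorpus(VectorSource(usnews[1:8000])))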
# read the cleaned corpus usb_c into a document-term matrix
blog<-DocumentTermMatrix(usb_c)
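# (optional) dim(blog) gives the matrix size: one row per blog line in the sample,
# one column per distinct term (values not shown here)
#dim(blog)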
# compute word frequencies as column sums of the document-term matrix
freq <- colSums(as.matrix(blog))
#head(freq)
#tail(freq)
sum(freq)
## [1] 169245
# sort words by decreasing frequency
freq<- sort(freq, decreasing=TRUE)
word_f<- data.frame(word=names(freq), freq=freq)
head(word_f, n=15)# top 15 most frequent words
##          word freq
## one       one 1084
## will     will 1006
## just     just  929
## can       can  912
## like     like  880
## time     time  809
## get       get  629
## know     know  574
## now       now  541
## new       new  519
## also     also  480
## people people  474
## make     make  465
## even     even  464
## good     good  460
Across these 8000 lines, the total word count after cleaning is 169245.
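The total above counts every occurrence of every term; the number of distinct terms in the sample (the vocabulary size) is simply the length of the frequency vector, equivalently the number of columns of the document-term matrix (value not reproduced here):
length(freq)   # number of distinct terms; same as ncol(blog)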
library(ggplot2)
# plot the words that appear more than 300 times
wf300<-subset(word_f, freq>300)
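# tm's findFreqTerms(blog, lowfreq = 300) would list the terms with frequency of at least 300 directly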
ggplot(wf300, aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") +
theme(axis.text.x=element_text(angle=90, hjust=1))
The most frequent words include “one”, “will”, “just”, “can”, “like”, and “time”. These words are very common in English.
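The introduction also mentions word clouds. A minimal sketch of how one could be drawn from the same frequency table, assuming the wordcloud and RColorBrewer packages are installed (the frequency cutoff and word limit are arbitrary choices):
library(wordcloud)
library(RColorBrewer)
set.seed(1234)  # make the layout reproducible
wordcloud(words = word_f$word, freq = word_f$freq,
          min.freq = 100, max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))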