## Project root.
## NOTE(review): this is an absolute, machine-specific path, so the script only
## runs where this directory exists -- consider starting R from the project
## directory instead.
setwd("C:/Users/Zachary/Data Science/Capstone")
## Attach dependencies with library() so that a missing package stops the
## script right here with a clear error; require() would only warn, return
## FALSE and let the script fail later at the first use of the package.
library(tm)
library(SnowballC)
library(data.table)
library(ggplot2)
##Read the three US files into R.
## read_corpus(path): returns every line of the text file at `path` as a
## character vector.
##  - The connection is opened in binary mode ("rb"): in text mode on Windows
##    a Ctrl-Z (0x1A) byte inside the data can be taken as end-of-file, which
##    silently truncates the read.
##  - skipNul = TRUE drops embedded NUL bytes instead of cutting the line off
##    at the first one (with a warning).
##  - on.exit() closes the connection even when readLines() fails.
read_corpus <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, skipNul = TRUE)
}
main_path <- file.path('.','final/en_US')
## List the directory once. The files are picked by position in the listing
## (blogs, news, twitter), exactly as before.
## NOTE(review): this relies on the directory holding exactly these three
## files in this order -- confirm no extra files are stored there.
corpus_files <- file.path(main_path, dir(main_path))
stopifnot(length(corpus_files) >= 3)
en_US_blogs <- read_corpus(corpus_files[1])
en_US_news <- read_corpus(corpus_files[2])
en_US_twitter <- read_corpus(corpus_files[3])
##Make a data frame for lines count
## One row per source: `lineslength` holds the number of lines read from that
## source and `names` holds the source label.
lineslength <- data.frame(
  lineslength = vapply(
    list(en_US_blogs, en_US_news, en_US_twitter),
    length,
    integer(1)
  ),
  names = c("blogs", "news", "twitter"),
  stringsAsFactors = FALSE  # keep `names` a character column
)
## count_words(lines, col_name): returns a one-column data frame named
## `col_name` with, for every element of `lines`, the number of pieces obtained
## by splitting it on a single space.
## strsplit() and lengths() are vectorised, so the whole corpus is processed in
## one pass instead of building a list with one element per line and then
## rbind-ing all of those elements together.
count_words <- function(lines, col_name) {
  counts <- data.frame(lengths(strsplit(lines, " ")))
  names(counts) <- col_name
  counts
}
##Count words in blogs
blogs_words <- count_words(en_US_blogs, "Blog Word Count")
##Count words in news
news_words <- count_words(en_US_news, "News Word Count")
##Count words in twitter
twitter_words <- count_words(en_US_twitter, "Twitter Word Count")
#Combine words tables
# One row per source: `words` is the sum of the per-line word counts of that
# source and `names` is the source label.
words <- data.frame(
  words = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
  names = c("blogs", "news", "twitter"),
  stringsAsFactors = FALSE  # keep `names` a character column
)
## Draw the line-count and word-count bar charts side by side on one page.
## par(mfcol = ...) only affects base graphics and is ignored by ggplot2, which
## draws with grid, so the 1 x 2 layout is built from grid viewports instead.
## The plots are print()-ed explicitly so they are also drawn when the script
## is run through source(), where top-level values are not auto-printed.
lines_plot <- ggplot(lineslength, aes(x = names, y = lineslength)) +
  geom_bar(stat = 'identity', fill = 'blue', color = 'grey60') +
  xlab('Source') + ylab('Total Lines') + coord_flip() +
  ggtitle('Total Line Count by Source')
#Construct bar graph of word count for each type
words_plot <- ggplot(words, aes(x = names, y = words)) +
  geom_bar(stat = 'identity', fill = 'green', color = 'grey60') +
  xlab('Source') + ylab('Total Words') + coord_flip() +
  ggtitle('Total Word Count by Source')
grid::grid.newpage()
grid::pushViewport(
  grid::viewport(layout = grid::grid.layout(nrow = 1, ncol = 2))
)
print(lines_plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1))
print(words_plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 2))
grid::popViewport()
## Restore the saved n-gram objects into the workspace.
## NOTE(review): presumably each file defines the object of the same name
## (onegram, twogram, threegram) -- confirm against the script that saved them.
for (rdata_file in c("onegram.RData", "twogram.RData", "threegram.RData")) {
  load(rdata_file)
}
## Three base-graphics panels in a single row.
par(mfcol = c(1,3))
## Draws one histogram of log10(table(<second column of `ngram`>)) using the
## given axis labels, bar colour and title.
plot_ngram_hist <- function(ngram, x_label, y_label, bar_col, title) {
  log_counts <- log10(table(ngram[,2]))
  invisible(
    hist(log_counts, xlab = x_label, ylab = y_label, col = bar_col,
         main = title)
  )
}
plot_ngram_hist(onegram, "", "Number of words", "blue", "One-gram")
plot_ngram_hist(twogram, "Frequency (log10)", "", "orange", "Two-gram")
plot_ngram_hist(threegram, "", "", "purple", "Three-gram")