This is the Week Two milestone report for the Capstone project. It demonstrates that I have opened and read the data, done some exploratory analysis, and begun work on the word-prediction application.
First, I cleared the workspace in RStudio; some of the later steps tax system resources, so I wanted to start with a clean slate. I also seeded the random number generator from the system time and loaded the packages used throughout.
# Clear the workspace and seed the RNG from the current time
rm(list = ls())
set.seed(as.numeric(as.POSIXct(Sys.time())))
# Packages: data handling, tokenization, SQL-style queries,
# tidying, plotting, and a profanity list
library(data.table)
library(quanteda)
library(sqldf)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lexicon)
Here is a summary of the three source files; the code that produces these figures appears later in this report.
##              Lines     Words  Uniques  Freq. Min  Freq. Mean  Freq. Max
## twitter    2360148  30374147  1290289          0       23.54     837023
## blogs       899288  37334650  1103725          0       33.83    1659151
## news       1010242   2643986   197861          1       13.36     131810
The Twitter file contributes the most lines and the blogs file the most words; the news file has by far the fewest words and unique tokens.
# Start fresh and load the n-gram tables whose construction is shown
# at the end of this report
rm(list = ls())
ones   <- as.data.table(read.table("C:/C2/jhu2018/capstone/ones.dt"))
twos   <- as.data.table(read.table("C:/C2/jhu2018/capstone/twos.dt"))
threes <- as.data.table(read.table("C:/C2/jhu2018/capstone/threes.dt"))
# Frequency tables for 1-, 2-, and 3-grams. Grouping on the
# space-separated gram itself avoids collisions that plain
# concatenation could cause (e.g. 'a'||'bc' vs. 'ab'||'c').
onegram_frequency <- sqldf("select word1 as gram, count(*) as wc from ones group by gram order by wc desc")
twogram_frequency <- sqldf("select word1 || ' ' || word2 as gram, count(*) as wc from twos group by gram order by wc desc")
threegram_frequency <- sqldf("select word1 || ' ' || word2 || ' ' || word3 as gram, count(*) as wc from threes group by gram order by wc desc")
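As a cross-check, the same unigram counts can be computed directly with data.table's grouped .N. This is an equivalent sketch, assuming `ones` keeps the `word1` column written out below, not a replacement for the sqldf version above.
# Equivalent unigram counts in pure data.table
onegram_frequency_dt <- ones[, .(wc = .N), by = .(gram = word1)][order(-wc)]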
# Unigrams appearing more than 500 times
ones_for_plot <- subset(onegram_frequency, wc > 500)
p <- ggplot(ones_for_plot, aes(x = reorder(gram, -wc), y = wc)) +
  geom_bar(stat = "identity") +
  labs(x = "unigram", y = "count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
# Bigrams appearing more than 100 times
twos_for_plot <- subset(twogram_frequency, wc > 100)
p <- ggplot(twos_for_plot, aes(x = reorder(gram, -wc), y = wc)) +
  geom_bar(stat = "identity") +
  labs(x = "bigram", y = "count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
# Trigrams appearing more than 15 times
threes_for_plot <- subset(threegram_frequency, wc > 15)
p <- ggplot(threes_for_plot, aes(x = reorder(gram, -wc), y = wc)) +
  geom_bar(stat = "identity") +
  labs(x = "trigram", y = "count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
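The bar charts show the long-tailed frequency distribution typical of natural language. A follow-up question for sizing the final model, sketched here as a quick aside using the `onegram_frequency` table already in memory, is how many unique words are needed to cover a given share of all tokens.
# Words needed to cover 50% and 90% of all tokens
# (onegram_frequency is already sorted by descending count)
cumulative_share <- cumsum(onegram_frequency$wc) / sum(onegram_frequency$wc)
which(cumulative_share >= 0.5)[1]   # words needed for 50% coverage
which(cumulative_share >= 0.9)[1]   # words needed for 90% coverage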
f <- file("c:/c2/JHU2018/Capstone/en_US.twitter.txt", open="rb")
nlines <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
nlines <- nlines + sum(chunk == as.raw(10L))
}
twitter_linecount <- nlines
close(f)
f <- file("c:/c2/JHU2018/Capstone/en_US.blogs.txt", open="rb")
nlines <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
nlines <- nlines + sum(chunk == as.raw(10L))
}
blogs_linecount <- nlines
close(f)
f <- file("c:/c2/JHU2018/Capstone/en_US.news.txt", open="rb")
nlines <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
nlines <- nlines + sum(chunk == as.raw(10L))
}
news_linecount <- nlines
close(f)
twitter_linecount <- round(twitter_linecount,0)
blogs_linecount <- round(blogs_linecount,0)
news_linecount <- round(news_linecount,0)
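If the R.utils package happens to be installed, its countLines() function offers an independent check on these byte-level counts; this is an optional cross-check, not part of the original pipeline.
# Optional cross-check (requires the R.utils package)
R.utils::countLines("c:/c2/JHU2018/Capstone/en_US.twitter.txt")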
# Read each file as whitespace-delimited tokens ("words")
twitter_words <- scan("c:/c2/JHU2018/Capstone/en_US.twitter.txt", quote = NULL, what = "x", skipNul = TRUE)
blogs_words   <- scan("c:/c2/JHU2018/Capstone/en_US.blogs.txt", quote = NULL, what = "x", skipNul = TRUE)
news_words    <- scan("c:/c2/JHU2018/Capstone/en_US.news.txt", quote = NULL, what = "x", skipNul = TRUE)
twitter_wordcount <- length(twitter_words)
blogs_wordcount   <- length(blogs_words)
news_wordcount    <- length(news_words)
# Per-word frequency tables, one per source
twitter_words_dt <- as.data.table(twitter_words)
blogs_words_dt   <- as.data.table(blogs_words)
news_words_dt    <- as.data.table(news_words)
twitter_wordfrequency <- sqldf("select twitter_words, count(*) as wc from twitter_words_dt group by twitter_words order by wc desc")
blogs_wordfrequency   <- sqldf("select blogs_words, count(*) as wc from blogs_words_dt group by blogs_words order by wc desc")
news_wordfrequency    <- sqldf("select news_words, count(*) as wc from news_words_dt group by news_words order by wc desc")
# Unique words per source, and the min / mean / max of their frequencies
twitter_uniques <- nrow(twitter_wordfrequency)
blogs_uniques   <- nrow(blogs_wordfrequency)
news_uniques    <- nrow(news_wordfrequency)
twitter_uniques_min  <- min(twitter_wordfrequency$wc)
blogs_uniques_min    <- min(blogs_wordfrequency$wc)
news_uniques_min     <- min(news_wordfrequency$wc)
twitter_uniques_mean <- mean(twitter_wordfrequency$wc)
blogs_uniques_mean   <- mean(blogs_wordfrequency$wc)
news_uniques_mean    <- mean(news_wordfrequency$wc)
twitter_uniques_max  <- max(twitter_wordfrequency$wc)
blogs_uniques_max    <- max(blogs_wordfrequency$wc)
news_uniques_max     <- max(news_wordfrequency$wc)
# Assemble the summary table shown at the top of this report
twitter_row <- round(c(twitter_linecount, twitter_wordcount, twitter_uniques, twitter_uniques_min, twitter_uniques_mean, twitter_uniques_max), 2)
blogs_row   <- round(c(blogs_linecount, blogs_wordcount, blogs_uniques, blogs_uniques_min, blogs_uniques_mean, blogs_uniques_max), 2)
news_row    <- round(c(news_linecount, news_wordcount, news_uniques, news_uniques_min, news_uniques_mean, news_uniques_max), 2)
names_row   <- c('Lines', 'Words', 'Uniques', 'Freq. Min', 'Freq. Mean', 'Freq. Max')
rbind(names_row, twitter_row, blogs_row, news_row)
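If this report is rendered with knitr, the same summary can be displayed more cleanly with kable; a minimal sketch, assuming the knitr package is installed.
# Nicer rendering of the same summary (optional)
knitr::kable(rbind(twitter = twitter_row, blogs = blogs_row, news = news_row),
             col.names = names_row)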
# The target corpus is 600,000 lines; for this milestone I use 1% of
# that (6,000 lines), split evenly: 2,000 lines from each source.
corpus_size <- 600000
milestone_modifier <- 1/100
milestone_corpus_size <- corpus_size * milestone_modifier
milestone_twitter_sample_size <- round(milestone_corpus_size / 3, 0)
milestone_blogs_sample_size   <- round(milestone_corpus_size / 3, 0)
milestone_news_sample_size    <- round(milestone_corpus_size / 3, 0)
# Read the full files (binary mode sidesteps embedded-null issues),
# then draw a random sample of lines from each source and combine
# the samples into the milestone corpus.
handle_to_news <- file("c:/c2/JHU2018/Capstone/en_US.news.txt", open = "rb")
news <- readLines(handle_to_news, skipNul = TRUE)
handle_to_blogs <- file("c:/c2/JHU2018/Capstone/en_US.blogs.txt", open = "rb")
blogs <- readLines(handle_to_blogs, skipNul = TRUE)
handle_to_twitter <- file("c:/c2/JHU2018/Capstone/en_US.twitter.txt", open = "rb")
twitter <- readLines(handle_to_twitter, skipNul = TRUE)
close(handle_to_twitter)
close(handle_to_blogs)
close(handle_to_news)
sampletwitter <- sample(twitter, milestone_twitter_sample_size)
sampleblogs   <- sample(blogs, milestone_blogs_sample_size)
samplenews    <- sample(news, milestone_news_sample_size)
corpus <- c(samplenews, sampleblogs, sampletwitter)
writeLines(corpus, "c:/c2/jhu2018/capstone/corpus.txt")
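A quick sanity check, offered as a suggestion rather than part of the original run: the file just written should contain exactly milestone_corpus_size (6,000) lines.
# Sanity check: the sample on disk has the expected number of lines
stopifnot(length(readLines("c:/c2/jhu2018/capstone/corpus.txt")) == milestone_corpus_size)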
# Tokens within an n-gram are joined with a delimiter unlikely to
# occur in the text; the escaped form is the regular expression used
# to split them apart again below.
separatorstring_unescaped <- "{OBSCUREDELIMITER}"
separatorstring_escaped <- "\\{OBSCUREDELIMITER\\}"
# Tokenize the corpus into 1-, 2-, and 3-grams, stripping numbers,
# punctuation, symbols, separators, Twitter artifacts, hyphens and
# URLs (quanteda 1.x tokens() arguments).
ones <- tokens(corpus, what = "word", remove_numbers = TRUE, remove_punct = TRUE,
               remove_symbols = TRUE, remove_separators = TRUE, remove_twitter = TRUE,
               remove_hyphens = TRUE, remove_url = TRUE,
               ngrams = 1L, concatenator = separatorstring_unescaped)
ones <- as.character(ones)
ones <- as.data.table(ones)
twos <- tokens(corpus, what = "word", remove_numbers = TRUE, remove_punct = TRUE,
               remove_symbols = TRUE, remove_separators = TRUE, remove_twitter = TRUE,
               remove_hyphens = TRUE, remove_url = TRUE,
               ngrams = 2L, concatenator = separatorstring_unescaped)
twos <- as.character(twos)
twos <- as.data.table(twos)
threes <- tokens(corpus, what = "word", remove_numbers = TRUE, remove_punct = TRUE,
                 remove_symbols = TRUE, remove_separators = TRUE, remove_twitter = TRUE,
                 remove_hyphens = TRUE, remove_url = TRUE,
                 ngrams = 3L, concatenator = separatorstring_unescaped)
threes <- as.character(threes)
threes <- as.data.table(threes)
# Split each n-gram on the delimiter into one column per word
ones_dt   <- as.data.table(ones) %>% separate(ones, c('word1'), sep = separatorstring_escaped)
twos_dt   <- as.data.table(twos) %>% separate(twos, c('word1', 'word2'), sep = separatorstring_escaped)
threes_dt <- as.data.table(threes) %>% separate(threes, c('word1', 'word2', 'word3'), sep = separatorstring_escaped)
# Drop any n-gram containing a word on the lexicon package's
# profanity_zac_anger list
pza <- as.data.table(profanity_zac_anger)
ones_dt   <- sqldf("select word1 from ones_dt where word1 not in (select * from pza)")
twos_dt   <- sqldf("select word1, word2 from twos_dt where word1 not in (select * from pza) and word2 not in (select * from pza)")
threes_dt <- sqldf("select word1, word2, word3 from threes_dt where word1 not in (select * from pza) and word2 not in (select * from pza) and word3 not in (select * from pza)")
# Persist the cleaned n-gram tables read back in the exploratory
# section above
write.table(ones_dt, "C:/c2/jhu2018/capstone/ones.dt")
write.table(twos_dt, "C:/c2/jhu2018/capstone/twos.dt")
write.table(threes_dt, "C:/c2/jhu2018/capstone/threes.dt")
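These tables are the foundation for the prediction application. As a preview of where this is headed, here is a minimal sketch of a frequency-based backoff lookup. It is a sketch only: it assumes the word-column tables above have been converted back to data.tables (sqldf returns data.frames) and that onegram_frequency from the exploratory section is in memory.
# Sketch of the planned next-word lookup with simple backoff:
# try the trigram table first, fall back to bigrams, then to the
# single most frequent unigram overall.
setDT(twos_dt); setDT(threes_dt)   # sqldf returned data.frames
predict_next <- function(w1, w2) {
  hit <- threes_dt[word1 == w1 & word2 == w2, .N, by = word3][order(-N)]
  if (nrow(hit) > 0) return(hit$word3[1])
  hit <- twos_dt[word1 == w2, .N, by = word2][order(-N)]
  if (nrow(hit) > 0) return(hit$word2[1])
  onegram_frequency$gram[1]
}
predict_next("thanks", "for")   # most frequent third word after "thanks for"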