The Capstone project builds a predictive natural language text model from three provided datasets: (1) Twitter, (2) Blogs, and (3) News.
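The analysis below relies on several packages: stringi for text statistics, tm for corpus handling, RWeka for n-gram tokenization, ggplot2 for plotting, and magrittr for the %>% pipe. A minimal setup chunk, assuming these packages are already installed:

library(stringi)   # stri_stats_general()
library(tm)        # VCorpus(), tm_map(), DocumentTermMatrix()
library(RWeka)     # NGramTokenizer(), Weka_control()
library(ggplot2)   # bar plots of term frequencies
library(magrittr)  # %>% pipe used with subset()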
Number of lines of the Twitter dataset:
fileName <- "Coursera-SwiftKey.zip"
urlName <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(urlName,fileName)
unzip(fileName)
con1 <- file("final/en_US/en_US.twitter.txt")
twitter_data <- readLines(con1)
## Warning in readLines(con1): line 167155 appears to contain an embedded nul
## Warning in readLines(con1): line 268547 appears to contain an embedded nul
## Warning in readLines(con1): line 1274086 appears to contain an embedded nul
## Warning in readLines(con1): line 1759032 appears to contain an embedded nul
close(con1)
stri_stats_general(twitter_data)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096031 134082634
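The embedded-nul warnings above do not affect these summary counts; if desired, they could be avoided by reading the file with skipNul = TRUE (an optional tweak, not applied here):

twitter_data <- readLines("final/en_US/en_US.twitter.txt", skipNul = TRUE)  # drop embedded nuls silently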
Number of lines of the Blogs dataset:
con2 <- file("final/en_US/en_US.blogs.txt")
blog_data <- readLines(con2)
close(con2)
stri_stats_general(blog_data)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
Number of lines of the News dataset:
con3 <- file("final/en_US/en_US.news.txt")
news_data <- readLines(con3)
close(con3)
stri_stats_general(news_data)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866
if (!dir.exists(file.path(".", "samples"))) {
  dir.create(file.path(".", "samples"))
  twitter_sample <- twitter_data[1:35000]
  sam1 <- file("./samples/twitter.sample")
  writeLines(twitter_sample, sam1)
  close(sam1)
  blogs_sample <- blog_data[1:110000]
  sam2 <- file("./samples/blogs.sample")
  writeLines(blogs_sample, sam2)  # write the blogs sample (not the twitter sample)
  close(sam2)
  news_sample <- news_data[1:120000]
  sam3 <- file("./samples/news.sample")
  writeLines(news_sample, sam3)
  close(sam3)
}
# top profanity words in the US
profanity_list <- c("shit", "fuck", "damn", "bitch", "crap", "piss", "dick", "darn", "cock", "pussy", "asshole", "fag", "bastard", "slut", "douche")
target_dir <- file.path(getwd(), "samples")
corpus <- VCorpus(DirSource(target_dir))
corpus1 <- tm_map(corpus, stripWhitespace)
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
corpus1 <- tm_map(corpus1, removePunctuation)
corpus1 <- tm_map(corpus1, removeNumbers)
corpus1 <- tm_map(corpus1, removeWords, profanity_list)
dtm1 <- DocumentTermMatrix(corpus1)
freq1 <- sort(colSums(as.matrix(dtm1)), decreasing=TRUE)
df1 <- data.frame(word=names(freq1), freq=freq1)
subset(df1, freq>1000) %>%
ggplot(aes(word, freq)) +
geom_bar(stat="identity", fill="maroon") +
theme(axis.text.x=element_text(angle=45, hjust=1))
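As a quick sanity check (output not reproduced here), the most frequent unigrams can also be inspected directly from the frequency data frame:

head(df1, 10)  # ten most frequent single words in the sampled corpus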
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm2 <- DocumentTermMatrix(corpus1, control = list(tokenize = BigramTokenizer))
freq2 <- sort(colSums(as.matrix(dtm2)), decreasing=TRUE)
df2 <- data.frame(word=names(freq2), freq=freq2)
subset(df2, freq>100) %>%
ggplot(aes(word, freq)) +
geom_bar(stat="identity", fill="purple") +
theme(axis.text.x=element_text(angle=45, hjust=1))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtm3 <- DocumentTermMatrix(corpus1, control = list(tokenize = TrigramTokenizer))
freq3 <- sort(colSums(as.matrix(dtm3)), decreasing=TRUE)
df3 <- data.frame(word=names(freq3), freq=freq3)
subset(df3, freq>10) %>%
ggplot(aes(word, freq)) +
geom_bar(stat="identity", fill="blue") +
theme(axis.text.x=element_text(angle=45, hjust=1))
By analyzing the resulting n-grams and cataloguing the popular patterns in the combined Twitter, Blogs, and News samples, we can build a model that predicts text: when a specific pattern is observed, the model suggests the continuation that appeared with the highest frequency (i.e., estimated probability) in the sample datasets.
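As a rough illustration of this idea (not the final model), the bigram frequencies in df2 can already serve as a naive next-word lookup. The helper below is a hypothetical sketch: it splits each bigram into its two words, keeps the rows whose first word matches the observed word, and returns the most frequent continuation.

# Naive next-word prediction from the bigram frequency table df2 (hypothetical sketch)
predict_next <- function(prev_word, bigram_df) {
  parts  <- strsplit(as.character(bigram_df$word), " ")
  first  <- sapply(parts, `[`, 1)   # first word of each bigram
  second <- sapply(parts, `[`, 2)   # second word of each bigram
  matches <- bigram_df$freq[first == prev_word]
  if (length(matches) == 0) return(NA_character_)
  second[first == prev_word][which.max(matches)]
}

predict_next("happy", df2)  # e.g. the word that most often follows "happy" in the samples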