Loading in the Data
library(NLP)
library(tm)
library(ngram)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(rJava)
library(RWeka)
setwd("C:/Users/Chris/Documents/Data Science Capstone")
con <- file("en_US.twitter.txt", open = "rb"); twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE); close(con)
con <- file("en_US.news.txt", open = "rb"); news <- readLines(con, encoding = "UTF-8", skipNul = TRUE); close(con)
con <- file("en_US.blogs.txt", open = "rb"); blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE); close(con)
For each file, I looked at the number of lines, the total word count, the length of the longest line, and a basic summary.
Twitter Data
length(twitter)
## [1] 2360148
wordcount(twitter,sep=" ",count.function=sum)
## [1] 30373583
max(nchar(twitter))
## [1] 140
summary(twitter)
## Length Class Mode
## 2360148 character character
Blogs Data
length(blogs)
## [1] 899288
wordcount(blogs,sep=" ",count.function=sum)
## [1] 37334131
max(nchar(blogs))
## [1] 40833
summary(blogs)
## Length Class Mode
## 899288 character character
News Data
length(news)
## [1] 1010242
wordcount(news,sep=" ",count.function=sum)
## [1] 34372530
max(nchar(news))
## [1] 11384
summary(news)
## Length Class Mode
## 1010242 character character
Because the full datasets are very large, I took a random sample of 5,000 lines from each source and combined them into a single sample for the rest of the analysis.
set.seed(77)
sample_blog<-sample(blogs,5000,replace=FALSE)
sample_twitter<-sample(twitter,5000,replace=FALSE)
sample_news<-sample(news,5000,replace=FALSE)
sample<-c(sample_twitter,sample_blog,sample_news)
I then ran the same exploratory analysis on the combined sample.
length(sample)
## [1] 15000
wordcount(sample,sep=" ",count.function=sum)
## [1] 442982
max(nchar(sample))
## [1] 2807
summary(sample)
## Length Class Mode
## 15000 character character
Next, I built a corpus from the sample and cleaned it by removing punctuation, numbers, English stop words, profanity, and extra whitespace.
corpus <- VCorpus(VectorSource(sample))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus<-tm_map(corpus,removeWords,c("2g1c", "2 girls 1 cup","acrotomophilia", "alabama hot pocket", "alaskan pipeline","anal", "anilingus","anus", "apeshit", "arsehole","ass", "asshole","assmunch", "auto erotic", "autoerotic","babeland", "baby batter","baby juice", "ball gag", "ball gravy","ball kicking", "ball licking","ball sack", "ball sucking", "bangbros","bareback", "barely legal","barenaked", "bastard", "bastardo","bastinado", "bbw","bdsm", "beaner", "beaners","beaver cleaver", "beaver lips","bestiality", "big black", "big breasts","big knockers", "big tits","bimbos", "birdlock", "bitch","bitches", "black cock","blonde action", "blonde on blonde action", "blowjob","blow job", "blow your load","blue waffle", "blumpkin", "bollocks","bondage", "boner","boob", "boobs", "booty call","brown showers", "brunette action","bukkake", "bulldyke", "bullet vibe","bullshit", "bung hole","bunghole", "busty", "butt","buttcheeks", "butthole","camel toe", "camgirl", "camslut","camwhore", "carpet muncher","carpetmuncher", "chocolate rosebuds", "circlejerk","cleveland steamer", "clit","clitoris", "clover clamps", "clusterfuck","cock", "cocks","coprolagnia", "coprophilia", "cornhole","coon", "coons","creampie", "cum", "cumming","cunnilingus", "cunt","darkie", "date rape", "daterape","deep throat", "deepthroat","dendrophilia", "dick", "dildo","dingleberry", "dingleberries","dirty pillows", "dirty sanchez", "doggie style","doggiestyle", "doggy style","doggystyle", "dog style", "dolcett","domination", "dominatrix","dommes", "donkey punch", "double dong","double penetration", "dp action","dry hump", "dvda", "eat my ass","ecchi", "ejaculation","erotic", "erotism", "escort","eunuch", "faggot","fecal", "felch", "fellatio","feltch", "female squirting","femdom", "figging", "fingerbang","fingering", "fisting","foot fetish", "footjob", "frotting","fuck", "fuck buttons","fuckin", "fucking", "fucktards","fudge packer", "fudgepacker","futanari", "gang bang", "gay sex","genitals", "giant cock","girl on", "girl on top", "girls gone wild","goatcx", "goatse","god damn", "gokkun", "golden shower","goodpoop", "goo girl","goregasm", "grope", "group sex","g-spot", "guro","hand job", "handjob", "hard core","hardcore", "hentai","homoerotic", "honkey", "hooker","hot carl", "hot chick","how to kill", "how to murder", "huge fat","humping", "incest","intercourse", "jack off", "jail bait","jailbait", "jelly donut","jerk off", "jigaboo", "jiggaboo","jiggerboo", "jizz","juggs", "kike", "kinbaku","kinkster", "kinky","knobbing", "leather restraint", "leather straight jacket","lemon party", "lolita","lovemaking", "make me come", "male squirting","masturbate", "menage a trois","milf", "missionary position", "motherfucker","mound of venus", "mr hands","muff diver", "muffdiving", "nambla","nawashi", "negro","neonazi", "nigga", "nigger","nig nog", "nimphomania","nipple", "nipples", "nsfw images","nude", "nudity","nympho", "nymphomania", "octopussy","omorashi", "one cup two girls","one guy one jar", "orgasm", "orgy","paedophile", "paki","panties", "panty", "pedobear","pedophile", "pegging","penis", "phone sex", "piece of shit","pissing", "piss pig","pisspig", "playboy", "pleasure chest","pole smoker", "ponyplay","poof", "poon", "poontang","punany", "poop chute","poopchute", "porn", "porno","pornography", "prince albert piercing","pthc", "pubes", "pussy","queaf", "queef","quim", "raghead", "raging boner","rape", "raping","rapist", "rectum", "reverse cowgirl","rimjob", "rimming","rosy palm", "rosy palm and her 5 sisters", "rusty 
trombone","sadism", "santorum","scat", "schlong", "scissoring","semen", "sex","sexo", "sexy", "shaved beaver","shaved pussy", "shemale","shibari", "shit", "shitblimp","shitty", "shota","shrimping", "skeet", "slanteye","slut", "s&m","smut", "snatch", "snowballing","sodomize", "sodomy","spic", "splooge", "splooge moose","spooge", "spread legs","spunk", "strap on", "strapon","strappado", "strip club","style doggy", "suck", "sucks","suicide girls", "sultry women","swastika", "swinger", "tainted love","taste my", "tea bagging","threesome", "throating", "tied up","tight white", "tit","tits", "titties", "titty","tongue in a", "topless","tosser", "towelhead", "tranny","tribadism", "tub girl","tubgirl", "tushy", "twat","twink", "twinkie","two girls one cup", "undressing", "upskirt","urethra play", "urophilia","vagina", "venus mound", "vibrator","violet wand", "vorarephilia","voyeur", "vulva", "wank","wetback", "wet dream","white power", "wrapping men", "wrinkled starfish","xx", "xxx","yaoi", "yellow showers", "yiffy","zoophilia"))
corpus <- tm_map(corpus, stripWhitespace)
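Hard-coding the profanity list this way makes the script hard to read and maintain. A cleaner alternative would be to keep the banned words in a separate file and read them in before filtering; this is only a sketch, and the profanity.txt file name is my own assumption rather than part of the original analysis.
# Sketch: keep the banned-word list in its own file ("profanity.txt",
# one term per line, hypothetical) and pass it to removeWords
profanity <- readLines("profanity.txt", encoding = "UTF-8", skipNul = TRUE)
corpus <- tm_map(corpus, removeWords, profanity)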
One Gram Word Frequencies
OneGramTokenizer <- function(x) unlist(lapply(NLP::ngrams(words(x), 1), paste, collapse = " "))
TermDocMatrix1 <- TermDocumentMatrix(corpus, control = list(tokenize = OneGramTokenizer))
TermDocMatrix1<-removeSparseTerms(TermDocMatrix1,0.9999)
frequency1gram<-rowSums(as.matrix(TermDocMatrix1))
frequency1gram<-sort(frequency1gram,decreasing=TRUE)
head(frequency1gram)
## the said will one just can
## 2421 1439 1434 1256 1134 1061
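For a 15,000-line sample, calling as.matrix() on the term-document matrix is fine, but on a larger corpus the dense conversion can use a lot of memory. One alternative, sketched here under the assumption that the slam package (which tm builds on) is available, is to sum the rows directly on the sparse representation:
# Sketch: row sums on the sparse term-document matrix, avoiding a dense copy
library(slam)
frequency1gram <- sort(slam::row_sums(TermDocMatrix1), decreasing = TRUE)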
##Plotting One Gram Frequency
dataframe1<-data.frame(word=names(frequency1gram),freq=frequency1gram)
plot1gram<-ggplot(subset(dataframe1,freq>500),aes(word,freq))
plot1gram<-plot1gram+geom_bar(stat="identity")+ggtitle("One Gram Frequency")
plot1gram
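The bars in this plot appear in alphabetical order and the x-axis labels can overlap. One way to make it easier to read, shown here as a sketch rather than part of the original code, is to reorder the words by frequency and flip the coordinates; the same idea applies to the two-gram and three-gram plots below.
# Sketch: order the bars by frequency and flip the axes for readability
plot1gram_sorted <- ggplot(subset(dataframe1, freq > 500),
                           aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "word", y = "frequency", title = "One Gram Frequency")
plot1gram_sorted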
Two Gram and Three Gram Frequencies
##Two Gram Frequency
bigramTokenizer<-function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TermDocMatrix2<- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))
TermDocMatrix2<-removeSparseTerms(TermDocMatrix2,0.999)
frequency2gram<-rowSums(as.matrix(TermDocMatrix2))
frequency2gram<-sort(frequency2gram,decreasing=TRUE)
head(frequency2gram)
## i think i love i dont i know i can i will
## 252 166 158 152 145 121
##Plotting Two Gram Frequency
dataframe2<-data.frame(word=names(frequency2gram),freq=frequency2gram)
plot2gram<-ggplot(subset(dataframe2,freq>100),aes(word,freq))
plot2gram<-plot2gram+geom_bar(stat="identity")+ggtitle("Two Gram Frequency")
plot2gram
##Three Gram Frequency
trigramTokenizer<-function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
TermDocMatrix3<- TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer))
TermDocMatrix3<-removeSparseTerms(TermDocMatrix3,0.999)
frequency3gram<-rowSums(as.matrix(TermDocMatrix3))
frequency3gram<-sort(frequency3gram,decreasing=TRUE)
head(frequency3gram)
## i know i i dont know i think i i feel like i dont think
## 29 27 25 23 21
## i wish i
## 18
##Plotting Three Gram Frequency
dataframe3<-data.frame(word=names(frequency3gram),freq=frequency3gram)
plot3gram<-ggplot(subset(dataframe3,freq>10),aes(word,freq))
plot3gram<-plot3gram+geom_bar(stat="identity")+ggtitle("Three Gram Frequency")
plot3gram
Next, I plan to do more analysis on the n-grams and start building a model to predict the next word. I will then evaluate how effective the model is and improve it from there.
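As a rough illustration of that direction, the frequency tables above can already support a very simple next-word lookup. Below is a minimal sketch of a backoff-style predictor, not the final model; the predict_next_word helper is my own name, and it assumes the frequency1gram, frequency2gram, and frequency3gram objects built earlier are still in memory.
# Minimal sketch of a frequency-based next-word lookup (hypothetical helper,
# not the final model). It searches the trigram table for the last two words
# of the input, then backs off to the bigram table, then to the top unigram.
predict_next_word <- function(phrase) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- frequency3gram[startsWith(names(frequency3gram), paste0(prefix, " "))]
    if (length(hits) > 0) return(sub(".* ", "", names(hits)[1]))  # last word of top trigram
  }
  hits <- frequency2gram[startsWith(names(frequency2gram), paste0(words[n], " "))]
  if (length(hits) > 0) return(sub(".* ", "", names(hits)[1]))    # last word of top bigram
  names(frequency1gram)[1]                                        # fall back to the most common word
}
predict_next_word("i dont")   # likely "know", given the trigram counts above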