Loading in the Data
library(NLP)
library(tm)
library(ngram)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(rJava)
library(RWeka)
setwd("C:/Users/Chris/Documents/Data Science Capstone")
con <- file("en_US.twitter.txt", open = "rb"); twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE); close(con)
con <- file("en_US.news.txt", open = "rb"); news <- readLines(con, encoding = "UTF-8", skipNul = TRUE); close(con)
con <- file("en_US.blogs.txt", open = "rb"); blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE); close(con)
For each file, I looked at the number of lines, the total word count, the length of the longest line, and a basic summary.
Twitter Data
length(twitter)
## [1] 2360148
wordcount(twitter,sep=" ",count.function=sum)
## [1] 30373583
max(nchar(twitter))
## [1] 140
summary(twitter)
## Length Class Mode
## 2360148 character character
Blogs Data
length(blogs)
## [1] 899288
wordcount(blogs,sep=" ",count.function=sum)
## [1] 37334131
max(nchar(blogs))
## [1] 40833
summary(blogs)
## Length Class Mode
## 899288 character character
News Data
length(news)
## [1] 1010242
wordcount(news,sep=" ",count.function=sum)
## [1] 34372530
max(nchar(news))
## [1] 11384
summary(news)
## Length Class Mode
## 1010242 character character
Because the full datasets are very large, I took a random sample of 5,000 lines from each source and combined them into a single sample for the rest of the analysis.
set.seed(77)
sample_blog<-sample(blogs,5000,replace=FALSE)
sample_twitter<-sample(twitter,5000,replace=FALSE)
sample_news<-sample(news,5000,replace=FALSE)
sample<-c(sample_twitter,sample_blog,sample_news)
I then ran the same exploratory analysis on the combined sample.
length(sample)
## [1] 15000
wordcount(sample,sep=" ",count.function=sum)
## [1] 442982
max(nchar(sample))
## [1] 2807
summary(sample)
## Length Class Mode
## 15000 character character
Next, I built a corpus from the sample and cleaned it by removing punctuation, numbers, English stop words, profanity, and extra whitespace.
corpus <- VCorpus(VectorSource(sample))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus<-tm_map(corpus,removeWords,c("2g1c", "2 girls 1 cup","acrotomophilia", "alabama hot pocket", "alaskan pipeline","anal", "anilingus","anus", "apeshit", "arsehole","ass", "asshole","assmunch", "auto erotic", "autoerotic","babeland", "baby batter","baby juice", "ball gag", "ball gravy","ball kicking", "ball licking","ball sack", "ball sucking", "bangbros","bareback", "barely legal","barenaked", "bastard", "bastardo","bastinado", "bbw","bdsm", "beaner", "beaners","beaver cleaver", "beaver lips","bestiality", "big black", "big breasts","big knockers", "big tits","bimbos", "birdlock", "bitch","bitches", "black cock","blonde action", "blonde on blonde action", "blowjob","blow job", "blow your load","blue waffle", "blumpkin", "bollocks","bondage", "boner","boob", "boobs", "booty call","brown showers", "brunette action","bukkake", "bulldyke", "bullet vibe","bullshit", "bung hole","bunghole", "busty", "butt","buttcheeks", "butthole","camel toe", "camgirl", "camslut","camwhore", "carpet muncher","carpetmuncher", "chocolate rosebuds", "circlejerk","cleveland steamer", "clit","clitoris", "clover clamps", "clusterfuck","cock", "cocks","coprolagnia", "coprophilia", "cornhole","coon", "coons","creampie", "cum", "cumming","cunnilingus", "cunt","darkie", "date rape", "daterape","deep throat", "deepthroat","dendrophilia", "dick", "dildo","dingleberry", "dingleberries","dirty pillows", "dirty sanchez", "doggie style","doggiestyle", "doggy style","doggystyle", "dog style", "dolcett","domination", "dominatrix","dommes", "donkey punch", "double dong","double penetration", "dp action","dry hump", "dvda", "eat my ass","ecchi", "ejaculation","erotic", "erotism", "escort","eunuch", "faggot","fecal", "felch", "fellatio","feltch", "female squirting","femdom", "figging", "fingerbang","fingering", "fisting","foot fetish", "footjob", "frotting","fuck", "fuck buttons","fuckin", "fucking", "fucktards","fudge packer", "fudgepacker","futanari", "gang bang", "gay sex","genitals", "giant cock","girl on", "girl on top", "girls gone wild","goatcx", "goatse","god damn", "gokkun", "golden shower","goodpoop", "goo girl","goregasm", "grope", "group sex","g-spot", "guro","hand job", "handjob", "hard core","hardcore", "hentai","homoerotic", "honkey", "hooker","hot carl", "hot chick","how to kill", "how to murder", "huge fat","humping", "incest","intercourse", "jack off", "jail bait","jailbait", "jelly donut","jerk off", "jigaboo", "jiggaboo","jiggerboo", "jizz","juggs", "kike", "kinbaku","kinkster", "kinky","knobbing", "leather restraint", "leather straight jacket","lemon party", "lolita","lovemaking", "make me come", "male squirting","masturbate", "menage a trois","milf", "missionary position", "motherfucker","mound of venus", "mr hands","muff diver", "muffdiving", "nambla","nawashi", "negro","neonazi", "nigga", "nigger","nig nog", "nimphomania","nipple", "nipples", "nsfw images","nude", "nudity","nympho", "nymphomania", "octopussy","omorashi", "one cup two girls","one guy one jar", "orgasm", "orgy","paedophile", "paki","panties", "panty", "pedobear","pedophile", "pegging","penis", "phone sex", "piece of shit","pissing", "piss pig","pisspig", "playboy", "pleasure chest","pole smoker", "ponyplay","poof", "poon", "poontang","punany", "poop chute","poopchute", "porn", "porno","pornography", "prince albert piercing","pthc", "pubes", "pussy","queaf", "queef","quim", "raghead", "raging boner","rape", "raping","rapist", "rectum", "reverse cowgirl","rimjob", "rimming","rosy palm", "rosy palm and her 5 sisters", "rusty 
trombone","sadism", "santorum","scat", "schlong", "scissoring","semen", "sex","sexo", "sexy", "shaved beaver","shaved pussy", "shemale","shibari", "shit", "shitblimp","shitty", "shota","shrimping", "skeet", "slanteye","slut", "s&m","smut", "snatch", "snowballing","sodomize", "sodomy","spic", "splooge", "splooge moose","spooge", "spread legs","spunk", "strap on", "strapon","strappado", "strip club","style doggy", "suck", "sucks","suicide girls", "sultry women","swastika", "swinger", "tainted love","taste my", "tea bagging","threesome", "throating", "tied up","tight white", "tit","tits", "titties", "titty","tongue in a", "topless","tosser", "towelhead", "tranny","tribadism", "tub girl","tubgirl", "tushy", "twat","twink", "twinkie","two girls one cup", "undressing", "upskirt","urethra play", "urophilia","vagina", "venus mound", "vibrator","violet wand", "vorarephilia","voyeur", "vulva", "wank","wetback", "wet dream","white power", "wrapping men", "wrinkled starfish","xx", "xxx","yaoi", "yellow showers", "yiffy","zoophilia"))
corpus <- tm_map(corpus, stripWhitespace)
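Hard-coding the profanity list this way makes the script hard to read and maintain. A cleaner alternative would be to keep the banned words in a separate file and read them in before filtering; this is only a sketch, and the profanity.txt file name is my own assumption rather than part of the original analysis.
# Sketch: keep the banned-word list in its own file ("profanity.txt",
# one term per line, hypothetical) and pass it to removeWords
profanity <- readLines("profanity.txt", encoding = "UTF-8", skipNul = TRUE)
corpus <- tm_map(corpus, removeWords, profanity)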
One Gram Word Frequencies
OneGramTokenizer <- function(x) unlist(lapply(NLP::ngrams(words(x), 1), paste, collapse = " "))
TermDocMatrix1 <- TermDocumentMatrix(corpus, control = list(tokenize = OneGramTokenizer))
TermDocMatrix1<-removeSparseTerms(TermDocMatrix1,0.9999)
frequency1gram<-rowSums(as.matrix(TermDocMatrix1))
frequency1gram<-sort(frequency1gram,decreasing=TRUE)
head(frequency1gram)
## the said will one just can
## 2421 1439 1434 1256 1134 1061
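For a 15,000-line sample, calling as.matrix() on the term-document matrix is fine, but on a larger corpus the dense conversion can use a lot of memory. One alternative, sketched here under the assumption that the slam package (which tm builds on) is available, is to sum the rows directly on the sparse representation:
# Sketch: row sums on the sparse term-document matrix, avoiding a dense copy
library(slam)
frequency1gram <- sort(slam::row_sums(TermDocMatrix1), decreasing = TRUE)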
##Plotting One Gram Frequency
dataframe1<-data.frame(word=names(frequency1gram),freq=frequency1gram)
plot1gram<-ggplot(subset(dataframe1,freq>500),aes(word,freq))
plot1gram<-plot1gram+geom_bar(stat="identity")+ggtitle("One Gram Frequency")
plot1gram
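The bars in this plot appear in alphabetical order and the x-axis labels can overlap. One way to make it easier to read, shown here as a sketch rather than part of the original code, is to reorder the words by frequency and flip the coordinates; the same idea applies to the two-gram and three-gram plots below.
# Sketch: order the bars by frequency and flip the axes for readability
plot1gram_sorted <- ggplot(subset(dataframe1, freq > 500),
                           aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "word", y = "frequency", title = "One Gram Frequency")
plot1gram_sorted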
Two Gram and Three Gram Frequencies
##Two Gram Frequency
bigramTokenizer<-function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TermDocMatrix2<- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))
TermDocMatrix2<-removeSparseTerms(TermDocMatrix2,0.999)
frequency2gram<-rowSums(as.matrix(TermDocMatrix2))
frequency2gram<-sort(frequency2gram,decreasing=TRUE)
head(frequency2gram)
## i think i love i dont i know i can i will
## 252 166 158 152 145 121
##Plotting Two Gram Frequency
dataframe2<-data.frame(word=names(frequency2gram),freq=frequency2gram)
plot2gram<-ggplot(subset(dataframe2,freq>100),aes(word,freq))
plot2gram<-plot2gram+geom_bar(stat="identity")+ggtitle("Two Gram Frequency")
plot2gram
##Three Gram Frequency
trigramTokenizer<-function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
TermDocMatrix3<- TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer))
TermDocMatrix3<-removeSparseTerms(TermDocMatrix3,0.999)
frequency3gram<-rowSums(as.matrix(TermDocMatrix3))
frequency3gram<-sort(frequency3gram,decreasing=TRUE)
head(frequency3gram)
## i know i i dont know i think i i feel like i dont think
## 29 27 25 23 21
## i wish i
## 18
##Plotting Three Gram Frequency
dataframe3<-data.frame(word=names(frequency3gram),freq=frequency3gram)
plot3gram<-ggplot(subset(dataframe3,freq>10),aes(word,freq))
plot3gram<-plot3gram+geom_bar(stat="identity")+ggtitle("Three Gram Frequency")
plot3gram
Next, I plan to do more analysis on the n-grams and start building a model to predict the next word. I will then evaluate how effective the model is and improve it from there.
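As a rough illustration of that direction, the frequency tables above can already support a very simple next-word lookup. Below is a minimal sketch of a backoff-style predictor, not the final model; the predict_next_word helper is my own name, and it assumes the frequency1gram, frequency2gram, and frequency3gram objects built earlier are still in memory.
# Minimal sketch of a frequency-based next-word lookup (hypothetical helper,
# not the final model). It searches the trigram table for the last two words
# of the input, then backs off to the bigram table, then to the top unigram.
predict_next_word <- function(phrase) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- frequency3gram[startsWith(names(frequency3gram), paste0(prefix, " "))]
    if (length(hits) > 0) return(sub(".* ", "", names(hits)[1]))  # last word of top trigram
  }
  hits <- frequency2gram[startsWith(names(frequency2gram), paste0(words[n], " "))]
  if (length(hits) > 0) return(sub(".* ", "", names(hits)[1]))    # last word of top bigram
  names(frequency1gram)[1]                                        # fall back to the most common word
}
predict_next_word("i dont")   # likely "know", given the trigram counts above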