# Read a whole text file as UTF-8 lines through a binary-mode connection.
#
# path: path of the text file to read.
# Returns a character vector with one element per line.
#
# Reading by file name uses a text-mode connection; on Windows a Ctrl-Z (0x1A)
# byte inside the file is then treated as end-of-file and readLines() stops
# early. Opening the connection with "rb" reads the complete file.
# NOTE(review): a news line count of 77259 in earlier runs looks like exactly
# this truncation -- confirm by re-running; previously recorded counts may
# change.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Directory holding the three English source files.
corpus_dir <- "D:/R Coursera/Capstone/final/en_US"
enbl <- read_corpus_lines(file.path(corpus_dir, "en_US.blogs.txt"))
enne <- read_corpus_lines(file.path(corpus_dir, "en_US.news.txt"))
entw <- read_corpus_lines(file.path(corpus_dir, "en_US.twitter.txt"))
# Word count of every line, computed separately for each source.
blogs.words <- stri_count_words(enbl)
news.words <- stri_count_words(enne)
twitter.words <- stri_count_words(entw)

# Summary table with one row per source: number of lines, total number of
# words and mean number of words per line.
per_line_counts <- list(blogs.words, news.words, twitter.words)
o <- data.frame(
  source = c("enbl", "enne", "entw"),
  num.lines = lengths(list(enbl, enne, entw)),
  num.words = unlist(lapply(per_line_counts, sum)),
  mean.num.words = unlist(lapply(per_line_counts, mean))
)
o
## source num.lines num.words mean.num.words
## 1 enbl 899288 37546246 41.75108
## 2 enne 77259 2674536 34.61779
## 3 entw 2360148 30093410 12.75065
# Human-readable labels for the charts, in the same row order as `o`.
o$source1 <- c("blogs", "news", "twitter")
# Draw the three pies side by side. par() returns the previous settings, which
# are restored afterwards so that later plots are not drawn into a 1x3 panel.
old_par <- par(mfrow = c(1, 3))
pie(o$mean.num.words, labels = o$source1, main = "Mean number of words")
pie(o$num.lines, labels = o$source1, main = "Number of lines")
pie(o$num.words, labels = o$source1, main = "Number of words")
par(old_par)
For faster processing we need to sample a small subset of the dataset. We also convert the text to ASCII to remove smileys and other non-ASCII characters.
# Draw a reproducible 0.8% random sample of lines from each source.
set.seed(67558)
samblo <- sample(enbl, size = 0.008 * length(enbl))
samne <- sample(enne, size = 0.008 * length(enne))
samtw <- sample(entw, size = 0.008 * length(entw))
text <- c(samblo, samne, samtw)
# Strip non-ASCII characters (smileys etc.). With the default sub = NA, iconv()
# would replace every line containing such a character by NA instead of
# removing only the offending bytes.
text <- iconv(text, "UTF-8", "ASCII", sub = "")
# Source object over the sampled lines, consumed by the corpus-building step.
corpus <- VectorSource(text)
Apologies for the following word list; it is needed for the profanity filter. Source of the list: “http://www.cs.cmu.edu/~biglou/resources/bad-words.txt”
# Words treated as profanity/offensive terms; the vector is passed to
# removeWords() when the corpus is cleaned.
# NOTE(review): the list contains a few duplicated entries and many ordinary
# words (e.g. "color", "fire", "church") that will be removed as well --
# confirm this is intended.
badwords <- c("abbo","abo","abortion","abuse","addict","addicts","adult", "africa", "african","alla"
,"allah", "alligatorbait","amateur","american","anal","analannie","analsex","angie","angry"
,"anus", "arab","arabs","areola","argie","aroused","arse","arsehole","asian","ass","assassin"
,"assassinate","assassination","assault","assbagger","assblaster","assclown","asscowboy","asses"
,"assfuck","assfucker","asshat","asshole","assholes","asshore","assjockey","asskiss"
,"asskisser","assklown","asslick","asslicker","asslover","assman","assmonkey","assmunch"
,"assmuncher","asspacker","asspirate","asspuppies","assranger","asswhore","asswipe","athletesfoot"
,"attack","australian","babe","babies","backdoor","backdoorman","backseat","badfuck","balllicker"
,"balls","ballsack","banging","baptist","barelylegal","barf","barface","barfface","bast"
,"bastard","bazongas","bazooms", "beaner","beast","beastality","beastial","beastiality"
,"beatoff","beat-off","beatyourmeat","beaver","bestial","bestiality","bi","biatch","bible","bicurious"
,"bigass","bigbastard","bigbutt","bigger","bisexual","bi-sexual","bitch","bitcher","bitches","bitchez","bitchin"
,"bitching","bitchslap","bitchy","biteme","black","blackman","blackout","blacks","blind","blow"
,"blowjob","boang","bogan","bohunk","bollick","bollock","bomb","bombers","bombing","bombs","bomd"
,"bondage","boner","bong","boob","boobies","boobs","booby","boody","boom","boong","boonga","boonie"
,"booty","bootycall","bountybar","bra","brea5t","breast","breastjob","breastlover","breastman"
,"brothel","bugger","buggered","buggery","bullcrap","bulldike","bulldyke","bullshit","bumblefuck"
,"bumfuck","bunga","bunghole","buried","burn","butchbabes","butchdike","butchdyke","butt","buttbang"
,"butt-bang","buttface","buttfuck","butt-fuck","buttfucker","butt-fucker","buttfuckers","butt-fuckers"
,"butthead","buttman","buttmunch","buttmuncher","buttpirate","buttplug","buttstain","byatch","cacker"
,"cameljockey","cameltoe","canadian","cancer","carpetmuncher","carruth","catholic","catholics","cemetery"
,"chav","cherrypopper","chickslick","children's","chin","chinaman","chinamen","chinese","chink","chinky"
,"choad","chode","christ","christian","church","cigarette","cigs","clamdigger","clamdiver","clit","clitoris"
,"clogwog","cocaine","cock","cockblock","cockblocker","cockcowboy","cockfight","cockhead","cockknob","cocklicker"
,"cocklover","cocknob","cockqueen","cockrider","cocksman","cocksmith","cocksmoker","cocksucer","cocksuck"
,"cocksucked","cocksucker","cocksucking","cocktail","cocktease","cocky","cohee","coitus","color","colored"
,"coloured","commie","communist","condom","conservative","conspiracy","coolie","cooly","coon","coondog"
,"copulate","cornhole","corruption","cra5h","crabs","crack","crackpipe","crackwhore","crack-whore"
,"crap","crapola","crapper","crappy","crash","creamy","crime","crimes","criminal","criminals"
,"crotch","crotchjockey","crotchmonkey","crotchrot","cum","cumbubble","cumfest","cumjockey","cumm","cummer"
,"cumming","cumquat","cumqueen","cumshot","cunilingus","cunillingus","cunn","cunnilingus","cunntt","cunt"
,"cunteyed","cuntfuck","cuntfucker","cuntlick","cuntlicker","cuntlicking","cuntsucker","cybersex","cyberslimer"
,"dago","dahmer","dammit","damn","damnation","damnit","darkie","darky","datnigga","dead","deapthroat","death"
,"deepthroat","defecate","dego","demon","deposit","desire","destroy","deth","devil","devilworshipper"
,"dick","dickbrain","dickforbrains","dickhead","dickless","dicklick","dicklicker","dickman","dickwad"
,"dickweed","diddle","die","died","dies","dike","dildo","dingleberry","dink","dipshit","dipstick","dirty"
,"disease","diseases","disturbed","dive","dix","dixiedike","dixiedyke","doggiestyle","doggystyle"
,"dong","doodoo","doo-doo","doom","dope","dragqueen","dragqween","dripdick","drug","drunk","drunken","dumb"
,"dumbass","dumbbitch","dumbfuck","dyefly","dyke","easyslut","eatballs","eatme","eatpussy","ecstacy","ejaculate"
,"ejaculated","ejaculating","ejaculation","enema","enemy","erect","erection","ero","escort","ethiopian","ethnic"
,"european","evl","excrement","execute","executed","execution","executioner","explosion","facefucker","faeces"
,"fag","fagging","faggot","fagot","failed","failure","fairies","fairy","faith","fannyfucker","fart","farted"
,"farting","farty","fastfuck","fat","fatah","fatass","fatfuck","fatfucker","fatso","fckcum","fear","feces"
,"felatio","felch","felcher","felching","fellatio","feltch","feltcher","feltching","fetish","fight","filipina"
,"filipino","fingerfood","fingerfuck","fingerfucked","fingerfucker","fingerfuckers","fingerfucking","fire"
,"firing","fister","fistfuck","fistfucked","fistfucker","fistfucking","fisting","flange","flasher","flatulence"
,"floo","flydie","flydye","fok","fondle","footaction","footfuck","footfucker","footlicker","footstar","fore"
,"foreskin","forni","fornicate","foursome","fourtwenty","fraud","freakfuck","freakyfucker","freefuck","fu","fubar"
,"fuc","fucck","fuck","fucka","fuckable","fuckbag","fuckbuddy","fucked","fuckedup","fucker","fuckers","fuckface"
,"fuckfest","fuckfreak","fuckfriend","fuckhead","fuckher","fuckin","fuckina","fucking","fuckingbitch","fuckinnuts"
,"fuckinright","fuckit","fuckknob","fuckme","fuckmehard","fuckmonkey","fuckoff","fuckpig","fucks","fucktard"
,"fuckwhore","fuckyou","fudgepacker","fugly","fuk","fuks","funeral","funfuck","fungus","fuuck","gangbang","gangbanged"
,"gangbanger","gangsta","gatorbait","gay","gaymuthafuckinwhore","gaysex","geez","geezer","geni","genital","german"
,"getiton","gin","ginzo","gipp","girls","givehead","glazeddonut","gob","god","godammit","goddamit","goddammit"
,"goddamn","goddamned","goddamnes","goddamnit","goddamnmuthafucker","goldenshower","gonorrehea","gonzagas","gook"
,"gotohell","goy","goyim","greaseball","gringo","groe","gross","grostulation","gubba","gummer","gun","gyp","gypo"
,"gypp","gyppie","gyppo","gyppy","hamas","handjob","hapa","harder","hardon","harem","headfuck","headlights","hebe"
,"heeb","hell","henhouse","heroin","herpes","heterosexual","hijack","hijacker","hijacking","hillbillies","hindoo"
,"hiscock","hitler","hitlerism","hitlerist","hiv","ho","hobo","hodgie","hoes","hole","holestuffer","homicide"
,"homo","homobangers","homosexual","honger","honk","honkers","honkey","honky","hook","hooker","hookers","hooters"
,"hore","hork","horn","horney","horniest","horny","horseshit","hosejob","hoser","hostage","hotdamn","hotpussy"
,"hottotrot","hummer","husky","hussy","hustler","hymen","hymie","iblowu","idiot","ikey","illegal","incest"
,"insest","intercourse","interracial","intheass","inthebuff","israel","israeli","israels","italiano","itch","jackass"
,"jackoff","jackshit","jacktheripper","jade","jap","japanese","japcrap","jebus","jeez","jerkoff","jesus","jesuschrist"
,"jew","jewish","jiga","jigaboo","jigg","jigga","jiggabo","jigger","jiggy","jihad","jijjiboo","jimfish","jism","jiz"
,"jizim","jizjuice","jizm","jizz","jizzim","jizzum","joint","juggalo","jugs","junglebunny","kaffer","kaffir","kaffre"
,"kafir","kanake","kid","kigger","kike","kill","killed","killer","killing","kills","kink","kinky","kissass","kkk"
,"knife","knockers","kock","kondum","koon","kotex","krap","krappy","kraut","kum","kumbubble","kumbullbe","kummer"
,"kumming","kumquat","kums","kunilingus","kunnilingus","kunt","ky","kyke","lactate","laid","lapdance","latin","lesbain"
,"lesbayn","lesbian","lesbin","lesbo","lez","lezbe","lezbefriends","lezbo","lezz","lezzo","liberal","libido","licker"
,"lickme","lies","limey","limpdick","limy","lingerie","liquor","livesex","loadedgun","lolita","looser","loser","lotion"
,"lovebone","lovegoo","lovegun","lovejuice","lovemuscle","lovepistol","loverocket","lowlife","lsd","lubejob","lucifer"
,"luckycammeltoe","lugan","lynch","macaca","mad","mafia","magicwand","mams","manhater","manpaste","marijuana","mastabate"
,"mastabater","masterbate","masterblaster","mastrabator","masturbate","masturbating","mattressprincess","meatbeatter"
,"meatrack","meth","mexican","mgger","mggor","mickeyfinn","mideast","midget","milf","minority","mockey","mockie","mocky"
,"mofo","moky","moles","molest","molestation","molester","molestor","moneyshot","mooncricket","mormon","moron","moslem"
,"mosshead","mothafuck","mothafucka","mothafuckaz","mothafucked","mothafucker","mothafuckin","mothafucking","mothafuckings"
,"motherfuck","motherfucked","motherfucker","motherfuckin","motherfucking","motherfuckings","motherlovebone","muff"
,"muffdive","muffdiver","muffindiver","mufflikcer","mulatto","muncher","munt","murder","murderer","muslim","naked","narcotic"
,"nasty","nastybitch","nastyho","nastyslut","nastywhore","nazi","necro","negro","negroes","negroid","negros","nig","niger"
,"nigerian","nigerians","nigg","nigga","niggah","niggaracci","niggard","niggarded","niggarding","niggardliness","niggardlinesss"
,"niggardly","niggards","niggards","niggaz","nigger","niggerhead","niggerhole","niggers","niggers","niggle","niggled"
,"niggles","niggling","nigglings","niggor","niggur","niglet","nignog","nigr","nigra","nigre","nip","nipple","nipplering"
,"nittit","nlgger","nlggor","nofuckingway","nook","nookey","nookie","noonan","nooner","nude","nudger","nuke","nutfucker"
,"nymph","ontherag","oral","orga","orgasim","orgasm","orgies","orgy","osama","paki","palesimian","palestinian","pansies"
,"pansy","panti","panties","payo","pearlnecklace","peck","pecker","peckerwood","pee","peehole","pee-pee","peepshow"
,"peepshpw","pendy","penetration","peni5","penile","penis","penises","penthouse","period","perv","phonesex","phuk"
,"phuked","phuking","phukked","phukking","phungky","phuq","pi55","picaninny","piccaninny","pickaninny","piker","pikey"
,"piky","pimp","pimped","pimper","pimpjuic","pimpjuice","pimpsimp","pindick","piss","pissed","pisser","pisses","pisshead"
,"pissin","pissing","pissoff","pistol","pixie","pixy","playboy","playgirl","pocha","pocho","pocketpool","pohm","polack"
,"pom","pommie","pommy","poo","poon","poontang","poop","pooper","pooperscooper","pooping","poorwhitetrash","popimp","porchmonkey"
,"porn","pornflick","pornking","porno","pornography","pornprincess","pot","poverty","premature","pric","prick","prickhead"
,"primetime","propaganda","pros","prostitute","protestant","pu55i","pu55y","pube","pubic","pubiclice","pud","pudboy","pudd"
,"puddboy","puke","puntang","purinapricness","puss","pussie","pussies","pussy","pussycat","pussyeater","pussyfucker"
,"pussylicker","pussylips","pussylover","pussypounder","pusy","quashie","queef","queer","quickie","quim","ra8s","rabbi"
,"racial","racist","radical","radicals","raghead","randy","rape","raped","raper","rapist","rearend","rearentry","rectum"
,"redlight","redneck","reefer","reestie","refugee","reject","remains","rentafuck","republican","rere","retard","retarded"
,"ribbed","rigger","rimjob","rimming","roach","robber","roundeye","rump","russki","russkie","sadis","sadom","samckdaddy"
,"sandm","sandnigger","satan","scag","scallywag","scat","schlong","screw","screwyou","scrotum","scum","semen","seppo"
,"servant","sex","sexed","sexfarm","sexhound","sexhouse","sexing","sexkitten","sexpot","sexslave","sexting","sextogo"
,"sextoy","sextoys","sexual","sexually","sexwhore","sexy","sexymoma","sexy-slim","shag","shaggin","shagging","shat"
,"shav","shawtypimp","sheeney","shhit","shinola","shit","shitcan","shitdick","shite","shiteater","shited","shitface"
,"shitfaced","shitfit","shitforbrains","shitfuck","shitfucker","shitfull","shithapens","shithappens","shithead","shithouse"
,"shiting","shitlist","shitola","shitoutofluck","shits","shitstain","shitted","shitter","shitting","shitty","shoot","shooting"
,"shortfuck","showtime","sick","sissy","sixsixsix","sixtynine","sixtyniner","skank","skankbitch","skankfuck","skankwhore"
,"skanky","skankybitch","skankywhore","skinflute","skum","skumbag","slant","slanteye","slapper","slaughter","slav","slave"
,"slavedriver","sleezebag","sleezeball","slideitin","slime","slimeball","slimebucket","slopehead","slopey","slopy","slut"
,"sluts","slutt","slutting","slutty","slutwear","slutwhore","smack","smackthemonkey","smut","snatch","snatchpatch","snigger"
,"sniggered","sniggering","sniggers","sniggers","sniper","snot","snowback","snownigger","sob","sodom","sodomise","sodomite"
,"sodomize","sodomy","sonofabitch","sonofbitch","sooty","sos","soviet","spaghettibender","spaghettinigger","spank","spankthemonkey"
,"sperm","spermacide","spermbag","spermhearder","spermherder","spic","spick","spig","spigotty","spik","spit","spitter"
,"splittail","spooge","spreadeagle","spunk","spunky","squaw","stagg","stiffy","strapon","stringer","stripclub","stroke"
,"stroking","stupid","stupidfuck","stupidfucker","suck","suckdick","sucker","suckme","suckmyass","suckmydick","suckmytit"
,"suckoff","suicide","swallow","swallower","swalow","swastika","sweetness","syphilis","taboo","taff","tampon","tang","tantra"
,"tarbaby","tard","teat","terror","terrorist","teste","testicle","testicles","thicklips","thirdeye","thirdleg","threesome"
,"threeway","timbernigger","tinkle","tit","titbitnipply","titfuck","titfucker","titfuckin","titjob","titlicker","titlover"
,"tits","tittie","titties","titty","tnt","toilet","tongethruster","tongue","tonguethrust","tonguetramp","tortur","torture"
,"tosser","towelhead","trailertrash","tramp","trannie","tranny","transexual","transsexual","transvestite","triplex","trisexual"
,"trojan","trots","tuckahoe","tunneloflove","turd","turnon","twat","twink","twinkie","twobitwhore","uck","uk","unfuckable"
,"upskirt","uptheass","upthebutt","urinary","urinate","urine","usama","uterus","vagina","vaginal","vatican","vibr","vibrater"
,"vibrator","vietcong","violence","virgin","virginbreaker","vomit","vulva","wab","wank","wanker","wanking","waysted"
,"weapon","weenie","weewee","welcher","welfare","wetb","wetback","wetspot","whacker","whash","whigger","whiskey","whiskeydick"
,"whiskydick","whit","whitenigger","whites","whitetrash","whitey","whiz","whop","whore","whorefucker","whorehouse","wigger"
,"willie","williewanker","willy","wn","wog","women's","wop","wtf","wuss","wuzzie","xtc","xxx","yankee","yellowman","zigabo"
,"zipperhead")
We create a corpus free of punctuation, profanity, numbers, stop words and unusual signs. Finally, we collapse the extra white space left behind by the deletions.
# Build the corpus straight from the sampled character vector `text`, one
# document per line. Wrapping an object that is already a VectorSource in
# VectorSource() again would turn the source's internal list fields into the
# "documents". NA entries (lines that could not be converted) are dropped.
corpus <- VCorpus(VectorSource(text[!is.na(text)]))
# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# URLs: match up to the next white space only. A greedy "(.*)" would delete
# everything up to the last ".xyz" on the line.
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://[^[:space:]]+")
# Twitter handles: in the default regex engine "\\s" is not a white-space class
# inside brackets, so the POSIX class [:space:] is used instead.
corpus <- tm_map(corpus, toSpace, "@[^[:space:]]+")
corpus <- tm_map(corpus, content_transformer(tolower))
# Stop words and profanity are removed before punctuation so that entries
# containing apostrophes or hyphens still match.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removeWords, badwords)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Collapse the runs of white space left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
Creating a word cloud of the most frequent words.
# Document-term and term-document matrices of the cleaned corpus.
dtm <- DocumentTermMatrix(corpus)
tdm <- TermDocumentMatrix(corpus)
# Total count of every term over all documents, most frequent first. The dense
# conversion is expensive, so it is performed only once.
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
wf <- data.frame(word = names(freq), freq = freq)
# Seed set immediately before plotting so the figure can be reproduced.
set.seed(425)
wordcloud(names(freq), freq, max.words = 35)
library(RWeka)
# Set the global `mc.cores` option to 1.
# NOTE(review): presumably this keeps the RWeka (Java-based) tokenizers from
# being run in forked parallel workers -- confirm.
options(mc.cores=1)
# Turn a term-document matrix into a frequency table.
#
# tdm: matrix-like object with one row per term and one column per document.
# Returns a data frame with columns `word` (the row names of `tdm`) and `freq`
# (the row totals), ordered from the most to the least frequent term.
getFreq <- function(tdm) {
  term_totals <- rowSums(as.matrix(tdm))
  ranked <- sort(term_totals, decreasing = TRUE)
  data.frame(word = names(ranked), freq = ranked)
}
# Tokenizer for tm: splits `x` into n-grams of size exactly 2 (min = max = 2).
bigram <- function(x) {
  two_word_control <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, two_word_control)
}
# Tokenizer for tm: splits `x` into n-grams of size exactly 3 (min = max = 3).
trigram <- function(x) {
  three_word_control <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, three_word_control)
}
# Bar chart of the first (up to) 30 rows of a frequency table.
#
# data:  data frame with columns `word` and `freq`; only its first 30 rows are
#        plotted, so pass it sorted by decreasing frequency.
# label: title of the x axis.
# Returns the ggplot object; bars are ordered by decreasing `freq`.
makePlot <- function(data, label) {
  # head() keeps at most 30 rows, whereas data[1:30, ] pads tables with fewer
  # than 30 rows with all-NA rows.
  top_terms <- head(data, 30)
  ggplot(top_terms, aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = "grey50")
}
# Frequency tables of the 1-, 2- and 3-grams in the data sample. Each
# term-document matrix is pruned with removeSparseTerms() at the same sparsity
# threshold before the terms are counted.
max_sparsity <- 0.9999
unigram_tdm <- TermDocumentMatrix(corpus)
bigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
trigram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
freq1 <- getFreq(removeSparseTerms(unigram_tdm, max_sparsity))
freq2 <- getFreq(removeSparseTerms(bigram_tdm, max_sparsity))
freq3 <- getFreq(removeSparseTerms(trigram_tdm, max_sparsity))
## Top 10 2gram Words
# First ten entries of the bigram table (getFreq() sorts by decreasing
# frequency). The "##" lines that follow appear to be console output recorded
# from an earlier run.
# NOTE(review): that output is in alphabetical order and has only 93 levels,
# which suggests the corpus was not built as intended -- confirm by re-running.
freq2$word[1:10]
## [1] add link america now art tables
## [4] aside children beneath generally blogger treasure
## [7] boat unicycle bones skin book mix
## [10] bridges slides
## 93 Levels: add link already started america now ... workbench art
## Top 10 3gram words
# First ten entries of the trigram table; the "##" lines that follow again
# appear to be recorded console output.
freq3$word[1:10]
## [1] america now call art tables garden
## [3] aside children play beneath generally smooth
## [5] blogger treasure hunt boat unicycle merrygoround
## [7] bones skin case bridges slides mashed
## [9] call climber collection case falls nothing
## 88 Levels: already started things america now call ... workbench art tables