The purpose of this project is to demonstrate the results obtained for the milestone. The motivation for this project is to:
library(dplyr)
library(ggplot2)
library(tm)
# Switch every locale category to the portable "C" locale. The argument is
# spelled out in full (`locale`) instead of relying on partial matching of
# the abbreviated `local`.
Sys.setlocale(locale = "C")
## [1] "C"
# Record the R version and the package versions used to build this report.
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 8 x64 (build 9200)
##
## locale:
## [1] C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tm_0.6-2 NLP_0.1-8 ggplot2_1.0.1 dplyr_0.4.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.1 knitr_1.11 magrittr_1.5 MASS_7.3-45
## [5] munsell_0.4.2 colorspace_1.2-6 R6_2.1.1 stringr_1.0.0
## [9] plyr_1.8.3 tools_3.2.2 parallel_3.2.2 grid_3.2.2
## [13] gtable_0.1.2 DBI_0.3.1 htmltools_0.2.6 yaml_2.1.13
## [17] assertthat_0.1 digest_0.6.8 reshape2_1.4.1 formatR_1.2.1
## [21] slam_0.1-32 evaluate_0.8 rmarkdown_0.8.1 stringi_1.0-1
## [25] scales_0.3.0 proto_0.3-10
# Download the corpus archive once. mode = "wb" forces a binary transfer so
# the zip file is not corrupted by a text-mode download on Windows.
if (!file.exists("SwiftKey.zip")) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
    "SwiftKey.zip",
    mode = "wb"
  )
}
# Extract the archive (flattening its directory structure) only if the
# English blog file is not already present in the working directory.
if (!file.exists("en_US.blogs.txt")) {
  unzip("SwiftKey.zip", junkpaths = TRUE)
}

# Read all lines of a corpus file.
#
# The file is opened through a binary-mode connection: a text-mode read on
# Windows stops at the first embedded SUB (0x1A) character and silently
# truncates the file. "UTF-8" is the spelling readLines() recognises for
# marking strings ("utf8" is ignored), and skipNul drops embedded NUL bytes.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

ttwit <- read_corpus_lines("en_US.twitter.txt")
tnews <- read_corpus_lines("en_US.news.txt")
tblog <- read_corpus_lines("en_US.blogs.txt")

# Word vectors: all lines joined by a single space, then split on spaces.
wtwit <- strsplit(paste(ttwit, collapse = " "), " ")[[1]]
wnews <- strsplit(paste(tnews, collapse = " "), " ")[[1]]
wblog <- strsplit(paste(tblog, collapse = " "), " ")[[1]]
| File Name | Counts |
|---|---|
| en_US.twitter.txt | Line Count = 2360148 |
| | Word Count = 30373545 |
| | Average Words Per Line = 12.8693391 |
| en_US.news.txt | Line Count = 77259 |
| | Word Count = 2643969 |
| | Average Words Per Line = 34.2221489 |
| en_US.blogs.txt | Line Count = 899288 |
| | Word Count = 37334131 |
| | Average Words Per Line = 41.5152109 |
There is a notable difference among the three media. The average number of words per line is largest for blogs and smallest for Twitter. Surprisingly, blog lines are on average longer than news lines.
We use only a random 10% sample of the data to build the probability model.
# Keep a random tenth of the lines of each corpus and join them into one
# string. Two "\1\2\3" markers are placed between consecutive lines.
ttwit <- paste(
  sample(ttwit, size = length(ttwit) / 10),
  collapse = " \1\2\3 \1\2\3 "
)
tnews <- paste(
  sample(tnews, size = length(tnews) / 10),
  collapse = " \1\2\3 \1\2\3 "
)
tblog <- paste(
  sample(tblog, size = length(tblog) / 10),
  collapse = " \1\2\3 \1\2\3 "
)
There are five steps in processing data:
# Remove banned words from text.
#
# Args:
#   x: character vector to clean.
#   badword_file: path to a text file with one banned word per line
#     (defaults to "badword.txt" in the working directory).
#
# Returns:
#   `x` with every banned word that is surrounded by spaces removed together
#   with its leading space, so "a <word> b" becomes "a b". Matching is
#   case-sensitive.
removeBadWord <- function(x, badword_file = "badword.txt") {
  badword <- readLines(badword_file)
  # A blank line would add an empty alternative that matches at every space.
  badword <- badword[nzchar(badword)]
  if (length(badword) == 0) {
    return(x)
  }
  # \Q...\E quotes each entry, so entries containing characters such as "+"
  # are matched literally instead of being read as regex operators.
  alternatives <- paste0("\\Q", badword, "\\E", collapse = "|")
  # Only look ahead at the trailing space instead of consuming it: the same
  # space can then open the next match, so runs of consecutive banned words
  # are removed completely.
  pattern <- paste0(" (?:", alternatives, ")(?= )")
  gsub(pattern, "", x, perl = TRUE)
}
# Replace every run of punctuation characters in `x` with a single space.
#
# The apostrophe is not part of the character class, so contractions such as
# "can't" stay intact. The hyphen is listed last in the class so that it is a
# literal character; written between two other characters it would define a
# range and hyphens would be left in the text.
#
# NOTE(review): this shares its name with tm::removePunctuation and masks it
# while tm is attached.
removePunctuation <- function(x) {
  gsub("[][!\"#$%&()*+,./:;<=>?@\\^_`{|}~-]+", " ", x)
}
# Normalise raw text before n-gram extraction.
#
# Steps, in order: lower-case the text, drop banned words, replace
# punctuation, remove numbers, collapse whitespace, and finally rewrite the
# "\1\2\3" marker as "##".
# removeNumbers() and stripWhitespace() are not defined in this file
# (presumably they come from the tm package loaded at the top).
preprocess <- function(text) {
  without_bad_words <- removeBadWord(tolower(text))
  without_symbols <- removeNumbers(removePunctuation(without_bad_words))
  squeezed <- stripWhitespace(without_symbols)
  gsub("\1\2\3", "##", squeezed)
}
# Clean each sampled corpus in place.
ttwit <- preprocess(text = ttwit)
tnews <- preprocess(text = tnews)
tblog <- preprocess(text = tblog)
In this step, we use the iconv function to identify foreign-language words. Any character that cannot be represented in ASCII is replaced by "\1", and every word containing it is excluded before the n-grams are counted.
# Build log-probability tables for unigrams, bigrams and trigrams.
#
# Args:
#   text: a single string of space-separated tokens in which "##" marks a
#     boundary that n-grams must not cross.
#
# Returns:
#   An unnamed list of three tables sorted by decreasing frequency:
#   [[1]] unigram, [[2]] bigram and [[3]] trigram log relative frequencies.
#   N-grams containing "##" are excluded. The unigram denominator still
#   counts the "##" tokens, whereas the bigram and trigram denominators
#   count only the n-grams that were kept.
probmodel <- function(text) {
  tokens <- strsplit(text, " ")[[1]]

  # iconv() substitutes "\1" for every byte it cannot express in ASCII, so a
  # token containing "\1" afterwards is treated as foreign. A logical mask
  # is used because negative indexing with an empty grep() result would
  # drop every token when no foreign word is present.
  is_foreign <- grepl("\1", iconv(tokens, "latin1", "ASCII", sub = "\1"))
  tokens <- tokens[!is_foreign]

  uni_freq <- sort(table(tokens, dnn = "t"), decreasing = TRUE)
  uni_freq <- log(uni_freq[names(uni_freq) != "##"]) - log(sum(uni_freq))

  # head()/tail() with negative n stay well-defined for inputs shorter than
  # the n-gram length and simply yield no n-grams.
  bigrams <- paste(head(tokens, -1), tail(tokens, -1))
  bigrams <- bigrams[!grepl("##", bigrams)]
  bi_freq <- sort(table(bigrams, dnn = "t2"), decreasing = TRUE)
  bi_freq <- log(bi_freq) - log(sum(bi_freq))

  trigrams <- paste(
    head(tokens, -2),
    head(tail(tokens, -1), -1),
    tail(tokens, -2)
  )
  trigrams <- trigrams[!grepl("##", trigrams)]
  tri_freq <- sort(table(trigrams, dnn = "t3"), decreasing = TRUE)
  tri_freq <- log(tri_freq) - log(sum(tri_freq))

  list(uni_freq, bi_freq, tri_freq)
}
# N-gram log-probability tables for each medium.
ptwit <- probmodel(text = ttwit)
pnews <- probmodel(text = tnews)
pblog <- probmodel(text = tblog)
For Twitter, news, and blog text, we plot the probability of N-grams on a logarithmic scale.
# Plot log-probability against rank for unigrams, bigrams and trigrams.
#
# Args:
#   freq: list whose first three elements are the unigram, bigram and
#     trigram tables (the structure returned by probmodel).
#   t: plot title.
#
# Returns:
#   A ggplot object with one line per n-gram type.
plotFreq <- function(freq, t) {
  uni <- freq[[1]]
  bi <- freq[[2]]
  tri <- freq[[3]]

  # Keep only the bigrams and trigrams strictly above the smallest value.
  bi <- bi[bi > min(bi)]
  tri <- tri[tri > min(tri)]

  counts <- c(length(uni), length(bi), length(tri))
  curve_data <- data.frame(
    x = c(seq_len(counts[1]), seq_len(counts[2]), seq_len(counts[3])),
    y = c(uni, bi, tri),
    type = rep(c("unigram", "bigram", "trigram"), counts)
  )

  curves <- geom_line(aes(x = x, y = y, group = type, color = type), size = 1)
  ggplot(curve_data) +
    curves +
    xlab("Ranking of N-grams") +
    ylab("Probability in a logarithmic scale") +
    ggtitle(t)
}
# One rank/log-probability plot per medium.
plotFreq(freq = ptwit, t = "Twitter")
plotFreq(freq = pnews, t = "News")
plotFreq(freq = pblog, t = "Blog")
I was curious about which phrases are frequently used in one medium but not so frequently in another. The following are my findings about the differences between the media.
# Rank of each of the top-20 news trigrams within the Twitter trigram table.
# match() always returns an integer vector (NA when a trigram is absent),
# whereas sapply(which(...)) turns into a list as soon as one is missing.
a <- head(names(pnews[[3]]), 20)
a <- setNames(match(a, names(ptwit[[3]])), a)
# Trigrams absent from Twitter come first, then those ranked lowest there.
b <- names(a)[is.na(a)]
head(c(b, names(sort(a, decreasing = TRUE))), 5)
## [1] "said in a" "m to p" "a m to"
## [4] "to p m" "according to the"
# Rank of each of the top-20 news trigrams within the blog trigram table.
# match() always returns an integer vector (NA when a trigram is absent),
# whereas sapply(which(...)) turns into a list as soon as one is missing.
a <- head(names(pnews[[3]]), 20)
a <- setNames(match(a, names(pblog[[3]])), a)
# Trigrams absent from blogs come first, then those ranked lowest there.
b <- names(a)[is.na(a)]
head(c(b, names(sort(a, decreasing = TRUE))), 5)
## [1] "m to p" "to p m" "a m to" "at p m" "said in a"
Quotation is very important in news. Mostly, the quotation is from a third person. That’s why “he said he” and “according to the” are frequent in news but not so frequent in blogs and on Twitter. Expressions such as “at 2 a.m.” or “at 8:30 p.m.” are often used in news, but not so often in everyday speech.
# Rank of each of the top-20 blog trigrams within the Twitter trigram table.
# match() always returns an integer vector (NA when a trigram is absent),
# whereas sapply(which(...)) turns into a list as soon as one is missing.
a <- head(names(pblog[[3]]), 20)
a <- setNames(match(a, names(ptwit[[3]])), a)
# Trigrams absent from Twitter come first, then those ranked lowest there.
b <- names(a)[is.na(a)]
head(c(b, names(sort(a, decreasing = TRUE))), 5)
## [1] "as well as" "the fact that" "a couple of" "it is a"
## [5] "some of the"
# Rank of each of the top-20 blog trigrams within the news trigram table
# (NA when the trigram does not occur in news).
a <- head(names(pblog[[3]]), 20)
a <- setNames(match(a, names(pnews[[3]])), a)
# Trigrams absent from news come first, then those ranked lowest there.
# head() caps the result at five entries; an index such as 1:(5 - length(b))
# would count downwards once five or more trigrams are missing.
b <- names(a)[is.na(a)]
head(c(b, names(sort(a, decreasing = TRUE))), 5)
## [1] "one of my" "i have to" "i have a" "the rest of" "i want to"
The text in blogs is more literary. I guess that’s why bloggers choose “as well as” and “the fact that”. Blog text is subjective, whereas news text is objective. That’s why “i have been”, “i have to”, “i have a”, and “i want to” are not frequent in news.
# Rank of each of the top-20 Twitter trigrams within the blog trigram table
# (NA when the trigram does not occur in blogs).
a <- head(names(ptwit[[3]]), 20)
a <- setNames(match(a, names(pblog[[3]])), a)
# Trigrams absent from blogs come first, then those ranked lowest there.
# head() caps the result at five entries; an index such as 1:(5 - length(b))
# would count downwards once five or more trigrams are missing.
b <- names(a)[is.na(a)]
head(c(b, names(sort(a, decreasing = TRUE))), 5)
## [1] "for the follow" "thanks for the" "to see you" "i love you"
## [5] "can't wait to"
# Top-20 Twitter trigrams that never occur in the news trigram table.
# match() yields NA for an absent trigram and always returns an integer
# vector, unlike sapply(which(...)), whose result type depends on the data.
a <- head(names(ptwit[[3]]), 20)
a <- setNames(match(a, names(pnews[[3]])), a)
names(a)[is.na(a)]
## [1] "for the follow" "to see you"
There are some special phrases that only occurred on Twitter, such as “for the follow” and “for the rt”. The text on Twitter is more casual and conversational, with phrases such as “to see you”, “have a great”, and “thanks for the”.
The following are my plans for creating a prediction algorithm:
The bad-word list comes from “Bad Words Banned by Google” and contains the following words:
# Print the banned-word list, one entry per line of badword.txt.
readLines(con = "badword.txt")
## [1] "4r5e" "5h1t"
## [3] "5hit" "a55"
## [5] "anal" "anus"
## [7] "ar5e" "arrse"
## [9] "arse" "ass"
## [11] "ass-fucker" "asses"
## [13] "assfucker" "assfukka"
## [15] "asshole" "assholes"
## [17] "asswhole" "b!tch"
## [19] "b00bs" "b17ch"
## [21] "b1tch" "ballbag"
## [23] "balls" "ballsack"
## [25] "bastard" "beastial"
## [27] "beastiality" "bellend"
## [29] "bestial" "bestiality"
## [31] "bi+ch" "biatch"
## [33] "bitch" "bitcher"
## [35] "bitchers" "bitches"
## [37] "bitchin" "bitching"
## [39] "bloody" "blowjob"
## [41] "blowjobs" "boiolas"
## [43] "bollock" "bollok"
## [45] "boner" "boob"
## [47] "boobs" "booobs"
## [49] "boooobs" "booooobs"
## [51] "booooooobs" "breasts"
## [53] "buceta" "bugger"
## [55] "bum" "bunnyfucker"
## [57] "butt" "butthole"
## [59] "buttmuch" "buttplug"
## [61] "c0ck" "c0cksucker"
## [63] "carpetmuncher" "cawk"
## [65] "chink" "cipa"
## [67] "cl1t" "clit"
## [69] "clitoris" "clits"
## [71] "cnut" "cock"
## [73] "cock-sucker" "cockface"
## [75] "cockhead" "cockmunch"
## [77] "cockmuncher" "cocks"
## [79] "cocksuck" "cocksucked"
## [81] "cocksucker" "cocksucking"
## [83] "cocksucks" "cocksuka"
## [85] "cocksukka" "cok"
## [87] "cokmuncher" "coksucka"
## [89] "coon" "cox"
## [91] "crap" "cum"
## [93] "cummer" "cumming"
## [95] "cums" "cumshot"
## [97] "cunilingus" "cunillingus"
## [99] "cunnilingus" "cunt"
## [101] "cuntlick" "cuntlicker"
## [103] "cuntlicking" "cunts"
## [105] "cyalis" "cyberfuc"
## [107] "cyberfuck" "cyberfucked"
## [109] "cyberfucker" "cyberfuckers"
## [111] "cyberfucking" "d1ck"
## [113] "damn" "dick"
## [115] "dickhead" "dildo"
## [117] "dildos" "dink"
## [119] "dinks" "dirsa"
## [121] "dlck" "dog-fucker"
## [123] "doggin" "dogging"
## [125] "donkeyribber" "doosh"
## [127] "duche" "dyke"
## [129] "ejaculate" "ejaculated"
## [131] "ejaculates" "ejaculating"
## [133] "ejaculatings" "ejaculation"
## [135] "ejakulate" "f4nny"
## [137] "fag" "fagging"
## [139] "faggitt" "faggot"
## [141] "faggs" "fagot"
## [143] "fagots" "fags"
## [145] "fanny" "fannyflaps"
## [147] "fannyfucker" "fanyy"
## [149] "fatass" "fcuk"
## [151] "fcuker" "fcuking"
## [153] "feck" "fecker"
## [155] "felching" "fellate"
## [157] "fellatio" "fingerfuck"
## [159] "fingerfucked" "fingerfucker"
## [161] "fingerfuckers" "fingerfucking"
## [163] "fingerfucks" "fistfuck"
## [165] "fistfucked" "fistfucker"
## [167] "fistfuckers" "fistfucking"
## [169] "fistfuckings" "fistfucks"
## [171] "flange" "fook"
## [173] "fooker" "fuck"
## [175] "fucka" "fucked"
## [177] "fucker" "fuckers"
## [179] "fuckhead" "fuckheads"
## [181] "fuckin" "fucking"
## [183] "fuckings" "fuckingshitmotherfucker"
## [185] "fuckme" "fucks"
## [187] "fuckwhit" "fuckwit"
## [189] "fudgepacker" "fudgepacker"
## [191] "fuk" "fuker"
## [193] "fukker" "fukkin"
## [195] "fuks" "fukwhit"
## [197] "fukwit" "fux"
## [199] "fux0r" "gangbang"
## [201] "gangbanged" "gangbangs"
## [203] "gaylord" "gaysex"
## [205] "goatse" "God"
## [207] "god-dam" "god-damned"
## [209] "goddamn" "goddamned"
## [211] "hardcoresex" "hell"
## [213] "heshe" "hoar"
## [215] "hoare" "hoer"
## [217] "homo" "hore"
## [219] "horniest" "horny"
## [221] "hotsex" "jack-off"
## [223] "jackoff" "jap"
## [225] "jerk-off" "jism"
## [227] "jiz" "jizm"
## [229] "jizz" "kawk"
## [231] "knob" "knobead"
## [233] "knobed" "knobend"
## [235] "knobhead" "knobjocky"
## [237] "knobjokey" "kock"
## [239] "kondum" "kondums"
## [241] "kum" "kummer"
## [243] "kumming" "kums"
## [245] "kunilingus" "l3ich"
## [247] "l3itch" "labia"
## [249] "lmfao" "lust"
## [251] "lusting" "m0f0"
## [253] "m0fo" "m45terbate"
## [255] "ma5terb8" "ma5terbate"
## [257] "masochist" "masterb8"
## [259] "masterbat" "masterbat3"
## [261] "masterbate" "masterbation"
## [263] "masterbations" "masturbate"
## [265] "mof0" "mofo"
## [267] "mothafuck" "mothafucka"
## [269] "mothafuckas" "mothafuckaz"
## [271] "mothafucked" "mothafucker"
## [273] "mothafuckers" "mothafuckin"
## [275] "mothafucking" "mothafuckings"
## [277] "mothafucks" "motherfuck"
## [279] "motherfucked" "motherfucker"
## [281] "motherfuckers" "motherfuckin"
## [283] "motherfucking" "motherfuckings"
## [285] "motherfuckka" "motherfucks"
## [287] "muff" "mutha"
## [289] "muthafecker" "muthafuckker"
## [291] "muther" "mutherfucker"
## [293] "n1gga" "n1gger"
## [295] "nazi" "nigg3r"
## [297] "nigg4h" "nigga"
## [299] "niggah" "niggas"
## [301] "niggaz" "nigger"
## [303] "niggers" "nob"
## [305] "nobhead" "nobjocky"
## [307] "nobjokey" "nobjokey"
## [309] "numbnuts" "nutsack"
## [311] "orgasim" "orgasims"
## [313] "orgasm" "orgasms"
## [315] "p0rn" "pawn"
## [317] "pecker" "penis"
## [319] "penisfucker" "phonesex"
## [321] "phuck" "phuk"
## [323] "phuked" "phuking"
## [325] "phukked" "phukking"
## [327] "phuks" "phuq"
## [329] "pigfucker" "pimpis"
## [331] "piss" "pissed"
## [333] "pisser" "pissers"
## [335] "pisses" "pissflaps"
## [337] "pissin" "pissing"
## [339] "pissoff" "poop"
## [341] "porn" "porno"
## [343] "pornography" "pornos"
## [345] "prick" "pricks"
## [347] "pron" "pube"
## [349] "pusse" "pussi"
## [351] "pussies" "pussy"
## [353] "pussys" "rectum"
## [355] "retard" "rimjaw"
## [357] "rimming" "sadist"
## [359] "schlong" "screwing"
## [361] "scroat" "scrote"
## [363] "scrotum" "semen"
## [365] "sex" "sh!+"
## [367] "sh!t" "sh1t"
## [369] "shag" "shagger"
## [371] "shaggin" "shagging"
## [373] "shemale" "shi+"
## [375] "shit" "shit"
## [377] "shit" "shitdick"
## [379] "shite" "shited"
## [381] "shitey" "shitfuck"
## [383] "shitfull" "shithead"
## [385] "shiting" "shitings"
## [387] "shits" "shitted"
## [389] "shitter" "shitters"
## [391] "shitting" "shittings"
## [393] "shitty" "skank"
## [395] "slut" "sluts"
## [397] "smegma" "smut"
## [399] "snatch" "son-of-a-bitch"
## [401] "spac" "spunk"
## [403] "t1tt1e5" "t1tties"
## [405] "teets" "teez"
## [407] "testical" "testicle"
## [409] "tit" "titfuck"
## [411] "tits" "titt"
## [413] "tittie5" "tittiefucker"
## [415] "titties" "tittyfuck"
## [417] "tittywank" "titwank"
## [419] "tosser" "turd"
## [421] "tw4t" "twat"
## [423] "twathead" "twatty"
## [425] "twunt" "twunter"
## [427] "v14gra" "v1gra"
## [429] "vagina" "viagra"
## [431] "vulva" "w00se"
## [433] "wang" "wank"
## [435] "wanker" "wanky"
## [437] "whoar" "whore"
## [439] "willies" "willy"
## [441] "xrated" "xxx"