Milestone Report of Data Science Capstone

Synopsis

The purpose of this project is to demonstrate the results achieved for the milestone. The goals of this report are to:

Download data and extract data.
Summarize the data.
Create probability model of the data.
Use plot to make exploratory data analysis.
List the plans for final project.

Background

Software Environment

library(dplyr)
library(ggplot2)
library(tm)
# Switch the session to the "C" locale.  The argument name is written out in
# full (`locale`) rather than relying on partial matching of `local`.
Sys.setlocale(locale = "C")

## [1] "C"

sessionInfo()

## R version 3.2.2 (2015-08-14)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 8 x64 (build 9200)
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tm_0.6-2      NLP_0.1-8     ggplot2_1.0.1 dplyr_0.4.3  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.1      knitr_1.11       magrittr_1.5     MASS_7.3-45     
##  [5] munsell_0.4.2    colorspace_1.2-6 R6_2.1.1         stringr_1.0.0   
##  [9] plyr_1.8.3       tools_3.2.2      parallel_3.2.2   grid_3.2.2      
## [13] gtable_0.1.2     DBI_0.3.1        htmltools_0.2.6  yaml_2.1.13     
## [17] assertthat_0.1   digest_0.6.8     reshape2_1.4.1   formatR_1.2.1   
## [21] slam_0.1-32      evaluate_0.8     rmarkdown_0.8.1  stringi_1.0-1   
## [25] scales_0.3.0     proto_0.3-10

Downloading the Data

# Fetch the SwiftKey corpus archive once and unpack it into the working
# directory.  Each step is skipped when its output is already present.
if (!file.exists("SwiftKey.zip")) {
    # The archive is binary: request a binary transfer explicitly so the file
    # is not corrupted by a text-mode download on Windows.
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  "SwiftKey.zip", mode = "wb")
}
# The report reads all three English files, so extract again if any of them
# is missing, not only the blogs file.
corpus_files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
if (!all(file.exists(corpus_files))) {
    # junkpaths = TRUE discards the directory structure stored in the archive,
    # so the text files land directly in the working directory.
    unzip("SwiftKey.zip", junkpaths = TRUE)
}

Exploratory Data Analysis

Data Summary

# Read one corpus file as a character vector of lines.
#
# The file is opened through a binary-mode connection: a text-mode read on
# Windows stops at an embedded 0x1A (Ctrl-Z) byte and silently returns only
# the beginning of the file.  skipNul = TRUE drops embedded NUL bytes instead
# of truncating the affected line at the first NUL.
# NOTE(review): readLines() documents "UTF-8" (not "utf8") as the value that
# marks strings as UTF-8; the original value is kept because the iconv()-based
# filter in probmodel() works on the unmarked bytes -- confirm before changing.
readCorpus <- function(path) {
    con <- file(path, "rb")
    on.exit(close(con))
    readLines(con, encoding = "utf8", skipNul = TRUE)
}

# Split a whole corpus into space-separated "words" for the summary counts.
splitWords <- function(lines) {
    strsplit(paste(lines, collapse = " "), " ")[[1]]
}

ttwit <- readCorpus("en_US.twitter.txt")
tnews <- readCorpus("en_US.news.txt")
tblog <- readCorpus("en_US.blogs.txt")
wtwit <- splitWords(ttwit)
wnews <- splitWords(tnews)
wblog <- splitWords(tblog)

File Name	Counts
en_US.twitter.txt	Line Count = 2360148
	Word Count = 30373545
	Average Words Per Line = 12.8693391
en_US.news.txt	Line Count = 77259
	Word Count = 2643969
	Average Words Per Line = 34.2221489
en_US.blogs.txt	Line Count = 899288
	Word Count = 37334131
	Average Words Per Line = 41.5152109

There is a notable difference among the three media. The average number of words per line is largest for blogs and smallest for Twitter. Surprisingly, blog lines are on average longer than news lines.

Loading Data

We use only a random 10% sample of the lines in each file to build the probability model.

# Keep a random tenth of the lines of each corpus and glue them into a single
# string.  Lines are joined with two copies of the control-character marker
# "\1\2\3" (each surrounded by spaces) so line boundaries stay recognisable.
line_marker <- " \1\2\3 \1\2\3 "
sampleTenth <- function(lines) {
    kept <- sample(lines, length(lines)/10)
    paste(kept, collapse = line_marker)
}
ttwit <- sampleTenth(ttwit)
tnews <- sampleTenth(tnews)
tblog <- sampleTenth(tblog)

Preprocessing Data

There are five steps in processing data:

transform to lower case
remove bad word (see the Appendix for further information)
remove punctuations
remove numbers
strip white spaces

# Remove every space-delimited occurrence of a word listed in "badword.txt".
#
# Args:
#   x: character vector of text.
# Returns:
#   `x` with each listed word that has a space on both sides removed,
#   leaving a single space in its place.
#
# Differences from a plain " w1 | w2 | ... " alternation:
#   * list entries are escaped, so "bi+ch" or "sh!+" match literally instead
#     of being read as regex quantifiers;
#   * the trailing space is checked with a lookahead and not consumed, so the
#     second of two adjacent bad words is removed as well;
#   * matching ignores case, so an entry such as "God" still applies to text
#     that has already been lower-cased;
#   * blank lines in the list are ignored (an empty alternative would match
#     ordinary spaces).
removeBadWord <- function(x) {
    badword <- readLines("badword.txt")
    badword <- unique(badword[nzchar(badword)])
    if (length(badword) == 0) {
        return(x)
    }
    # Put a backslash in front of every regex metacharacter.
    escaped <- gsub("([\\\\.|()^{}+$*?]|\\[|\\])", "\\\\\\1", badword)
    bad <- paste0(" (?:", paste(escaped, collapse = "|"), ")(?= )")
    gsub(bad, "", x, perl = TRUE, ignore.case = TRUE)
}
# Replace each run of punctuation characters in `x` with one space.
# The apostrophe is not part of the character class, so contractions such as
# "can't" are left intact.
# NOTE(review): the `-` sits between `"` and `#`, where it reads as a range
# rather than a literal hyphen -- confirm whether hyphens were meant to be
# stripped.
removePunctuation <- function(x) {
    punct_run <- "[][!\"-#$%&()*+,./:;<=>?@^_`{|}~\\]+"
    gsub(pattern = punct_run, replacement = " ", x = x)
}

# Normalise raw corpus text for n-gram counting.
# Steps, in order: lower-case, drop listed bad words, replace punctuation with
# spaces, drop digits, collapse whitespace, and finally turn every "\1\2\3"
# line marker into the token "##".
preprocess <- function(text) {
    lowered <- tolower(text)
    censored <- removeBadWord(lowered)
    cleaned <- stripWhitespace(removeNumbers(removePunctuation(censored)))
    gsub("\1\2\3", "##", cleaned)
}
# Clean each sampled corpus in place.
ttwit <- preprocess(ttwit)
tnews <- preprocess(tnews)
tblog <- preprocess(tblog)

Creating Probability Model

In this step, we use the iconv function to identify foreign-language text. Any character outside the ASCII range is replaced by "\1", and every word containing it is excluded from the model.

# Build unigram, bigram and trigram log-probability tables from cleaned text.
#
# Args:
#   text: a single string of space-separated tokens in which "##" marks a
#         line boundary (as produced by preprocess()).
# Returns:
#   An unnamed list of three 1-d tables (unigrams, bigrams, trigrams), each
#   sorted by decreasing count and holding log(count / total count).
#   N-grams that touch a "##" marker are excluded from both the entries and
#   the totals, so every table describes a proper probability distribution.
probmodel <- function(text) {
    tokens <- strsplit(text, " ")[[1]]
    # Leading or doubled spaces produce empty strings, which are not words.
    tokens <- tokens[nzchar(tokens)]

    # remove foreign language: iconv() substitutes "\1" for every non-ASCII
    # byte.  A logical mask is used because `tokens[-grep(...)]` would drop
    # ALL tokens whenever grep() finds nothing (-integer(0) selects nothing).
    ascii <- iconv(tokens, "latin1", "ASCII", sub = "\1")
    tokens <- tokens[!grepl("\1", ascii, fixed = TRUE)]
    n <- length(tokens)

    # Count, sort by decreasing frequency and convert to log-probabilities.
    # `label` keeps the dimension name of the table.
    logProb <- function(grams, label) {
        counts <- sort(table(grams, dnn = label), decreasing = TRUE)
        log(counts) - log(sum(counts))
    }

    uni <- tokens[tokens != "##"]

    # Adjacent pairs / triples; guarded so short input yields no n-grams
    # instead of indexing with invalid offsets.
    bi <- if (n >= 2) paste(tokens[-n], tokens[-1]) else character(0)
    bi <- bi[!grepl("##", bi, fixed = TRUE)]

    tri <- if (n >= 3) {
        paste(tokens[seq_len(n - 2)], tokens[2:(n - 1)], tokens[3:n])
    } else {
        character(0)
    }
    tri <- tri[!grepl("##", tri, fixed = TRUE)]

    list(logProb(uni, "t"), logProb(bi, "t2"), logProb(tri, "t3"))
}
# One n-gram model (list of unigram/bigram/trigram tables) per corpus.
ptwit <- probmodel(ttwit)
pnews <- probmodel(tnews)
pblog <- probmodel(tblog)

Data Plots

For twitter, news, and blog text, we plot the probability of N-grams in a logarithmic scale.

# Plot log-probability against frequency rank for the three n-gram tables.
#
# Args:
#   freq: list of three log-probability vectors (unigram, bigram, trigram),
#         each already sorted by decreasing probability.
#   t:    plot title.
# Returns:
#   A ggplot object with one line per n-gram order.
plotFreq <- function(freq, t) {
    # For bigrams and trigrams, entries at the minimum value are left out.
    aboveFloor <- function(v) v[v > min(v)]
    uni <- freq[[1]]
    bi <- aboveFloor(freq[[2]])
    tri <- aboveFloor(freq[[3]])

    sizes <- c(length(uni), length(bi), length(tri))
    curves <- data.frame(x = unlist(lapply(sizes, seq_len)),
                         y = c(uni, bi, tri),
                         type = rep(c("unigram", "bigram", "trigram"), sizes))

    curve_layer <- geom_line(aes(x = x, y = y, group = type, color = type),
                             size = 1)
    ggplot(curves) +
        curve_layer +
        xlab("Ranking of N-grams") +
        ylab("Probability in a logarithmic scale") +
        ggtitle(t)
}
# Draw the rank vs. log-probability curves for each of the three corpora.
plotFreq(ptwit, "Twitter")

plotFreq(pnews, "News")

plotFreq(pblog, "Blog")

Other Findings

I was curious about which phrases are used frequently in one medium but not in another. The following are my findings about the differences between the media.

# Rank, within the Twitter trigram table, of each of the 20 most frequent news
# trigrams.  match() gives NA for a trigram that Twitter never uses, whereas
# sapply(which(...)) would return a list in that case and make sort() fail.
a <- head(names(pnews[[3]]), 20)
a <- setNames(match(a, names(ptwit[[3]])), a)
# Five most "news-specific" trigrams: those absent from Twitter first, then
# those ranked lowest there (sort() drops the NA entries).
head(c(names(a)[is.na(a)], names(sort(a, decreasing = TRUE))), 5)

## [1] "said in a"        "m to p"           "a m to"          
## [4] "to p m"           "according to the"

# Rank, within the blog trigram table, of each of the 20 most frequent news
# trigrams.  match() gives NA for a trigram that blogs never use, whereas
# sapply(which(...)) would return a list in that case and make sort() fail.
a <- head(names(pnews[[3]]), 20)
a <- setNames(match(a, names(pblog[[3]])), a)
# Five most "news-specific" trigrams: those absent from blogs first, then
# those ranked lowest there (sort() drops the NA entries).
head(c(names(a)[is.na(a)], names(sort(a, decreasing = TRUE))), 5)

## [1] "m to p"    "to p m"    "a m to"    "at p m"    "said in a"

Quotation is very important in news. Mostly, the quotation is from a third person. That’s why “he said he” and “according to the” are frequent in news but not so frequent in blogs and on Twitter. “at 2 a.m.” or “at 8:30 p.m.” are often used in news, but not so often in everyday speech.

# Rank, within the Twitter trigram table, of each of the 20 most frequent blog
# trigrams.  match() gives NA for a trigram that Twitter never uses, whereas
# sapply(which(...)) would return a list in that case and make sort() fail.
a <- head(names(pblog[[3]]), 20)
a <- setNames(match(a, names(ptwit[[3]])), a)
# Five most "blog-specific" trigrams: those absent from Twitter first, then
# those ranked lowest there (sort() drops the NA entries).
head(c(names(a)[is.na(a)], names(sort(a, decreasing = TRUE))), 5)

## [1] "as well as"    "the fact that" "a couple of"   "it is a"      
## [5] "some of the"

# Rank, within the news trigram table, of each of the 20 most frequent blog
# trigrams; NA marks a trigram that never occurs in news.
a <- head(names(pblog[[3]]), 20)
a <- setNames(match(a, names(pnews[[3]])), a)
# Trigrams missing from news altogether.
b <- names(a)[is.na(a)]
# Missing trigrams first, then the lowest-ranked ones, five in total.
# head() is used instead of 1:(5 - length(b)), which counts backwards and
# returns extra entries once five or more trigrams are missing.
head(c(b, names(sort(a, decreasing = TRUE))), 5)

## [1] "one of my"   "i have to"   "i have a"    "the rest of" "i want to"

The text in blogs is more literary. I guess that’s why bloggers choose “as well as” and “the fact that”. The text in blogs is subjective, but the text in news is objective. That’s why “i have been”, “i have to”, “i have a”, “i want to” are not frequent in news.

# Rank, within the blog trigram table, of each of the 20 most frequent Twitter
# trigrams; NA marks a trigram that never occurs in blogs.
a <- head(names(ptwit[[3]]), 20)
a <- setNames(match(a, names(pblog[[3]])), a)
# Trigrams missing from blogs altogether.
b <- names(a)[is.na(a)]
# Missing trigrams first, then the lowest-ranked ones, five in total.
# head() is used instead of 1:(5 - length(b)), which counts backwards and
# returns extra entries once five or more trigrams are missing.
head(c(b, names(sort(a, decreasing = TRUE))), 5)

## [1] "for the follow" "thanks for the" "to see you"     "i love you"    
## [5] "can't wait to"

# Look up each of the 20 most frequent Twitter trigrams in the news table and
# list the ones that do not occur there at all.
top_twit <- names(ptwit[[3]][1:20])
a <- sapply(top_twit, function(gram) which(names(pnews[[3]]) == gram))
# unlist() drops the empty lookups, so only trigrams found in news keep a name.
found_in_news <- names(unlist(a))
setdiff(names(a), found_in_news)

## [1] "for the follow" "to see you"

There are some special phrases that occurred only on Twitter, such as “for the follow” and “for the rt”. The text on Twitter is more casual and conversational, with phrases such as “to see you”, “have a great”, and “thanks for the”.

App Plans

The following are my plans for creating a prediction algorithm:

The current N-gram model is stored as character vectors, which is slow and consumes a lot of space. The most urgent task is to find a more efficient way to store the N-gram model.
Combine the N-gram models using the backoff method. When the user inputs two words, we search for the highest-probability 3-gram starting with those two words. If no 3-gram is found, we search the 2-grams instead. If no 2-gram is found, we fall back to the 1-grams.
User inputs should be combined into N-gram model with large weights.

Appendix

The bad words list is from Bad Words Banned by Google, containing following bad words:

readLines("badword.txt")

##   [1] "4r5e"                    "5h1t"                   
##   [3] "5hit"                    "a55"                    
##   [5] "anal"                    "anus"                   
##   [7] "ar5e"                    "arrse"                  
##   [9] "arse"                    "ass"                    
##  [11] "ass-fucker"              "asses"                  
##  [13] "assfucker"               "assfukka"               
##  [15] "asshole"                 "assholes"               
##  [17] "asswhole"                "b!tch"                  
##  [19] "b00bs"                   "b17ch"                  
##  [21] "b1tch"                   "ballbag"                
##  [23] "balls"                   "ballsack"               
##  [25] "bastard"                 "beastial"               
##  [27] "beastiality"             "bellend"                
##  [29] "bestial"                 "bestiality"             
##  [31] "bi+ch"                   "biatch"                 
##  [33] "bitch"                   "bitcher"                
##  [35] "bitchers"                "bitches"                
##  [37] "bitchin"                 "bitching"               
##  [39] "bloody"                  "blowjob"                
##  [41] "blowjobs"                "boiolas"                
##  [43] "bollock"                 "bollok"                 
##  [45] "boner"                   "boob"                   
##  [47] "boobs"                   "booobs"                 
##  [49] "boooobs"                 "booooobs"               
##  [51] "booooooobs"              "breasts"                
##  [53] "buceta"                  "bugger"                 
##  [55] "bum"                     "bunnyfucker"            
##  [57] "butt"                    "butthole"               
##  [59] "buttmuch"                "buttplug"               
##  [61] "c0ck"                    "c0cksucker"             
##  [63] "carpetmuncher"           "cawk"                   
##  [65] "chink"                   "cipa"                   
##  [67] "cl1t"                    "clit"                   
##  [69] "clitoris"                "clits"                  
##  [71] "cnut"                    "cock"                   
##  [73] "cock-sucker"             "cockface"               
##  [75] "cockhead"                "cockmunch"              
##  [77] "cockmuncher"             "cocks"                  
##  [79] "cocksuck"                "cocksucked"             
##  [81] "cocksucker"              "cocksucking"            
##  [83] "cocksucks"               "cocksuka"               
##  [85] "cocksukka"               "cok"                    
##  [87] "cokmuncher"              "coksucka"               
##  [89] "coon"                    "cox"                    
##  [91] "crap"                    "cum"                    
##  [93] "cummer"                  "cumming"                
##  [95] "cums"                    "cumshot"                
##  [97] "cunilingus"              "cunillingus"            
##  [99] "cunnilingus"             "cunt"                   
## [101] "cuntlick"                "cuntlicker"             
## [103] "cuntlicking"             "cunts"                  
## [105] "cyalis"                  "cyberfuc"               
## [107] "cyberfuck"               "cyberfucked"            
## [109] "cyberfucker"             "cyberfuckers"           
## [111] "cyberfucking"            "d1ck"                   
## [113] "damn"                    "dick"                   
## [115] "dickhead"                "dildo"                  
## [117] "dildos"                  "dink"                   
## [119] "dinks"                   "dirsa"                  
## [121] "dlck"                    "dog-fucker"             
## [123] "doggin"                  "dogging"                
## [125] "donkeyribber"            "doosh"                  
## [127] "duche"                   "dyke"                   
## [129] "ejaculate"               "ejaculated"             
## [131] "ejaculates"              "ejaculating"            
## [133] "ejaculatings"            "ejaculation"            
## [135] "ejakulate"               "f4nny"                  
## [137] "fag"                     "fagging"                
## [139] "faggitt"                 "faggot"                 
## [141] "faggs"                   "fagot"                  
## [143] "fagots"                  "fags"                   
## [145] "fanny"                   "fannyflaps"             
## [147] "fannyfucker"             "fanyy"                  
## [149] "fatass"                  "fcuk"                   
## [151] "fcuker"                  "fcuking"                
## [153] "feck"                    "fecker"                 
## [155] "felching"                "fellate"                
## [157] "fellatio"                "fingerfuck"             
## [159] "fingerfucked"            "fingerfucker"           
## [161] "fingerfuckers"           "fingerfucking"          
## [163] "fingerfucks"             "fistfuck"               
## [165] "fistfucked"              "fistfucker"             
## [167] "fistfuckers"             "fistfucking"            
## [169] "fistfuckings"            "fistfucks"              
## [171] "flange"                  "fook"                   
## [173] "fooker"                  "fuck"                   
## [175] "fucka"                   "fucked"                 
## [177] "fucker"                  "fuckers"                
## [179] "fuckhead"                "fuckheads"              
## [181] "fuckin"                  "fucking"                
## [183] "fuckings"                "fuckingshitmotherfucker"
## [185] "fuckme"                  "fucks"                  
## [187] "fuckwhit"                "fuckwit"                
## [189] "fudgepacker"             "fudgepacker"            
## [191] "fuk"                     "fuker"                  
## [193] "fukker"                  "fukkin"                 
## [195] "fuks"                    "fukwhit"                
## [197] "fukwit"                  "fux"                    
## [199] "fux0r"                   "gangbang"               
## [201] "gangbanged"              "gangbangs"              
## [203] "gaylord"                 "gaysex"                 
## [205] "goatse"                  "God"                    
## [207] "god-dam"                 "god-damned"             
## [209] "goddamn"                 "goddamned"              
## [211] "hardcoresex"             "hell"                   
## [213] "heshe"                   "hoar"                   
## [215] "hoare"                   "hoer"                   
## [217] "homo"                    "hore"                   
## [219] "horniest"                "horny"                  
## [221] "hotsex"                  "jack-off"               
## [223] "jackoff"                 "jap"                    
## [225] "jerk-off"                "jism"                   
## [227] "jiz"                     "jizm"                   
## [229] "jizz"                    "kawk"                   
## [231] "knob"                    "knobead"                
## [233] "knobed"                  "knobend"                
## [235] "knobhead"                "knobjocky"              
## [237] "knobjokey"               "kock"                   
## [239] "kondum"                  "kondums"                
## [241] "kum"                     "kummer"                 
## [243] "kumming"                 "kums"                   
## [245] "kunilingus"              "l3ich"                  
## [247] "l3itch"                  "labia"                  
## [249] "lmfao"                   "lust"                   
## [251] "lusting"                 "m0f0"                   
## [253] "m0fo"                    "m45terbate"             
## [255] "ma5terb8"                "ma5terbate"             
## [257] "masochist"               "masterb8"               
## [259] "masterbat"               "masterbat3"             
## [261] "masterbate"              "masterbation"           
## [263] "masterbations"           "masturbate"             
## [265] "mof0"                    "mofo"                   
## [267] "mothafuck"               "mothafucka"             
## [269] "mothafuckas"             "mothafuckaz"            
## [271] "mothafucked"             "mothafucker"            
## [273] "mothafuckers"            "mothafuckin"            
## [275] "mothafucking"            "mothafuckings"          
## [277] "mothafucks"              "motherfuck"             
## [279] "motherfucked"            "motherfucker"           
## [281] "motherfuckers"           "motherfuckin"           
## [283] "motherfucking"           "motherfuckings"         
## [285] "motherfuckka"            "motherfucks"            
## [287] "muff"                    "mutha"                  
## [289] "muthafecker"             "muthafuckker"           
## [291] "muther"                  "mutherfucker"           
## [293] "n1gga"                   "n1gger"                 
## [295] "nazi"                    "nigg3r"                 
## [297] "nigg4h"                  "nigga"                  
## [299] "niggah"                  "niggas"                 
## [301] "niggaz"                  "nigger"                 
## [303] "niggers"                 "nob"                    
## [305] "nobhead"                 "nobjocky"               
## [307] "nobjokey"                "nobjokey"               
## [309] "numbnuts"                "nutsack"                
## [311] "orgasim"                 "orgasims"               
## [313] "orgasm"                  "orgasms"                
## [315] "p0rn"                    "pawn"                   
## [317] "pecker"                  "penis"                  
## [319] "penisfucker"             "phonesex"               
## [321] "phuck"                   "phuk"                   
## [323] "phuked"                  "phuking"                
## [325] "phukked"                 "phukking"               
## [327] "phuks"                   "phuq"                   
## [329] "pigfucker"               "pimpis"                 
## [331] "piss"                    "pissed"                 
## [333] "pisser"                  "pissers"                
## [335] "pisses"                  "pissflaps"              
## [337] "pissin"                  "pissing"                
## [339] "pissoff"                 "poop"                   
## [341] "porn"                    "porno"                  
## [343] "pornography"             "pornos"                 
## [345] "prick"                   "pricks"                 
## [347] "pron"                    "pube"                   
## [349] "pusse"                   "pussi"                  
## [351] "pussies"                 "pussy"                  
## [353] "pussys"                  "rectum"                 
## [355] "retard"                  "rimjaw"                 
## [357] "rimming"                 "sadist"                 
## [359] "schlong"                 "screwing"               
## [361] "scroat"                  "scrote"                 
## [363] "scrotum"                 "semen"                  
## [365] "sex"                     "sh!+"                   
## [367] "sh!t"                    "sh1t"                   
## [369] "shag"                    "shagger"                
## [371] "shaggin"                 "shagging"               
## [373] "shemale"                 "shi+"                   
## [375] "shit"                    "shit"                   
## [377] "shit"                    "shitdick"               
## [379] "shite"                   "shited"                 
## [381] "shitey"                  "shitfuck"               
## [383] "shitfull"                "shithead"               
## [385] "shiting"                 "shitings"               
## [387] "shits"                   "shitted"                
## [389] "shitter"                 "shitters"               
## [391] "shitting"                "shittings"              
## [393] "shitty"                  "skank"                  
## [395] "slut"                    "sluts"                  
## [397] "smegma"                  "smut"                   
## [399] "snatch"                  "son-of-a-bitch"         
## [401] "spac"                    "spunk"                  
## [403] "t1tt1e5"                 "t1tties"                
## [405] "teets"                   "teez"                   
## [407] "testical"                "testicle"               
## [409] "tit"                     "titfuck"                
## [411] "tits"                    "titt"                   
## [413] "tittie5"                 "tittiefucker"           
## [415] "titties"                 "tittyfuck"              
## [417] "tittywank"               "titwank"                
## [419] "tosser"                  "turd"                   
## [421] "tw4t"                    "twat"                   
## [423] "twathead"                "twatty"                 
## [425] "twunt"                   "twunter"                
## [427] "v14gra"                  "v1gra"                  
## [429] "vagina"                  "viagra"                 
## [431] "vulva"                   "w00se"                  
## [433] "wang"                    "wank"                   
## [435] "wanker"                  "wanky"                  
## [437] "whoar"                   "whore"                  
## [439] "willies"                 "willy"                  
## [441] "xrated"                  "xxx"