Data Science Capstone: https://www.coursera.org/learn/data-science-project/
Quiz: https://www.coursera.org/learn/data-science-project/exam/QbBvW/quiz-2-natural-language-processing-i
Refer to this report for statistics: http://www.modsimworld.org/papers/2015/Natural_Language_Processing.pdf
library(stringr)
library(tm)
library(ggplot2)
library(ngram)
## constants
# All paths are relative to the working directory the script is run from.

# original texts (raw English corpora)
co_twitter_en <- "../data/capstone/en_US/en_US.twitter.txt"
co_blogs_en <- "../data/capstone/en_US/en_US.blogs.txt"
co_news_en <- "../data/capstone/en_US/en_US.news.txt"

# cleaned texts (character vectors written by tidyText())
co_tidy_twitter_en <- "../data/capstone/tidy_twitter_en.rds"
co_tidy_blogs_en <- "../data/capstone/tidy_blogs_en.rds"
co_tidy_news_en <- "../data/capstone/tidy_news_en.rds"

# n-grams: the full 3-gram table and the table without frequency-1 entries
co_3gram_en <- "../data/capstone/3gram_en.rds"
co_3gram_notail_en <- "../data/capstone/3gram_notail_en.rds"
# Clean a raw text file and save the result as an RDS file.
#
# The text is lower-cased, tokens containing "@", "#", "http://" or "https://"
# are dropped, lines are split into fragments at punctuation and brackets,
# and only words made of the characters a-z, "'" and "-" are kept.
#
# Args:
#   file:     path of the raw text file to read.
#   tidyfile: path of the RDS file that receives the cleaned character vector
#             (one element per non-empty text fragment).
#
# Returns: the value of saveRDS(); the function is called for its side effect
#   of writing `tidyfile`.
tidyText <- function(file, tidyfile) {
  # read in text
  # Binary mode so that a Ctrl-Z (0x1A) byte in the raw text is not treated as
  # end-of-file on Windows; skipNul drops embedded NUL bytes instead of
  # truncating the line. on.exit() closes the connection even if reading fails.
  con <- file(file, open = "rb")
  on.exit(close(con), add = TRUE)
  lines <- readLines(con, skipNul = TRUE)

  lines <- tolower(lines)

  # replace words that contain "@", "#", "http://", "https://"
  # with a space (especially for Twitter text)
  lines <- gsub("([^[:space:]]*)(@|#|http://|https://)([^[:space:]]*)", " ", lines)

  # split at all ".", ",", brackets etc.; every fragment becomes its own element
  lines <- unlist(strsplit(lines, "[.,:;!?(){}<>]|[][]+"))

  # replace runs of non-alphanumeric characters at the beginning/end of a word
  # with a single space
  lines <- gsub("^[^a-z0-9]+|[^a-z0-9]+$", " ", lines) # at the beginning/end of a line
  lines <- gsub("[^a-z0-9]+\\s", " ", lines) # before a space
  lines <- gsub("\\s[^a-z0-9]+", " ", lines) # after a space

  # split each fragment at whitespace, remove the words that contain any
  # character other than a-z, "-" and "'", then paste the remaining words
  # back together separated by single spaces.
  # vapply() guarantees a character vector, even for empty input.
  lines <- vapply(lines, function(line) {
    words <- unlist(strsplit(line, "\\s+"))
    words <- words[!grepl("[^a-z'-]", words, perl = TRUE)]
    paste(words, collapse = " ")
  }, character(1), USE.NAMES = FALSE)

  # remove excess spaces: words are already joined by single spaces, so only
  # a leading/trailing space (left by an empty first token) needs trimming
  lines <- trimws(lines, which = "both")

  # drop blank lines
  lines <- lines[nchar(lines) > 0]

  saveRDS(lines, file = tidyfile)
}
# clean texts: each call reads one raw corpus and writes its cleaned RDS file
# (trailing notes record the elapsed time and, presumably, the output size
# observed when this was originally run)
tidyText(file = co_twitter_en, tidyfile = co_tidy_twitter_en) # 12.52235 mins, 6,658 KB
tidyText(file = co_news_en, tidyfile = co_tidy_news_en) # 45.31975 secs, 70,998 KB
tidyText(file = co_blogs_en, tidyfile = co_tidy_blogs_en) # 9.873513 mins, 87,014 KB

# merge texts: load the three cleaned character vectors and concatenate them
news_lines <- readRDS(co_tidy_news_en) # 340061 lines
blog_lines <- readRDS(co_tidy_blogs_en) # 4532671 lines
twitter_lines <- readRDS(co_tidy_twitter_en) # 5030042 lines
corpus <- c(news_lines, blog_lines, twitter_lines)
rm(news_lines, blog_lines, twitter_lines)

# get 3-grams
# keep only lines with at least two whitespace runs (i.e. three or more
# words), or ngram() would throw errors.
corpus <- corpus[str_count(corpus, "\\s+") > 1] # reduced 9902774 lines to 7607099 lines
trigram_model <- ngram(corpus, n = 3) # 7.619798 mins
rm(corpus)
df <- get.phrasetable(trigram_model) # 3.286831 mins
rm(trigram_model)

# cache the phrase table on disk and reload it from the cache
saveRDS(df, file = co_3gram_en) # 211,607 KB
df <- readRDS(co_3gram_en) # 22662982 objects
The 3-gram dictionary has a long tail.
# Plot the frequencies of the first 5000 rows of the 3-gram table.
# head() is used instead of df[1:5000, ] so that a table with fewer than
# 5000 rows does not get padded with NA rows.
plot(head(df$freq, 5000),
     main = "3-Grams Top 5000 Word Frequency",
     ylab = "Frequency",
     xlab = "Word")
# Show the first and the last ten rows of the table side by side.
rbind(head(df, n = 10), tail(df, n = 10))
## ngrams freq prop
## 1 thanks for the 23773 4.798102e-04
## 2 one of the 21097 4.258006e-04
## 3 a lot of 19349 3.905207e-04
## 4 i want to 14337 2.893635e-04
## 5 to be a 13162 2.656485e-04
## 6 going to be 12722 2.567680e-04
## 7 i have a 11167 2.253835e-04
## 8 i have to 10713 2.162204e-04
## 9 looking forward to 10554 2.130113e-04
## 10 it was a 10284 2.075619e-04
## 22662973 off canceled for 1 2.018299e-08
## 22662974 like immigrant blame 1 2.018299e-08
## 22662975 the six pictures 1 2.018299e-08
## 22662976 neighbors meeting and 1 2.018299e-08
## 22662977 just my honest 1 2.018299e-08
## 22662978 shows on cbs 1 2.018299e-08
## 22662979 patriots laughed at 1 2.018299e-08
## 22662980 yourself a green 1 2.018299e-08
## 22662981 answer to two 1 2.018299e-08
## 22662982 hype from industry-friendly 1 2.018299e-08
Remove 3-grams with frequency = 1 (the long tail).
# Keep only the 3-grams seen more than once, cache the reduced table on disk,
# reload it from the cache and report its dimensions.
df_notail <- df[df$freq > 1, ]
saveRDS(df_notail, file = co_3gram_notail_en) # 32,995 KB
df_notail <- readRDS(file = co_3gram_notail_en)
dim(df_notail)
## [1] 4060396 3
The reduced 3-gram dictionary keeps 17.91% of the distinct 3-grams, covers 62.45% of all 3-gram instances, and takes about 15.6% of the disk space of the full 3-gram dictionary.
# Three ratios of the reduced table to the full table:
#   1. share of distinct 3-grams (rows) kept,
#   2. share of the total 3-gram frequency covered,
#   3. ratio of the sizes noted next to the two saveRDS() calls
#      (32,995 KB vs. 211,607 KB).
c(
  nrow(df_notail) / nrow(df),
  sum(df_notail$freq) / sum(df$freq),
  32995 / 211607
)
## [1] 0.1791642 0.6245442 0.1559258
For each of the sentence fragments below use your natural language processing algorithm to predict the next word in the sentence.
give (wrong)
sleep
eat
die (right)
# Look up the four candidate words after "and i'd" in the full 3-gram table.
# The patterns are anchored at the start only, so each one is a prefix match
# (e.g. "and i'd give" also returns "and i'd given").
do.call(rbind, lapply(
  c("give", "sleep", "eat", "die"),
  function(word) df[grep(paste0("^and i'd ", word), df[, 1]), ]
))
## ngrams freq prop
## 637961 and i'd give 7 1.412809e-07
## 17385622 and i'd given 1 2.018299e-08
## 11506360 and i'd eat 1 2.018299e-08
# Same prefix lookup for the candidates after "and i'd", this time in the
# reduced table (3-grams with frequency > 1 only).
do.call(rbind, lapply(
  c("give", "sleep", "eat", "die"),
  function(word) df_notail[grep(paste0("^and i'd ", word), df_notail[, 1]), ]
))
## ngrams freq prop
## 637961 and i'd give 7 1.412809e-07
horticultural
spiritual (wrong)
marital (right)
financial
# Look up the four candidate words after "about his" in the full 3-gram table
# (start-anchored prefix match on the 3-gram text).
do.call(rbind, lapply(
  c("horticultural", "spiritual", "marital", "financial"),
  function(word) df[grep(paste0("^about his ", word), df[, 1]), ]
))
## ngrams freq prop
## 14865805 about his spiritual 1 2.018299e-08
# Same prefix lookup for the candidates after "about his" in the reduced table.
do.call(rbind, lapply(
  c("horticultural", "spiritual", "marital", "financial"),
  function(word) df_notail[grep(paste0("^about his ", word), df_notail[, 1]), ]
))
## [1] ngrams freq prop
## <0 rows> (or 0-length row.names)
weekend
month
decade
morning
# First ten 3-grams in the full table that start with "monkeys this".
head(df[grepl("^monkeys this", df[[1]]), ], n = 10)
## [1] ngrams freq prop
## <0 rows> (or 0-length row.names)
# First ten 3-grams in the full table that start with "arctic monkeys".
head(df[grepl("^arctic monkeys", df[[1]]), ], n = 10)
## ngrams freq prop
## 2229165 arctic monkeys with 2 4.036598e-08
## 3266611 arctic monkeys and 2 4.036598e-08
## 5066428 arctic monkeys opening 1 2.018299e-08
## 6107215 arctic monkeys last 1 2.018299e-08
## 6456208 arctic monkeys opens 1 2.018299e-08
## 6754268 arctic monkeys tonight 1 2.018299e-08
## 7919258 arctic monkeys my 1 2.018299e-08
## 8915715 arctic monkeys humbug 1 2.018299e-08
## 9984819 arctic monkeys classic 1 2.018299e-08
## 12425907 arctic monkeys idk 1 2.018299e-08
# For each candidate word, take the first 3-gram in the full table that
# contains "this <word>". The patterns are not anchored, so the phrase may
# sit at either end of the 3-gram.
do.call(rbind, lapply(
  c("weekend", "month", "decade", "morning"),
  function(word) head(df[grep(paste0("this ", word), df[, 1]), ], 1)
))
## ngrams freq prop
## 6276 for this weekend 336 6.781485e-06
## 12333 earlier this month 205 4.137513e-06
## 511919 of this decade 9 1.816469e-07
## 3123 this morning and 554 1.118138e-05
# Same unanchored "this <word>" lookup, first match per candidate, in the
# reduced table.
do.call(rbind, lapply(
  c("weekend", "month", "decade", "morning"),
  function(word) head(df_notail[grep(paste0("this ", word), df_notail[, 1]), ], 1)
))
## ngrams freq prop
## 6276 for this weekend 336 6.781485e-06
## 12333 earlier this month 205 4.137513e-06
## 511919 of this decade 9 1.816469e-07
## 3123 this morning and 554 1.118138e-05
hunger
stress
happiness
sleepiness
# First five 3-grams in the full table that start with "reduce your".
head(df[grepl("^reduce your", df[[1]]), ], n = 5)
## ngrams freq prop
## 274618 reduce your risk 15 3.027449e-07
## 1100192 reduce your stress 4 8.073196e-08
## 1376907 reduce your costs 4 8.073196e-08
## 1523574 reduce your debt 3 6.054897e-08
## 1561405 reduce your exposure 3 6.054897e-08
# For each candidate word after "reduce your", take the first matching 3-gram
# in the full table (start-anchored prefix match).
do.call(rbind, lapply(
  c("hunger", "stress", "happiness", "sleepiness"),
  function(word) head(df[grep(paste0("^reduce your ", word), df[, 1]), ], 1)
))
## ngrams freq prop
## 1100192 reduce your stress 4 8.073196e-08
# Same first-match lookup for the candidates after "reduce your" in the
# reduced table.
do.call(rbind, lapply(
  c("hunger", "stress", "happiness", "sleepiness"),
  function(word) head(df_notail[grep(paste0("^reduce your ", word), df_notail[, 1]), ], 1)
))
## ngrams freq prop
## 1100192 reduce your stress 4 8.073196e-08
look (wrong)
picture (right)
walk
minute
# First five 3-grams in the full table that start with "take a ".
head(df[grepl("^take a ", df[[1]]), ], n = 5)
## ngrams freq prop
## 879 take a look 1227 2.476453e-05
## 3893 take a picture 474 9.566738e-06
## 4172 take a nap 450 9.082346e-06
## 5001 take a break 395 7.972281e-06
## 6254 take a moment 337 6.801668e-06
# First five 3-grams in the reduced table that start with "take a ".
head(df_notail[grepl("^take a ", df_notail[[1]]), ], n = 5)
## ngrams freq prop
## 879 take a look 1227 2.476453e-05
## 3893 take a picture 474 9.566738e-06
## 4172 take a nap 450 9.082346e-06
## 5001 take a break 395 7.972281e-06
## 6254 take a moment 337 6.801668e-06
incident
account
matter (right)
case (wrong)
# First five 3-grams in the full table that start with "settle the ".
head(df[grepl("^settle the ", df[[1]]), ], n = 5)
## ngrams freq prop
## 908372 settle the bill 5 1.009150e-07
## 910045 settle the score 5 1.009150e-07
## 943200 settle the case 5 1.009150e-07
## 1282749 settle the matter 4 8.073196e-08
## 1326203 settle the issue 4 8.073196e-08
# First five 3-grams in the reduced table that start with "settle the ".
head(df_notail[grepl("^settle the ", df_notail[[1]]), ], n = 5)
## ngrams freq prop
## 908372 settle the bill 5 1.009150e-07
## 910045 settle the score 5 1.009150e-07
## 943200 settle the case 5 1.009150e-07
## 1282749 settle the matter 4 8.073196e-08
## 1326203 settle the issue 4 8.073196e-08
finger
arm
toe
hand
# For each candidate word after "in each", take the first matching 3-gram in
# the full table. NOTE(review): the third word is matched as a prefix only,
# so a longer word starting with a candidate (e.g. "arms" for "arm") also
# matches and may be the row that is reported.
do.call(rbind, lapply(
  c("finger", "arm", "toe", "hand"),
  function(word) head(df[grep(paste0("^in each ", word), df[, 1]), ], 1)
))
## ngrams freq prop
## 1237294 in each arms 4 8.073196e-08
## 231077 in each hand 18 3.632938e-07
# Same first-match lookup for the candidates after "in each" in the reduced
# table. NOTE(review): the third word is matched as a prefix only, so longer
# words starting with a candidate also match.
do.call(rbind, lapply(
  c("finger", "arm", "toe", "hand"),
  function(word) head(df_notail[grep(paste0("^in each ", word), df_notail[, 1]), ], 1)
))
## ngrams freq prop
## 1237294 in each arms 4 8.073196e-08
## 231077 in each hand 18 3.632938e-07
side
center
top
middle
# For each candidate word after "to the", take the first matching 3-gram in
# the full table (start-anchored prefix match).
do.call(rbind, lapply(
  c("side", "center", "top", "middle"),
  function(word) head(df[grep(paste0("^to the ", word), df[, 1]), ], 1)
))
## ngrams freq prop
## 5463 to the side 370 7.467707e-06
## 19215 to the center 146 2.946717e-06
## 1396 to the top 944 1.905274e-05
## 20898 to the middle 137 2.765070e-06
# Same first-match lookup for the candidates after "to the" in the reduced
# table.
do.call(rbind, lapply(
  c("side", "center", "top", "middle"),
  function(word) head(df_notail[grep(paste0("^to the ", word), df_notail[, 1]), ], 1)
))
## ngrams freq prop
## 5463 to the side 370 7.467707e-06
## 19215 to the center 146 2.946717e-06
## 1396 to the top 944 1.905274e-05
## 20898 to the middle 137 2.765070e-06
inside
daily
weekly
outside
# First ten 3-grams in the full table that start with "from playing".
head(df[grepl("^from playing", df[[1]]), ], n = 10)
## ngrams freq prop
## 288872 from playing the 15 3.027449e-07
## 657602 from playing with 7 1.412809e-07
## 1059558 from playing basketball 5 1.009150e-07
## 1539602 from playing at 3 6.054897e-08
## 1545579 from playing so 3 6.054897e-08
## 1814394 from playing outside 3 6.054897e-08
## 1908553 from playing in 3 6.054897e-08
## 1947543 from playing on 3 6.054897e-08
## 2013344 from playing a 3 6.054897e-08
## 2406696 from playing my 2 4.036598e-08
# First ten 3-grams in the reduced table that start with "from playing".
head(df_notail[grepl("^from playing", df_notail[[1]]), ], n = 10)
## ngrams freq prop
## 288872 from playing the 15 3.027449e-07
## 657602 from playing with 7 1.412809e-07
## 1059558 from playing basketball 5 1.009150e-07
## 1539602 from playing at 3 6.054897e-08
## 1545579 from playing so 3 6.054897e-08
## 1814394 from playing outside 3 6.054897e-08
## 1908553 from playing in 3 6.054897e-08
## 1947543 from playing on 3 6.054897e-08
## 2013344 from playing a 3 6.054897e-08
## 2406696 from playing my 2 4.036598e-08
stories
movies
pictures
novels
# First five 3-grams in the full table that start with "adam sandler's ".
head(df[grepl("^adam sandler's ", df[[1]]), ], n = 5)
## ngrams freq prop
## 10343550 adam sandler's grandma 1 2.018299e-08
## 11814967 adam sandler's thoughts 1 2.018299e-08
## 11854765 adam sandler's jack 1 2.018299e-08
## 15764079 adam sandler's recent 1 2.018299e-08
## 17803430 adam sandler's lunchlady 1 2.018299e-08
# All 3-grams in the full table that contain "adam sandler's" anywhere
# (unanchored match, so the phrase may start or end the 3-gram).
df[grepl("adam sandler's", df[[1]]), ]
## ngrams freq prop
## 5354146 was adam sandler's 1 2.018299e-08
## 10343550 adam sandler's grandma 1 2.018299e-08
## 11463702 watching adam sandler's 1 2.018299e-08
## 11814967 adam sandler's thoughts 1 2.018299e-08
## 11854765 adam sandler's jack 1 2.018299e-08
## 15764079 adam sandler's recent 1 2.018299e-08
## 15770108 at adam sandler's 1 2.018299e-08
## 17803430 adam sandler's lunchlady 1 2.018299e-08
## 21905373 loved adam sandler's 1 2.018299e-08