Around the world, people are spending an increasing amount of time on their mobile devices for email, social networking, banking and a whole range of other activities. But typing on mobile devices can be a serious pain. SwiftKey, our corporate partner in this capstone, builds a smart keyboard that makes it easier for people to type on their mobile devices. One cornerstone of their smart keyboard is predictive text models. When someone types: “I went to the”
the keyboard presents three options for what the next word might be. For example, the three words might be gym, store, restaurant. In this capstone we will work on understanding and building predictive text models like those used by SwiftKey.
suppressMessages(library(dplyr))
suppressMessages(library(tokenizers))
suppressMessages(library(sentimentr))
suppressMessages(library(tm))
suppressMessages(library(textcat))
suppressMessages(library(data.table))
suppressMessages(library(wordcloud))
suppressMessages(library(ggplot2))
The data is downloaded and kept in the working directory. In this report I worked on the “en_US” data set, which contains three text files: “en_US.blogs.txt”, “en_US.twitter.txt” and “en_US.news.txt”.
Because of the limits of my system configuration, I have read only the first 60,000 lines from each file.
# Read the first 60,000 lines of each en_US source file from the working
# directory; skipNul = TRUE makes readLines() skip embedded NUL bytes.
tweets <- readLines(con = "en_US.twitter.txt", n = 60000, skipNul = TRUE)
blogs  <- readLines(con = "en_US.blogs.txt", n = 60000, skipNul = TRUE)
news   <- readLines(con = "en_US.news.txt", n = 60000, skipNul = TRUE)
#' Clean raw text lines for n-gram modelling.
#'
#' Each line is reduced to lower-case ASCII letters and single spaces, and
#' every word that appears in the profanity / negative-word lexicon is
#' removed. Words are matched as whole words after the same normalisation is
#' applied to the lexicon, so "f**k" in the lexicon matches "f**k" in the text
#' and "bad" does not damage "badge".
#'
#' The previous implementation restarted from the raw line for every lexicon
#' word, so only the last word of the lexicon was ever removed.
#'
#' @param lines Character vector of raw text lines.
#' @param profanity_file Path to the lexicon, one word per line. Lines that
#'   start with ";" and blank lines are ignored. Defaults to "neg.txt".
#' @return A data.table with one character column `lines`, one row per
#'   element of `lines`.
cleaning <- function(lines, profanity_file = "neg.txt") {
  # Byte-wise matching removes every non-ASCII byte, so badly encoded input
  # cannot make the regex engine or tolower() fail.
  letters_only <- function(x, keep_space = TRUE) {
    pattern <- if (keep_space) "[^a-zA-Z ]+" else "[^a-zA-Z]+"
    tolower(gsub(pattern, "", x, useBytes = TRUE))
  }

  lexicon <- readLines(profanity_file, warn = FALSE)
  # Drop the ";"-prefixed header/comment lines of the lexicon file.
  lexicon <- lexicon[!grepl("^;", lexicon, useBytes = TRUE)]
  banned <- unique(letters_only(lexicon, keep_space = FALSE))
  banned <- banned[nzchar(banned)]

  # Split on runs of spaces, drop banned words, re-join with single spaces.
  words <- strsplit(letters_only(lines), " +")
  kept <- vapply(
    words,
    function(w) paste(w[nzchar(w) & !(w %in% banned)], collapse = " "),
    character(1)
  )

  data.table(lines = kept)
}
#' Sample cleaned lines and compute per-line text statistics.
#'
#' Draws a random fraction of the rows, then adds character, line and word
#' counts, the sentimentr sentiment score, a sentiment class and the number of
#' 1-, 2- and 3-grams per line.
#'
#' Rows are returned grouped by sentiment class in the order positive,
#' negative, neutral; rows whose score is NA are dropped. Within a class the
#' sampled order is kept.
#'
#' @param df Single-column table of cleaned text lines (as returned by
#'   `cleaning()`); the column is renamed to `tweets`.
#' @param frac Fraction of rows to sample. Defaults to 0.5.
#' @return A table with columns `tweets`, `noofcharacters`, `nooflines`,
#'   `noofwords`, `sentimentscore`, `overallsentiment` (1 = positive,
#'   2 = negative, 0 = neutral), `noof_1_grams`, `noof_2_grams`,
#'   `noof_3_grams`.
tokenization <- function(df, frac = 0.5) {
  # tokenize_ngrams() can return a single NA for lines shorter than n words;
  # count only real n-grams so such lines report 0 instead of 1.
  count_ngrams <- function(text, n) {
    vapply(
      tokenize_ngrams(text, n = n),
      function(tok) sum(!is.na(tok)),
      integer(1)
    )
  }

  colnames(df) <- c("tweets")
  df <- sample_frac(df, frac)
  df <- mutate(df, noofcharacters = nchar(tweets))
  df <- mutate(df, nooflines = lengths(tokenize_lines(tweets)))
  df <- mutate(df, noofwords = lengths(tokenize_words(tweets)))
  # NOTE(review): assumes sentiment() yields exactly one row per line, which
  # holds when punctuation has already been stripped - confirm for other input.
  df <- mutate(df, sentimentscore = sentiment(tweets)$sentiment)

  # Classify every row at once. Assigning a scalar to an empty subset (the
  # old approach) fails when a sentiment class has no rows.
  score <- df$sentimentscore
  label <- rep(0, length(score))
  label[which(score > 0)] <- 1
  label[which(score < 0)] <- 2
  df <- mutate(df, overallsentiment = label)

  # Positive rows first, then negative, then neutral; order() is stable so
  # the sampled order inside each class is preserved.
  keep <- which(!is.na(score))
  idx <- keep[order(match(label[keep], c(1, 2, 0)))]
  df1 <- df[idx, ]

  df1 <- mutate(df1, noof_1_grams = count_ngrams(tweets, 1))
  df1 <- mutate(df1, noof_2_grams = count_ngrams(tweets, 2))
  df1 <- mutate(df1, noof_3_grams = count_ngrams(tweets, 3))
  df1
}
#' Build a trigram frequency table.
#'
#' Tokenizes `df1$tweets` into 3-grams, counts every distinct trigram and
#' splits it into its three words. Lines with fewer than three words yield no
#' trigram and are ignored, so no `NA` row appears in the table.
#'
#' @param df1 Table or list with a character element `tweets`.
#' @return A data.table sorted by descending frequency with columns `words`
#'   (the trigram), `count`, `first`, `second` and `thrid` (the individual
#'   words of the trigram, as character columns).
ngrams <- function(df1) {
  trigrams <- unlist(tokenize_ngrams(df1$tweets, n = 3), use.names = FALSE)
  # Short lines can come back as NA placeholders; they are not real trigrams.
  trigrams <- as.character(trigrams[!is.na(trigrams)])

  counts <- dplyr::count(data.table(tokens_3 = trigrams), tokens_3, sort = TRUE)

  pieces <- strsplit(counts$tokens_3, " ", fixed = TRUE)
  nth_word <- function(k) vapply(pieces, function(p) p[k], character(1))

  # "thrid" is kept as spelled because it is part of the returned schema.
  data.table(
    words = counts$tokens_3,
    count = counts$n,
    first = nth_word(1),
    second = nth_word(2),
    thrid = nth_word(3)
  )
}
#' Bar chart of n-gram frequencies.
#'
#' Bars are drawn in the row order of `ngrams`, so a table sorted by
#' descending count is plotted from most to least frequent. Rows whose
#' `words` value is NA are left out.
#'
#' @param ngrams Table with a `words` column (x axis labels) and a `count`
#'   column (bar heights).
#' @param total Value appended to the end of the plot title.
#' @return A ggplot object.
ngram_frequency_barchart <- function(ngrams, total) {
  keep <- !is.na(ngrams$words)
  ngrams <- ngrams[keep, ]

  ggplot(ngrams, aes(x = words, y = count)) +
    geom_bar(stat = "identity", fill = "burlywood1") +
    # Use the `words` column for the axis order; the table has no `ngram`
    # column, so the old `ngrams$ngram` was NULL and the order was lost.
    scale_x_discrete(limits = as.character(ngrams$words)) +
    ggtitle(paste("Top", nrow(ngrams), "Word Frequencies in the Corpus", total)) +
    ylab("Frequency") +
    theme(axis.text.x = element_text(size = 12, angle = 90, hjust = 1, vjust = 0.5)) +
    theme(plot.title = element_text(size = 18, face = "bold",
                                    hjust = 0.5, margin = margin(b = 30, unit = "pt"))) +
    theme(axis.title.x = element_blank()) +
    theme(axis.title.y = element_text(size = 14, face = "bold")) +
    theme(panel.background = element_blank(), axis.line = element_line(colour = "black")) +
    theme(panel.border = element_rect(colour = "black", fill = NA, size = 0.5)) +
    theme(strip.background = element_rect(fill = alpha("burlywood3", 0.3), color = "black", size = 0.5)) +
    theme(legend.title = element_blank())
}
The negative words used to remove profanity are taken from the following data set: Data Set
This file contains a list of NEGATIVE opinion words (or sentiment words).
; If you use this list, please cite the following paper: ; ; Minqing Hu and Bing Liu. “Mining and Summarizing Customer Reviews.” ; Proceedings of the ACM SIGKDD International Conference on Knowledge ; Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, ; Washington, USA, ;
; Notes: ; 1. The appearance of an opinion word in a sentence does not necessarily
; mean that the sentence expresses a positive or negative opinion. ; See the paper below: ; ; Bing Liu. “Sentiment Analysis and Subjectivity.” An chapter in ; Handbook of Natural Language Processing, Second Edition, ; (editors: N. Indurkhya and F. J. Damerau), 2010. ; ; 2. You will notice many misspelled words in the list. They are not ; mistakes. They are included as these misspelled words appear ; frequently in social media content
Note: I removed some of the words for these notes.
# Clean the Twitter lines and preview the first five rows.
df_t <- cleaning(lines = tweets)
head(df_t, 5)
## lines
## 1: how are you btw thanks for the rt you gonna be in dc anytime soon love to see you been way way too long
## 2: when you meet someone special youll know your heart will beat more rapidly and youll smile for no reason
## 3: theyve decided its more fun if i dont
## 4: so tired d played lazer tag ran a lot d ughh going to sleep like in minutes
## 5: words from a complete stranger made my birthday even better
# Clean the blog lines and preview the first five rows.
df_b <- cleaning(lines = blogs)
head(df_b, 5)
## lines
## 1: in the years thereafter most of the oil fields and platforms were named after pagan gods
## 2: we love you mr brown
## 3: chad has been awesome with the kids and holding down the fort while i work later than usual the kids have been busy together playing skylander on the xbox together after kyan cashed in his from his piggy bank he wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it he never taps into that thing either that is how we know he wanted it so bad we made him count all of his money to make sure that he had enough it was very cute to watch his reaction when he realized he did he also does a very good job of letting lola feel like she is playing too by letting her switch out the characters she loves it almost as much as him
## 4: so anyways i am going to share some home decor inspiration that i have been storing in my folder on the puter i have all these amazing images stored away ready to come to life when we get our home
## 5: with graduation season right around the corner nancy has whipped up a fun set to help you out with not only your graduation cards and gifts but any occasion that brings on a change in ones life i stamped the images in memento tuxedo black and cut them out with circle nestabilities i embossed the kraft and red cardstock with tes new stars impressions plate which is double sided and gives you fantastic patterns you can see how to use the impressions plates in this tutorial taylor created just one pass through your die cut machine using the embossing pad kit is all you need to do super easy
# Clean the news lines and preview the first five rows.
df_n <- cleaning(lines = news)
head(df_n, 5)
## lines
## 1: he wasnt home alone apparently
## 2: the st louis plant had to close it would die of old age workers had been making cars there since the onset of mass automotive production in the s
## 3: wsus plans quickly became a hot topic on local online sites though most people applauded plans for the new biomedical center many deplored the potential loss of the building
## 4: the alaimo group of mount holly was up for a contract last fall to evaluate and suggest improvements to trenton water works but campaign finance records released this week show the two employees donated a total of to the political action committee pac partners for progress in early june partners for progress reported it gave more than in both direct and inkind contributions to mayor tony mack in the two weeks leading up to his victory in the mayoral runoff election june
## 5: and when its often difficult to predict a laws impact legislators should think twice before carrying any bill is it absolutely necessary is it an issue serious enough to merit their attention will it definitely not make the situation worse
In this section we count the characters, words and lines of each entry, calculate the sentiment of each line, and count its n-grams (n = 1 to 3).
# Fix the RNG seed so the random sample drawn inside tokenization() is
# reproducible, then compute the per-line statistics for the Twitter data.
set.seed(1234)
df_t <- tokenization(df = df_t)
head(df_t, 5)
## tweets
## 1: breakfast of champions and partyproofing
## 2: onward upward great day be in the film industry in louisiana d
## 3: i mean they are wlw but still wheres the poc love
## 4: oh man the amount of my life i wasted playing that game at the arcade it was the best though
## 5: revival day a faithful believer with a pressing problem matthew dr booth
## noofcharacters nooflines noofwords sentimentscore overallsentiment
## 1: 40 1 5 0.26832816 1
## 2: 62 1 12 0.14433757 1
## 3: 49 1 11 0.50880039 1
## 4: 92 1 20 0.07826238 1
## 5: 72 1 12 0.31754265 1
## noof_1_grams noof_2_grams noof_3_grams
## 1: 5 4 3
## 2: 12 11 10
## 3: 11 10 9
## 4: 20 19 18
## 5: 12 11 10
# Sample the cleaned blog lines and compute their per-line statistics.
df_b <- tokenization(df = df_b)
head(df_b, 5)
## tweets
## 1: non police readers who want to know more about what a duty inspector does or police officers for that matter should read the bottom few paragraphs which outline the role a bit more
## 2: awardwinner and the one who has given the prize have to show the link of arte y pico blog so everyone will know the origin of this award which is here arte y pico
## 3: being like a lot postmenopausal women i didnt know too much about what my body was up to i had had an easy time of menopause my cycle simply stopped no more periods no odd symptoms no hot flashes nothing
## 4: as these and a multitude of other familiar comforts have slotted into their usual places i have happily given in to mishymashy sentimentality oh come all ye ghosties joyful and triumphant
## 5: thinking shes by his side only to finally realize hes not at home that hes in france and shes at
## noofcharacters nooflines noofwords sentimentscore overallsentiment
## 1: 180 1 33 0.008703883 1
## 2: 163 1 34 0.265822807 1
## 3: 203 1 40 0.515451259 1
## 4: 187 1 31 0.871085715 1
## 5: 96 1 20 0.017888544 1
## noof_1_grams noof_2_grams noof_3_grams
## 1: 33 32 31
## 2: 34 33 32
## 3: 40 39 38
## 4: 31 30 29
## 5: 20 19 18
# Sample the cleaned news lines and compute their per-line statistics.
df_n <- tokenization(df = df_n)
head(df_n, 5)
## tweets
## 1: im proud of them because they basically achieved what everyone else is trying to achieve baez said everyone wants to play in college get a good education and basically go to school for free thats what theyre both doing
## 2: st louis jessica p chamberlain valarie c frazier latosha a haney deanna l jones latonya r ming tramyra t nathan rosemary ruffin charity j thompson mary turnerstockard
## 3: raffle tickets for fun prizes and light snacks will be for sale
## 4: we liked most of the reds robert robbie cooks la vieille ferme a basic cotes du rhone was the perfect table wine it was light and charming and for a steal
## 5: romney has rejected many of the positions some of which date to his us senate run against ted kennedy in massachusetts years ago he has said repeatedly in recent years that he supports the right to keep and bear arms and is prolife he has promised to repeal the national health care reform law while defending massachusetts saying it was a stateonly model and newspaper reports suggest romney closed tax loopholes and raised fees while cutting spending to bring more money into state coffers
## noofcharacters nooflines noofwords sentimentscore overallsentiment
## 1: 218 1 39 0.4803845 1
## 2: 166 1 27 0.1539601 1
## 3: 63 1 12 0.3319764 1
## 4: 154 1 31 0.3412501 1
## 5: 491 1 84 0.3578773 1
## noof_1_grams noof_2_grams noof_3_grams
## 1: 39 38 37
## 2: 27 26 25
## 3: 12 11 10
## 4: 31 30 29
## 5: 84 83 82
# Bar charts of the sentiment class of the sampled lines, one per source.
# tokenization() encodes the class as 1 = positive, 2 = negative,
# 0 = neutral; the axis is labelled accordingly so the codes are readable.
g <- ggplot(df_b, aes(overallsentiment))
g + geom_bar() +
  scale_x_continuous(breaks = c(0, 1, 2),
                     labels = c("neutral", "positive", "negative")) +
  ggtitle("sentiment of sentences in blogs")
g <- ggplot(df_n, aes(overallsentiment))
g + geom_bar() +
  scale_x_continuous(breaks = c(0, 1, 2),
                     labels = c("neutral", "positive", "negative")) +
  ggtitle("sentiment of sentences in news")
g <- ggplot(df_t, aes(overallsentiment))
g + geom_bar() +
  scale_x_continuous(breaks = c(0, 1, 2),
                     labels = c("neutral", "positive", "negative")) +
  ggtitle("sentiment of sentences in twitter")
# Trigram frequency table for the Twitter sample; show the top five rows.
grams_t <- ngrams(df1 = df_t)
head(grams_t, 5)
## words count first second thrid
## 1: <NA> 988 NA NA NA
## 2: thanks for the 306 thanks for the
## 3: i love you 121 i love you
## 4: looking forward to 116 looking forward to
## 5: cant wait to 106 cant wait to
# Trigram frequency table for the blog sample; show the top five rows.
grams_b <- ngrams(df1 = df_b)
head(grams_b, 5)
## words count first second thrid
## 1: <NA> 1596 NA NA NA
## 2: one of the 501 one of the
## 3: a lot of 429 a lot of
## 4: be able to 234 be able to
## 5: as well as 227 as well as
# Trigram frequency table for the news sample; show the top five rows.
grams_n <- ngrams(df1 = df_n)
head(grams_n, 5)
## words count first second thrid
## 1: <NA> 851 NA NA NA
## 2: one of the 415 one of the
## 3: a lot of 332 a lot of
## 4: as well as 181 as well as
## 5: in the first 163 in the first
# Keep the 20 most frequent trigrams of each source. head() returns at most
# 20 rows, whereas [1:20, ] pads a shorter table with all-NA rows.
grams_b_1 <- head(grams_b, 20)
grams_n_1 <- head(grams_n, 20)
grams_t_1 <- head(grams_t, 20)
#' Draw a word cloud of n-grams sized by frequency.
#'
#' @param ngrams Table with a `words` column (labels) and a `count` column
#'   (frequencies).
#' @return The value of `wordcloud()`, which is called for its plot.
words_cloud <- function(ngrams) {
  palette <- brewer.pal(8, "Dark2")
  wordcloud(
    words = ngrams$words,
    freq = ngrams$count,
    scale = c(3, 0.4),
    colors = palette
  )
}
Based on my understanding and learning so far, I am planning to use a Markov chain transition model whose parameters are estimated with Katz back-off and Katz probabilities. The model will be built for next-word prediction on sentences, words and phrases, and it will offer multiple predictions.
For the Shiny app we need to work with a sample of the data rather than the entire corpus, so that the server can handle the load.