This week’s Objective:

Getting the data, cleaning the data, understanding the data and performing some exploratory analysis.

Loading the packages

suppressMessages(library(dplyr))
suppressMessages(library(tokenizers))
suppressMessages(library(sentimentr))
suppressMessages(library(tm))
suppressMessages(library(textcat))
suppressMessages(library(data.table))
suppressMessages(library(wordcloud))
suppressMessages(library(ggplot2))

Loading the data

The data is downloaded and kept in the working directory. I worked on the “en_US” data set, which contains three text files: “en_US.blogs.txt”, “en_US.twitter.txt” and “en_US.news.txt”.

Because of my system configuration, I read only the first 60,000 lines from each file.

# Read at most the first 60000 lines of each corpus file from the working
# directory. skipNul = TRUE makes readLines() skip embedded nul characters
# instead of warning about them.
tweets <- readLines("en_US.twitter.txt",n = 60000,skipNul = TRUE)
blogs <- readLines("en_US.blogs.txt",n=60000,skipNul = TRUE)
news <- readLines("en_US.news.txt",n = 60000,skipNul = TRUE)

Functions that help in performing the task

cleaning <- this function removes the profanity words, strips punctuation and numbers, and lower-cases the text
tokenization <- this function counts the characters, lines and words of each line and also finds the sentiment of each line
ngrams <- this function splits the lines into three-grams (sequences of three words) and finds their counts
ngram_frequency_barchart <- this function draws a bar chart of n-gram frequencies for the exploratory analysis

# Clean raw text lines before tokenization.
#
# Args:
#   lines: character vector of raw text, one document per element.
#
# Returns:
#   A data.table with one character column, `lines`, holding the cleaned
#   text: lexicon words removed, every character other than ASCII letters and
#   spaces stripped, whitespace runs collapsed, and the result lower-cased.
#
# The word list is read from "neg.txt" in the working directory; blank
# entries and ';' comment lines in that file are ignored.
#
# Every lexicon entry is removed, matched case-insensitively against whole
# whitespace-separated tokens (ignoring leading/trailing punctuation). The
# previous implementation overwrote its result on each lexicon entry, so only
# the final entry of the list was ever removed.
cleaning <- function(lines) {
  lines <- as.character(lines)

  profanity <- trimws(readLines("neg.txt"))
  keep_entry <- nzchar(profanity) & !startsWith(profanity, ";")
  profanity <- tolower(profanity[keep_entry])

  # Flatten all tokens into one vector so the lexicon lookup is a single
  # vectorised %in% instead of a lines-by-words nested loop.
  tokens <- strsplit(lines, "[[:space:]]+")
  flat <- as.character(unlist(tokens, use.names = FALSE))
  line_id <- rep(seq_along(tokens), lengths(tokens))

  # Compare both the raw token and the token without surrounding punctuation,
  # so "bad," and masked entries that end in punctuation are both caught.
  raw_key <- tolower(flat)
  trimmed_key <- gsub("^[^[:alnum:]]+|[^[:alnum:]]+$", "", raw_key)
  keep <- !(raw_key %in% profanity | trimmed_key %in% profanity)

  # Explicit factor levels keep lines that lost every token (or were empty)
  # as "" so the output has exactly one row per input line.
  by_line <- split(
    flat[keep],
    factor(line_id[keep], levels = seq_along(tokens))
  )
  kept <- vapply(by_line, paste, character(1), collapse = " ",
                 USE.NAMES = FALSE)
  kept[is.na(lines)] <- NA_character_

  cleaned <- gsub(pattern = "[^a-zA-Z ]+", replacement = "", x = kept)
  cleaned <- tolower(stripWhitespace(cleaned))

  data.table(lines = cleaned)
}

# Sample half of the cleaned lines and attach per-line descriptive features.
#
# Args:
#   df: single-column table of cleaned text (as returned by cleaning()).
#
# Returns:
#   The sampled table with the text column renamed to `tweets` plus the
#   columns noofcharacters, nooflines, noofwords, sentimentscore,
#   overallsentiment (1 when the score is > 0, 2 when < 0, 0 when == 0) and
#   noof_1_grams / noof_2_grams / noof_3_grams. Rows are ordered positive,
#   then negative, then zero-score.
tokenization <- function(df) {
  colnames(df) <- "tweets"
  sampled <- sample_frac(df, 0.5)

  # All four features depend only on the text column, so they are added in a
  # single mutate() call.
  sampled <- mutate(
    sampled,
    noofcharacters = nchar(tweets),
    nooflines = lengths(tokenize_lines(tweets)),
    noofwords = lengths(tokenize_words(tweets)),
    sentimentscore = sentiment(tweets)$sentiment
  )

  # Partition by the sign of the sentiment score and tag each partition.
  positive <- sampled[sampled$sentimentscore > 0]
  negative <- sampled[sampled$sentimentscore < 0]
  neutral <- sampled[sampled$sentimentscore == 0]
  positive[["overallsentiment"]] <- 1
  negative[["overallsentiment"]] <- 2
  neutral[["overallsentiment"]] <- 0

  labelled <- rbind(positive, negative, neutral)

  mutate(
    labelled,
    noof_1_grams = lengths(tokenize_ngrams(tweets, n = 1)),
    noof_2_grams = lengths(tokenize_ngrams(tweets, n = 2)),
    noof_3_grams = lengths(tokenize_ngrams(tweets, n = 3))
  )
}

# Count the three-word n-grams found in the `tweets` column of a table.
#
# Args:
#   df1: table with a character column `tweets` (as returned by
#        tokenization()).
#
# Returns:
#   A table sorted by descending count with the columns
#     words  - the trigram as a single space-separated string
#     count  - number of occurrences
#     first, second, thrid - the three words of the trigram (character)
#
# Lines with fewer than three words yield NA from tokenize_ngrams(); those
# are dropped so that NA is not reported as the most frequent "trigram".
ngrams <- function(df1) {
  trigrams <- unlist(tokenize_ngrams(df1$tweets, n = 3), use.names = FALSE)
  trigrams <- as.character(trigrams[!is.na(trigrams)])

  counts <- dplyr::count(data.table(words = trigrams), words, sort = TRUE)
  colnames(counts) <- c("words", "count")

  # Split each distinct trigram once and pull out the k-th word of each.
  parts <- strsplit(counts$words, " ", fixed = TRUE)
  nth_word <- function(k) vapply(parts, `[`, character(1), k)

  cbind(
    counts,
    data.table(
      first = nth_word(1L),
      second = nth_word(2L),
      thrid = nth_word(3L)
    )
  )
}

# Bar chart of n-gram frequencies.
#
# Args:
#   ngrams: table with the columns `words` (n-gram text) and `count`
#           (frequency), e.g. the top rows returned by ngrams().
#   total:  text appended to the plot title.
#
# Returns:
#   A ggplot object.
ngram_frequency_barchart <- function(ngrams, total) {
  ggplot(ngrams, aes(x = words, y = count)) +
    geom_bar(stat = "identity", fill = "burlywood1") +
    # Draw the bars in the order the rows are supplied instead of
    # alphabetically. The column is `words`; the previous `ngrams$ngram`
    # does not exist, so the limits were silently NULL.
    scale_x_discrete(limits = ngrams$words) +
    ggtitle(paste("Top", nrow(ngrams), "Word Frequencies in the Corpus", total)) +
    ylab("Frequency") +
    theme(axis.text.x = element_text(size = 12, angle = 90, hjust = 1, vjust = 0.5)) +
    theme(plot.title = element_text(size = 18, face = "bold",
                                    hjust = 0.5, margin = margin(b = 30, unit = "pt"))) +
    theme(axis.title.x = element_blank()) +
    theme(axis.title.y = element_text(size = 14, face = "bold")) +
    theme(panel.background = element_blank(), axis.line = element_line(colour = "black")) +
    theme(panel.border = element_rect(colour = "black", fill = NA, size = 0.5)) +
    theme(strip.background = element_rect(fill = alpha("burlywood3", 0.3), color = "black", size = 0.5)) +
    theme(legend.title = element_blank())
}

Cleaning the data

The negative words used to remove the profanity are taken from the following data set:

Opinion Lexicon: Negative

This file contains a list of NEGATIVE opinion words (or sentiment words).

; If you use this list, please cite the following paper: ; ; Minqing Hu and Bing Liu. “Mining and Summarizing Customer Reviews.” ; Proceedings of the ACM SIGKDD International Conference on Knowledge ; Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, ; Washington, USA, ;

; Notes: ; 1. The appearance of an opinion word in a sentence does not necessarily
; mean that the sentence expresses a positive or negative opinion. ; See the paper below: ; ; Bing Liu. “Sentiment Analysis and Subjectivity.” An chapter in ; Handbook of Natural Language Processing, Second Edition, ; (editors: N. Indurkhya and F. J. Damerau), 2010. ;````````` ; 2. You will notice many misspelled words in the list. They are not ; mistakes. They are included as these misspelled words appear ; frequently in social media content

Note: I removed some of the words from the list for these notes.

# Clean the tweets with cleaning() and preview the first five rows.
df_t <- cleaning(tweets)
head(df_t,n=5)

##                                                                                                       lines
## 1:  how are you btw thanks for the rt you gonna be in dc anytime soon love to see you been way way too long
## 2: when you meet someone special youll know your heart will beat more rapidly and youll smile for no reason
## 3:                                                                    theyve decided its more fun if i dont
## 4:                             so tired d played lazer tag ran a lot d ughh going to sleep like in minutes 
## 5:                                             words from a complete stranger made my birthday even better

# Clean the blog lines with cleaning() and preview the first five rows.
df_b <- cleaning(blogs)
head(df_b,n=5)

##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   lines
## 1:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             in the years thereafter most of the oil fields and platforms were named after pagan gods
## 2:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 we love you mr brown
## 3: chad has been awesome with the kids and holding down the fort while i work later than usual the kids have been busy together playing skylander on the xbox together after kyan cashed in his from his piggy bank he wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it he never taps into that thing either that is how we know he wanted it so bad we made him count all of his money to make sure that he had enough it was very cute to watch his reaction when he realized he did he also does a very good job of letting lola feel like she is playing too by letting her switch out the characters she loves it almost as much as him
## 4:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 so anyways i am going to share some home decor inspiration that i have been storing in my folder on the puter i have all these amazing images stored away ready to come to life when we get our home
## 5:                                                                                   with graduation season right around the corner nancy has whipped up a fun set to help you out with not only your graduation cards and gifts but any occasion that brings on a change in ones life i stamped the images in memento tuxedo black and cut them out with circle nestabilities i embossed the kraft and red cardstock with tes new stars impressions plate which is double sided and gives you fantastic patterns you can see how to use the impressions plates in this tutorial taylor created just one pass through your die cut machine using the embossing pad kit is all you need to do super easy

# Clean the news lines with cleaning() and preview the first five rows.
df_n <- cleaning(news)
head(df_n,n=5)

##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          lines
## 1:                                                                                                                                                                                                                                                                                                                                                                                                                                                              he wasnt home alone apparently
## 2:                                                                                                                                                                                                                                                                                                                                           the st louis plant had to close it would die of old age workers had been making cars there since the onset of mass automotive production in the s
## 3:                                                                                                                                                                                                                                                                                                               wsus plans quickly became a hot topic on local online sites though most people applauded plans for the new biomedical center many deplored the potential loss of the building
## 4: the alaimo group of mount holly was up for a contract last fall to evaluate and suggest improvements to trenton water works but campaign finance records released this week show the two employees donated a total of to the political action committee pac partners for progress in early june partners for progress reported it gave more than in both direct and inkind contributions to mayor tony mack in the two weeks leading up to his victory in the mayoral runoff election june 
## 5:                                                                                                                                                                                                                                             and when its often difficult to predict a laws impact legislators should think twice before carrying any bill is it absolutely necessary is it an issue serious enough to merit their attention will it definitely not make the situation worse

tokenization

In this step we find the counts of words and lines, calculate the sentiment of each line, and also count the n-grams (1 to 3).

# Fix the RNG seed so the random sample drawn inside tokenization() is
# reproducible. The seed is set only once, so the blog and news samples
# below depend on the calls being run in this order.
set.seed(1234)
# Replace the cleaned tweets with the sampled, feature-annotated table.
df_t <- tokenization(df_t)
head(df_t,n=5)

##                                                                                          tweets
## 1:                                                     breakfast of champions and partyproofing
## 2:                               onward upward great day be in the film industry in louisiana d
## 3:                                            i mean they are wlw but still wheres the poc love
## 4: oh man the amount of my life i wasted playing that game at the arcade it was the best though
## 5:                     revival day a faithful believer with a pressing problem matthew dr booth
##    noofcharacters nooflines noofwords sentimentscore overallsentiment
## 1:             40         1         5     0.26832816                1
## 2:             62         1        12     0.14433757                1
## 3:             49         1        11     0.50880039                1
## 4:             92         1        20     0.07826238                1
## 5:             72         1        12     0.31754265                1
##    noof_1_grams noof_2_grams noof_3_grams
## 1:            5            4            3
## 2:           12           11           10
## 3:           11           10            9
## 4:           20           19           18
## 5:           12           11           10

# Same feature extraction for the blog lines. The text column is named
# `tweets` here as well because tokenization() renames it.
df_b <- tokenization(df_b)
head(df_b,n=5)

##                                                                                                                                                                                                         tweets
## 1:                        non police readers who want to know more about what a duty inspector does or police officers for that matter should read the bottom few paragraphs which outline the role a bit more
## 2:                                          awardwinner and the one who has given the prize have to show the link of arte y pico blog so everyone will know the origin of this award which is here arte y pico
## 3: being like a lot postmenopausal women i didnt know too much about what my body was up to i had had an easy time of menopause my cycle simply stopped no more periods no odd symptoms no hot flashes nothing
## 4:                 as these and a multitude of other familiar comforts have slotted into their usual places i have happily given in to mishymashy sentimentality oh come all ye ghosties joyful and triumphant
## 5:                                                                                                            thinking shes by his side only to finally realize hes not at home that hes in france and shes at
##    noofcharacters nooflines noofwords sentimentscore overallsentiment
## 1:            180         1        33    0.008703883                1
## 2:            163         1        34    0.265822807                1
## 3:            203         1        40    0.515451259                1
## 4:            187         1        31    0.871085715                1
## 5:             96         1        20    0.017888544                1
##    noof_1_grams noof_2_grams noof_3_grams
## 1:           33           32           31
## 2:           34           33           32
## 3:           40           39           38
## 4:           31           30           29
## 5:           20           19           18

# Same feature extraction for the news lines. The text column is named
# `tweets` here as well because tokenization() renames it.
df_n <- tokenization(df_n)
head(df_n,n=5)

##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         tweets
## 1:                                                                                                                                                                                                                                                                                  im proud of them because they basically achieved what everyone else is trying to achieve baez said everyone wants to play in college get a good education and basically go to school for free thats what theyre both doing
## 2:                                                                                                                                                                                                                                                                                                                                      st louis jessica p chamberlain valarie c frazier latosha a haney deanna l jones latonya r ming tramyra t nathan rosemary ruffin charity j thompson mary turnerstockard
## 3:                                                                                                                                                                                                                                                                                                                                                                                                                                             raffle tickets for fun prizes and light snacks will be for sale
## 4:                                                                                                                                                                                                                                                                                                                                                  we liked most of the reds robert robbie cooks la vieille ferme a basic cotes du rhone was the perfect table wine it was light and charming and for a steal
## 5: romney has rejected many of the positions some of which date to his us senate run against ted kennedy in massachusetts years ago he has said repeatedly in recent years that he supports the right to keep and bear arms and is prolife he has promised to repeal the national health care reform law while defending massachusetts saying it was a stateonly model and newspaper reports suggest romney closed tax loopholes and raised fees while cutting spending to bring more money into state coffers
##    noofcharacters nooflines noofwords sentimentscore overallsentiment
## 1:            218         1        39      0.4803845                1
## 2:            166         1        27      0.1539601                1
## 3:             63         1        12      0.3319764                1
## 4:            154         1        31      0.3412501                1
## 5:            491         1        84      0.3578773                1
##    noof_1_grams noof_2_grams noof_3_grams
## 1:           39           38           37
## 2:           27           26           25
## 3:           12           11           10
## 4:           31           30           29
## 5:           84           83           82

some exploratory analysis on the datasets.

# Bar charts of how many sampled lines fall into each overallsentiment code
# (0 = zero score, 1 = positive score, 2 = negative score), one per corpus.
g <- ggplot(df_b,aes(overallsentiment))
g+geom_bar()+ggtitle("sentiment of sentences in blogs")

g <- ggplot(df_n,aes(overallsentiment))
g+geom_bar() + ggtitle("sentiment of sentences in news")

g <- ggplot(df_t,aes(overallsentiment))
# Title typo fixed ("sentment" -> "sentiment").
g+geom_bar() + ggtitle("sentiment of sentences in twitter")

3_grams division and their counts

# Count the three-word n-grams in the sampled tweets and preview the five
# most frequent rows.
grams_t <- ngrams(df_t)
head(grams_t,n=5)

##                 words count   first  second thrid
## 1:               <NA>   988      NA      NA    NA
## 2:     thanks for the   306  thanks     for   the
## 3:         i love you   121       i    love   you
## 4: looking forward to   116 looking forward    to
## 5:       cant wait to   106    cant    wait    to

# Count the three-word n-grams in the sampled blog lines and preview the five
# most frequent rows.
grams_b <- ngrams(df_b)
head(grams_b,n=5)

##         words count first second thrid
## 1:       <NA>  1596    NA     NA    NA
## 2: one of the   501   one     of   the
## 3:   a lot of   429     a    lot    of
## 4: be able to   234    be   able    to
## 5: as well as   227    as   well    as

# Count the three-word n-grams in the sampled news lines and preview the five
# most frequent rows.
grams_n <- ngrams(df_n)
head(grams_n,n=5)

##           words count first second thrid
## 1:         <NA>   851    NA     NA    NA
## 2:   one of the   415   one     of   the
## 3:     a lot of   332     a    lot    of
## 4:   as well as   181    as   well    as
## 5: in the first   163    in    the first

some exploratory analysis on the data sets

# Keep the first 20 rows of each trigram table for plotting; ngrams() returns
# its rows sorted by descending count.
grams_b_1 <- grams_b[1:20,]
grams_n_1 <- grams_n[1:20,]
grams_t_1 <- grams_t[1:20,]
# Draw a word cloud of n-grams, sized by frequency.
#
# Args:
#   ngrams: table with the columns `words` (text to draw) and `count`
#           (frequency used for sizing).
words_cloud <- function(ngrams) {
  cloud_colours <- brewer.pal(8, "Dark2")
  wordcloud(
    words = ngrams$words,
    freq = ngrams$count,
    scale = c(3, 0.4),
    colors = cloud_colours
  )
}

word cloud of the blogs of top 20 3_grams

word cloud of the news of top 20 3_grams

word cloud of the tweets of top 20 3_grams

some barplots of the 3-grams on the data set

About the prediction model and building the Shiny app

Based on my understanding and learning so far, I am planning to use a Markov chain transition model with parameters estimated using Katz back-off and Katz probabilities. The model will be built for next-word prediction on sentences, words and phrases, and it will return multiple predictions.

For the Shiny app we need to work with a sample of the data rather than the entire population, because the server has to be able to handle the load.

project_report_upto_explonatory_analysis

Srinithin

7/14/2020

Project Overview