## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Introduction

We will be building a predictive model for text by understanding the distribution of, and the relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare to build the first linguistic models. In this exercise we will be doing two tasks: (1) Exploratory analysis - understanding the distribution of words and the relationships between the words in the corpora. (2) Understanding frequencies of words and word pairs - building figures and tables to understand variation in the frequencies of words and word pairs in the data.

Date - 06/08/2020

# Directory holding the en_US corpus files; used as knitr's root directory so
# that later chunks resolve paths relative to it. Set the SWIFTKEY_DATA_DIR
# environment variable to knit on another machine; when it is unset the
# original location below is used.
knitr::opts_knit$set(root.dir = Sys.getenv(
  "SWIFTKEY_DATA_DIR",
  unset = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/DataProducts/Coursera-SwiftKey/final/en_US"
))

Load the data from the blogs, news, and twitter files

There are four language databases in the dataset. We will be using the English database for the exploratory analysis.

## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.news.txt"): incomplete final
## line found on 'C:/Sumeet/Visa/Knowledge/Coursera/datascience/DataProducts/
## Coursera-SwiftKey/final/en_US/en_US.news.txt'
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 167155
## appears to contain an embedded nul
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 268547
## appears to contain an embedded nul
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 1759032
## appears to contain an embedded nul
## [1] 899288
## [1] "Number of line in the n_US.blogs.txt  899288"
## [1] 210160014
## [1] "Number of line in the en_US.news.txt  2360148"
## [1] 205811889
## [1] "Number of line in the en_US.twitter.txt  77259"
## [1] 167105338
## [1] "Language is english"

Analyze the data and find the unique words and pair

Go over each line and convert it into a table of words, then aggregate the tables to find the most used words. A similar process is followed for the word pairs. Since the data set is huge, we will be analyzing only the first 10000 lines.

# Build the word, word-pair and word-triple tables from the first lines of
# `data`.
#
# Input  : `data` - vector of text lines defined earlier in the file.
# Output : `worddf`      - one row per word,
#          `wordpairdf`  - one row per pair of adjacent words,
#          `wordpairdf2` - one row per run of three adjacent words.
# Each result is a data frame with a single character column named `V1`.

l <- length(data)
ncount <- 0  # not read in this chunk; kept in case a later chunk uses it

# The data set is huge, so only the first `max_lines` lines are analysed.
# Capping at `l` avoids indexing past the end of a shorter input.
max_lines <- 10000
sample_size <- min(max_lines, l)

# Words inside an n-gram are separated by three spaces: this is what
# paste(a, " ", b) yields with paste()'s default sep = " ", and it is kept so
# the n-gram keys keep the same format.
ngram_sep <- "   "

# Return every run of `n` consecutive entries of `words`, joined by
# `ngram_sep`. Only complete runs are produced: a line with fewer than `n`
# words yields character(0) instead of n-grams padded with NA.
make_ngrams <- function(words, n) {
  n_grams <- length(words) - n + 1L
  if (n_grams < 1L) {
    return(character(0))
  }
  starts <- seq_len(n_grams)
  parts <- lapply(seq_len(n) - 1L, function(offset) words[starts + offset])
  do.call(paste, c(parts, list(sep = ngram_sep)))
}

# Wrap a vector of strings in a one-column (`V1`) data frame. as.character()
# turns the NULL that unlist() returns for an empty list into character(0).
as_v1_frame <- function(values) {
  data.frame(V1 = as.character(values), stringsAsFactors = FALSE)
}

# Split every sampled line on single spaces in one vectorised call.
splitwords <- strsplit(data[seq_len(sample_size)], " ")

# Each table is assembled once from the per-line results rather than being
# grown row by row with rbind().

# words list
worddf <- as_v1_frame(unlist(splitwords, use.names = FALSE))

# word pairs (2-grams)
wordpairdf <- as_v1_frame(
  unlist(lapply(splitwords, make_ngrams, n = 2L), use.names = FALSE)
)

# word triples (3-grams)
wordpairdf2 <- as_v1_frame(
  unlist(lapply(splitwords, make_ngrams, n = 3L), use.names = FALSE)
)

Analyzing the data and plotting top words

##     V1  N
## 1: How 93
##     word    N
##  1:  the 3465
##  2:   to 3138
##  3:    I 2579
##  4:    a 2401
##  5:  you 1747
##  6:  and 1675
##  7:  for 1548
##  8:   of 1475
##  9:   in 1458
## 10:   is 1420

Analyzing the data and plotting top pair of words

##         wordpair   N
##  1:     in   the 309
##  2:    for   the 300
##  3:      :)   NA 266
##  4:     of   the 238
##  5:       RT   : 221
##  6:      to   be 208
##  7:     on   the 176
##  8:     to   the 168
##  9:     at   the 133
## 10: Thanks   for 131

Analyzing the data and plotting the top 3-grams (word triples)

Below is the analysis of three words appearing in conjunction and their frequency. The plot shows the data for the top 10.

##              wordpair2  N
##  1: Thanks   for   the 74
##  2: thanks   for   the 32
##  3:    going   to   be 29
##  4:      I   want   to 29
##  5:  Thank   you   for 29
##  6:       I   have   a 24
##  7:        to   be   a 24
##  8:     to   see   you 22
##  9:       a   lot   of 21
## 10:    you   want   to 21

Conclusion

We created a dictionary of words, word pairs (2-grams), and word triples (3-grams). These help us predict the next word when any word is typed. The frequencies of the 2-grams and 3-grams will help us define the weights.