## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
We will build a predictive model for text by understanding the distribution of, and the relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare to build the first linguistic models. In this exercise we will carry out two tasks. Exploratory analysis: understand the distribution of words and the relationships between the words in the corpora. Understand frequencies of words and word pairs: build figures and tables to understand the variation in the frequencies of words and word pairs in the data.
Date - 06/08/2020
# Set knitr's root directory (the directory in which code chunks are evaluated)
# to the en_US corpus folder. NOTE(review): this is a hard-coded absolute
# Windows path, so it only resolves on a machine that has this exact directory.
knitr::opts_knit$set(root.dir = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/DataProducts/Coursera-SwiftKey/final/en_US")
There are four language databases in the dataset. We will use the English database for the exploratory analysis.
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.news.txt"): incomplete final
## line found on 'C:/Sumeet/Visa/Knowledge/Coursera/datascience/DataProducts/
## Coursera-SwiftKey/final/en_US/en_US.news.txt'
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 167155
## appears to contain an embedded nul
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 268547
## appears to contain an embedded nul
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(con = "C:/Sumeet/Visa/Knowledge/Coursera/datascience/
## DataProducts/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"): line 1759032
## appears to contain an embedded nul
## [1] 899288
## [1] "Number of line in the n_US.blogs.txt 899288"
## [1] 210160014
## [1] "Number of line in the en_US.news.txt 2360148"
## [1] 205811889
## [1] "Number of line in the en_US.twitter.txt 77259"
## [1] 167105338
## [1] "Language is english"
Go over each line and convert it into a table of words, then aggregate the tables to find the most-used words. A similar process is followed for word pairs. Since the data set is huge, we will analyze only the first 10000 lines.
# Build the word, word-pair (2-gram) and word-triple (3-gram) tables from the
# first `max_lines` lines of `data`.
#
# Results (each a data.frame with a single character column `V1`):
#   worddf      - one row per space-separated token
#   wordpairdf  - one row per pair of adjacent tokens
#   wordpairdf2 - one row per run of three adjacent tokens
#
# Changes compared with the previous loop-based version:
#   * n-grams are only built from positions that have enough following tokens,
#     so no entry is padded with "NA" past the end of a line.
#   * at most length(data) lines are read, so a short input no longer
#     contributes NA lines.
#   * the tables are built in one pass instead of calling rbind() once per
#     token, which also removes the failure when the first line had no tokens.

max_lines <- 10000
l <- length(data)
ncount <- 0 # not used in this section; kept in case later chunks read it
n_lines <- min(max_lines, l)

# One character vector of tokens per line; lines are split on a single space.
tokens_by_line <- strsplit(data[seq_len(n_lines)], " ")

# Return every run of `n` adjacent tokens of one line, joined into one string.
# Lines with fewer than `n` tokens yield character(0).
make_ngrams <- function(tokens, n) {
  n_starts <- length(tokens) - n + 1
  if (n_starts < 1) {
    return(character(0))
  }
  starts <- seq_len(n_starts)
  grams <- tokens[starts]
  for (offset in seq_len(n - 1)) {
    # NOTE(review): paste() with its default separator around a " " literal
    # joins tokens with three spaces. This matches the strings produced by
    # the previous version and is kept so later chunks see the same values.
    grams <- paste(grams, " ", tokens[starts + offset])
  }
  grams
}

# Flatten a list of character vectors into a one-column data.frame named V1.
# as.character() keeps the column present even when there are no entries.
as_v1_frame <- function(pieces) {
  data.frame(V1 = as.character(unlist(pieces)), stringsAsFactors = FALSE)
}

# words list
worddf <- as_v1_frame(tokens_by_line)
# word pairs
wordpairdf <- as_v1_frame(lapply(tokens_by_line, make_ngrams, n = 2))
# word triples
wordpairdf2 <- as_v1_frame(lapply(tokens_by_line, make_ngrams, n = 3))
## V1 N
## 1: How 93
## word N
## 1: the 3465
## 2: to 3138
## 3: I 2579
## 4: a 2401
## 5: you 1747
## 6: and 1675
## 7: for 1548
## 8: of 1475
## 9: in 1458
## 10: is 1420
## wordpair N
## 1: in the 309
## 2: for the 300
## 3: :) NA 266
## 4: of the 238
## 5: RT : 221
## 6: to be 208
## 7: on the 176
## 8: to the 168
## 9: at the 133
## 10: Thanks for 131
Below is the analysis of three words appearing together and their frequency. The plot shows the data for the top 10.
## wordpair2 N
## 1: Thanks for the 74
## 2: thanks for the 32
## 3: going to be 29
## 4: I want to 29
## 5: Thank you for 29
## 6: I have a 24
## 7: to be a 24
## 8: to see you 22
## 9: a lot of 21
## 10: you want to 21
We created a dictionary of words, word pairs (2-grams), and word triples (3-grams). These help us predict the next word when the user types any word. The frequencies of the 2-grams and 3-grams will help us define the weights.