Quick review of statistics

The following text files are used to prepare for the project:
en_US.blogs.txt
en_US.news.txt
en_US.twitter.txt
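
A quick check of the file sizes confirms that these corpora are large. A minimal sketch, assuming the three files sit in the working directory:

# Print the size of each corpus file in megabytes
# (assumes the files are in the current working directory)
for (f in c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")) {
   cat(f, ":", round(file.size(f) / 1024^2, 1), "MB\n")
}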

Line and word count

The first step is to get quick statistics on these files.
The following code calculates the number of lines and the number of words in each file.

Files <- c("blogs", "news", "twitter")
Stats <- as.data.frame(paste("en_US.", Files, sep = ""))
for (i in 1:3) {
   conn <- file(paste("en_US.", Files[i], ".txt", sep = ""), "r")
   nline <- 0
   nword <- 0
   # Read one line at a time to keep memory usage low
   while (TRUE) {
      oneline <- readLines(conn, 1, skipNul = TRUE)
      if (length(oneline) == 0) { break }  # end of file
      nline <- nline + 1
      # Split on runs of non-word characters to count the words
      nword <- nword + length(strsplit(oneline, "\\W+")[[1]])
   }
   Stats[i,2] <- nline
   Stats[i,3] <- nword
   close(conn)
}
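
Reading one line at a time is slow in R. A vectorised sketch, assuming each file fits in memory, computes the same counts in a single pass per file:

# Vectorised alternative (assumes each file fits in memory)
counts <- sapply(Files, function(f) {
   lines <- readLines(paste("en_US.", f, ".txt", sep = ""), skipNul = TRUE)
   c(lines = length(lines),
     words = sum(lengths(strsplit(lines, "\\W+"))))
})
counts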

Sample data

Since these files are big, we can create a sample file for each of them:
1. Sampleblogs.txt
2. Samplenews.txt
3. Sampletwitter.txt
These files are created by extracting 10,000 random lines from the corresponding files.
The script used for the news file is shown below (1010242 is its line count); a more efficient variant is sketched after it.
ssize <- 10000
# Start with an empty output file
writeLines("", con = "Course10/Samplenews.txt", sep = "", useBytes = FALSE)
# Draw ssize random line numbers (en_US.news.txt has 1010242 lines)
Random <- floor(runif(n = ssize, min = 1, max = 1010242))
for (i in 1:ssize) {
   conn <- file("Course10/en_US.news.txt", "r")
   # Skip the first Random[i]-1 lines, then read the chosen line
   temp <- readLines(conn, Random[i] - 1, skipNul = TRUE)
   oneline <- readLines(conn, 1, skipNul = TRUE)
   write(oneline, file = "Course10/Samplenews.txt", sep = "", append = TRUE)
   close(conn)
}
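
Re-opening the file and skipping Random[i]-1 lines for every draw re-reads the file 10,000 times. A minimal alternative sketch, assuming en_US.news.txt fits in memory, reads it once and samples the line indices directly; unlike runif, sample() without replacement also guarantees no line is drawn twice:

# Read the whole file once, then keep 10,000 distinct random lines
# (assumes en_US.news.txt fits in memory)
all_lines <- readLines("Course10/en_US.news.txt", skipNul = TRUE)
sampled <- all_lines[sample(length(all_lines), size = 10000)]
writeLines(sampled, con = "Course10/Samplenews.txt")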

Sampling the sample

The next step is to identify all words in the sample data and analyse their frequencies.
We store these words as follows:
Bwords : all words from the blogs sample
Nwords : all words from the news sample
Twords : all words from the twitter sample
While extracting these words, empty strings ("") were removed from the list.
Also removed were the words containing an underscore ('_').

for (i in 1:3) {
   conn <- file(paste("Sample", Files[i], ".txt", sep = ""), "r")
   allwords <- vector()
   while (TRUE) {
      oneline <- readLines(conn, 1, skipNul = TRUE)
      if (length(oneline) == 0) { break }
      allwords <- c(allwords, strsplit(oneline, "\\W+")[[1]])
   }
   close(conn)
   allwords <- allwords[allwords != ""]          # drop empty strings
   allwords <- allwords[!grepl("_", allwords)]   # drop words containing "_"
   if (i == 1) { Bwords <- allwords }
   if (i == 2) { Nwords <- allwords }
   if (i == 3) { Twords <- allwords }
   Stats[i,4] <- length(allwords)                # total words in sample
   Stats[i,5] <- length(unique(allwords))        # unique words in sample
}

Here are the calculated statistics:

names(Stats) <- c("File", "Lines", "Words", "Sample_Words", "Unique_Sample_Words")
Stats
##            File   Lines    Words Sample_Words Unique_Sample_Words
## 1   en_US.blogs  899288 38635978       425416               36463
## 2    en_US.news 1010242 35939068       355244               33268
## 3 en_US.twitter 2360148 31231111       132029               19094

Histogram of word frequencies

hist(table(Bwords), main = "Histogram of blogs words", xlab = "number of words")

hist(table(Nwords), main = "Histogram of news words", xlab = "number of words")

hist(table(Twords), main = "Histogram of twitter words", xlab = "number of words")

The reason the histograms look like this becomes clear when we look at the words themselves.

head(sort(table(Bwords), decreasing = TRUE), 50)
## Bwords
##   the    to   and     a    of     I    in  that    is    it   for     s   you 
## 18139 11587 11335  9602  9583  9250  6136  4912  4674  3994  3827  3464  3131 
##   was  with     â    on    my    be  have  this    as   The   are     t    at 
##  3099  3094  3015  2761  2650  2365  2269  2225  2142  2104  2101  2025  1817 
##   not   but    or    we  from   all    me    by    so  they    an   can  will 
##  1780  1719  1594  1531  1502  1439  1434  1387  1379  1285  1250  1228  1220 
##   out about   one    he   had    up  your   his   her  like their 
##  1179  1168  1168  1155  1137  1136  1097  1058  1057  1048  1014
head(sort(table(Nwords), decreasing = TRUE), 50)
## Nwords
##   the    to   and     a    of    in     s   for  that    is    on  said   The 
## 17057  9016  8510  8433  7630  6409  4111  3316  3301  2797  2640  2499  2440 
##  with   was    at    it     I    he    as   his    be  from  have     â   are 
##  2410  2321  2037  1979  1962  1752  1740  1517  1514  1495  1473  1386  1359 
##    by   has    an     t   not   who  will  they  this   you    or   but about 
##  1320  1217  1185  1182  1116  1079  1061  1044   939   912   893   868   864 
##   had their    up   one  more   out  year    He    It would  were 
##   819   808   808   801   799   778   775   768   757   729   696
head(sort(table(Twords), decreasing = TRUE), 50)
## Twords
##   the     I    to     a   you   and   for    in    of    is    it     s    on 
##  3629  3360  3328  2475  2043  1713  1674  1567  1521  1393  1312  1306  1169 
##    my  that     t    be    me    at  with  have  your   are  this     m   can 
##  1062   997   901   766   765   763   750   651   639   632   596   558   488 
##    so    up  just   all   was   out  like     i    we   but   not    do   get 
##   484   478   474   472   470   466   444   443   443   434   423   420   418 
##   The    RT about  will  love  what     u  know  from    or   day 
##   377   373   352   341   335   330   328   326   325   320   318
hist(head(sort(table(Bwords), decreasing = TRUE), 100), main = "Histogram of top 100 blogs words", xlab = "Number of Words")

hist(head(sort(table(Nwords), decreasing = TRUE), 100), main = "Histogram of top 100 news words", xlab = "Number of Words")

hist(head(sort(table(Twords), decreasing = TRUE), 100), main = "Histogram of top 100 twitter words", xlab = "Number of Words")
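
These distributions are heavily skewed: a handful of common function words ("the", "to", "and", ...) accounts for a large share of all occurrences, the Zipf-like pattern typical of natural language. A minimal sketch of a rank-frequency plot on log-log axes, reusing the Bwords vector from above, shows this more clearly than the raw histograms:

# Rank-frequency plot on log-log axes for the blogs sample
freqs <- sort(table(Bwords), decreasing = TRUE)
plot(seq_along(freqs), as.numeric(freqs), log = "xy",
     xlab = "Rank", ylab = "Frequency",
     main = "Rank-frequency of blogs words (log-log)")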

Next steps

  1. Further analysis of the sample data
  2. Identify needed words
  3. Eliminate unwanted words
  4. Develop a model
  5. Refine the sample words further as identified by the model
  6. Finalize the model