The following text files are used to prepare for the project:
en_US.blogs.txt
en_US.news.txt
en_US.twitter.txt
The first step is to compute some quick statistics on these files.
The following code calculates the number of lines and the number of words in each file:
# Count the lines and words of each corpus file.
# Files holds the base names of the corpus files (en_US.<name>.txt).
# Stats gets one row per file: column 1 is the file label, column 2 the number
# of lines and column 3 the number of words.
Files <- c("blogs", "news", "twitter")
Stats <- as.data.frame(paste0("en_US.", Files))
# Read in chunks rather than one line per call: the files hold millions of
# lines and a per-line readLines() call is very slow, while chunking still
# avoids loading a whole file into memory.
chunk_size <- 10000L
for (i in seq_along(Files)) {
  conn <- file(paste0("en_US.", Files[i], ".txt"), "r")
  nline <- 0
  nword <- 0
  # tryCatch(finally = ) closes the connection even when a read fails.
  tryCatch({
    repeat {
      chunk <- readLines(conn, n = chunk_size, skipNul = TRUE)
      if (length(chunk) == 0) {
        break
      }
      nline <- nline + length(chunk)
      # A "word" is a token between runs of non-word characters.
      # NOTE(review): a line that starts with a non-word character yields a
      # leading "" token, which is included in this count.
      nword <- nword + sum(lengths(strsplit(chunk, "\\W+")))
    }
  }, finally = close(conn))
  Stats[i, 2] <- nline
  Stats[i, 3] <- nword
}
Since these files are large, we create a sample data set for each of the above files and call them:
1. Sampleblogs.txt
2. Samplenews.txt
3. Sampletwitter.txt
These files are created by extracting 10,000 random lines from the corresponding files.
The script used for the news sample is shown below:
# Write a random sample of ssize lines from the news corpus to
# Course10/Samplenews.txt, one line per row, replacing any previous content.
ssize <- 10000
# Read the source once; reopening and re-reading the file from the start for
# every sampled line costs one pass over the corpus per draw.
all_lines <- readLines("Course10/en_US.news.txt", skipNul = TRUE)
# Sample line numbers from the actual line count, without replacement, so the
# last line is eligible and no line is drawn twice. min() guards against a
# source file with fewer than ssize lines.
Random <- sample.int(length(all_lines), size = min(ssize, length(all_lines)))
writeLines(all_lines[Random], con = "Course10/Samplenews.txt", sep = "\n",
           useBytes = FALSE)
The next step is to identify all words in the sample data and analyse their frequencies.
We store these words as follows:
Bwords : all words from blogs sample
Nwords : all words from news sample
Twords : all words from twitter sample
While extracting these words, empty strings ("") are removed from the list.
Words that contain an underscore ('_') are also removed.
# Build the word list of each sample file and record its size in Stats.
# For file i, column 4 of Stats gets the number of words kept and column 5 the
# number of distinct words. The word vectors are saved as Bwords (blogs),
# Nwords (news) and Twords (twitter).
for (i in seq_along(Files)) {
  # Read the sample in one call and split all lines at once; appending to a
  # vector with c() for every line copies the whole vector each time.
  sample_lines <- readLines(paste0("Sample", Files[i], ".txt"), skipNul = TRUE)
  # Tokens are the pieces between runs of non-word characters.
  # as.character() keeps the result a character vector for an empty file.
  # NOTE(review): tokens such as "â" among the most frequent words suggest the
  # text may be read under a non-UTF-8 encoding; consider
  # readLines(..., encoding = "UTF-8") - confirm against the source files.
  allwords <- as.character(unlist(strsplit(sample_lines, "\\W+")))
  # Drop the empty tokens produced when a line starts with a non-word character.
  allwords <- allwords[allwords != ""]
  # Drop every token that contains an underscore.
  allwords <- allwords[!grepl("_", allwords)]
  if (i == 1) {
    Bwords <- allwords
  }
  if (i == 2) {
    Nwords <- allwords
  }
  if (i == 3) {
    Twords <- allwords
  }
  Stats[i, 4] <- length(allwords)
  Stats[i, 5] <- length(unique(allwords))
}
Here are the calculated statistics:
# Label the five columns of Stats, then display the table.
Stats <- setNames(
  Stats,
  c("File", "Lines", "Words", "Sample_Words", "Unique_Sample_Words")
)
Stats
## File Lines Words Sample_Words Unique_Sample_Words
## 1 en_US.blogs 899288 38635978 425416 36463
## 2 en_US.news 1010242 35939068 355244 33268
## 3 en_US.twitter 2360148 31231111 132029 19094
# For each sample, draw a histogram of the per-word occurrence counts.
invisible(Map(
  function(words, title) {
    hist(table(words), main = title, xlab = "number of words")
  },
  list(Bwords, Nwords, Twords),
  c("Histogram of blogs words",
    "Histogram of news words",
    "Histogram of twitter words")
))
The reason the histograms look like this becomes clear when we look at the words themselves: a small number of very common words account for a large share of all occurrences.
# The 50 most frequent words in the blogs sample, most common first.
blog_freq <- sort(table(Bwords), decreasing = TRUE)
head(blog_freq, n = 50)
## Bwords
## the to and a of I in that is it for s you
## 18139 11587 11335 9602 9583 9250 6136 4912 4674 3994 3827 3464 3131
## was with â on my be have this as The are t at
## 3099 3094 3015 2761 2650 2365 2269 2225 2142 2104 2101 2025 1817
## not but or we from all me by so they an can will
## 1780 1719 1594 1531 1502 1439 1434 1387 1379 1285 1250 1228 1220
## out about one he had up your his her like their
## 1179 1168 1168 1155 1137 1136 1097 1058 1057 1048 1014
# The 50 most frequent words in the news sample, most common first.
news_freq <- sort(table(Nwords), decreasing = TRUE)
head(news_freq, n = 50)
## Nwords
## the to and a of in s for that is on said The
## 17057 9016 8510 8433 7630 6409 4111 3316 3301 2797 2640 2499 2440
## with was at it I he as his be from have â are
## 2410 2321 2037 1979 1962 1752 1740 1517 1514 1495 1473 1386 1359
## by has an t not who will they this you or but about
## 1320 1217 1185 1182 1116 1079 1061 1044 939 912 893 868 864
## had their up one more out year He It would were
## 819 808 808 801 799 778 775 768 757 729 696
# The 50 most frequent words in the twitter sample, most common first.
twitter_freq <- sort(table(Twords), decreasing = TRUE)
head(twitter_freq, n = 50)
## Twords
## the I to a you and for in of is it s on
## 3629 3360 3328 2475 2043 1713 1674 1567 1521 1393 1312 1306 1169
## my that t be me at with have your are this m can
## 1062 997 901 766 765 763 750 651 639 632 596 558 488
## so up just all was out like i we but not do get
## 484 478 474 472 470 466 444 443 443 434 423 420 418
## The RT about will love what u know from or day
## 377 373 352 341 335 330 328 326 325 320 318
# For each sample, draw a histogram of the occurrence counts of its 100 most
# frequent words.
invisible(Map(
  function(words, title) {
    top_counts <- head(sort(table(words), decreasing = TRUE), 100)
    hist(top_counts, main = title, xlab = "Number of Words")
  },
  list(Bwords, Nwords, Twords),
  c("Histogram of top 100 blogs words",
    "Histogram of top 100 news words",
    "Histogram of top 100 twitter words")
))