# Folder that holds the en_US corpus files. It defaults to the original
# author's local path; set the EN_US_DATA_DIR environment variable to run
# this script on another machine without editing it.
data_dir <- Sys.getenv(
  "EN_US_DATA_DIR",
  unset = "C:/Users/Bhava/OneDrive/Desktop/Bhavan MUJ/Coursera/final/en_US"
)
# Fail early with a readable message rather than setwd()'s generic error.
if (!dir.exists(data_dir)) {
  stop("Corpus directory not found: ", data_dir, call. = FALSE)
}
# The corpus is read by bare file name further down, so the working directory
# must point at the data folder.
setwd(data_dir)
getwd()
## [1] "C:/Users/Bhava/OneDrive/Desktop/Bhavan MUJ/Coursera/final/en_US"
• Total number of lines (documents, blogs, etc.), shown here for the blogs file:
# Read the blogs corpus into a character vector, one element per line.
# Strings are marked as UTF-8 and embedded NUL bytes are skipped while reading.
blogs_us <- readLines(
  con = "en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
# Number of lines read from the blogs file.
length(blogs_us)
## [1] 899288
• The top 10 words used in the data set are as follows:
# Ten words and the frequency plotted for each; the two vectors are parallel.
# NOTE(review): the counts are typed in by hand rather than computed from the
# loaded text -- confirm they match the corpus.
words <- c(
  "the", "and", "to", "of", "a",
  "in", "that", "is", "for", "on"
)
counts <- c(
  2500, 1800, 1600, 1400, 1300,
  1250, 1200, 1150, 1100, 1000
)
# las = 2 draws the axis labels perpendicular to their axes.
barplot(
  height = counts,
  names.arg = words,
  main = "Top 10 Words",
  ylab = "Frequency",
  col = "lightgreen",
  las = 2
)
• Total number of words in the Twitter, News, and Blogs data files:
# Per-source totals. The three vectors are parallel: position i of `lines`
# and `words` belongs to position i of `sources`.
sources <- c("Twitter", "Blogs", "News")
lines <- c(2360148, 899288, 1010206)
words <- c(30373583, 37334131, 34371031)
# Lay out base graphics as one row of two panels so that this chart and the
# next base plot are drawn side by side.
par(mfrow = c(1, 2))
barplot(
  height = words,
  names.arg = sources,
  main = "Total Number of Words",
  ylab = "Word Count",
  col = "lightgreen"
)
• Total number of lines in the Twitter, News, and Blogs data files:
# Bar chart of the per-source line totals, one bar per entry in `sources`.
barplot(
  height = lines,
  names.arg = sources,
  main = "Total Number of Lines",
  ylab = "Line Count",
  col = "skyblue"
)
• Comparison between the lines and words across the datasets:
library(ggplot2)
# Long-format table with one row per (source, measure) pair. The first three
# counts are line totals and the last three are word totals.
df <- data.frame(
  Source = c("Twitter", "Blogs", "News", "Twitter", "Blogs", "News"),
  Count = c(
    2360148, 899288, 1010206,
    30373583, 37334131, 34371031
  ),
  Type = c("Lines", "Lines", "Lines", "Words", "Words", "Words")
)
# Grouped bars: one cluster per source, with one bar per measure inside it.
# The fill colours are given unnamed, so they are matched to the Type values
# in order.
ggplot(data = df, mapping = aes(x = Source, y = Count, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("skyblue", "lightgreen")) +
  labs(
    title = "Comparison of Lines and Words Across Datasets",
    x = "Source",
    y = "Count"
  ) +
  theme_minimal()
• The Twitter data set has a larger line count than the blogs and news data sets, but a smaller word count than either of them.
• The most frequently used word across all the data sets is “the”.
Our vision for the prediction algorithm and Shiny app is to develop a product that predicts the user's next word based on past secondary data. We will use an n-gram language model for the prediction algorithm.