This report presents the exploratory analysis of the training data for the Data Science Capstone project. The goal is to understand the dataset and prepare for building a prediction model and Shiny application.
# Read the three en_US text files into character vectors (one element per
# line of the file).
# NOTE(review): the working directory is hard-coded to a machine-specific
# path; the relative file paths below are resolved against it.
setwd(dir = "C:/Users/91962/OneDrive/Desktop")

blogs <- readLines(
  con = "final/en_US/en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

news <- readLines(
  con = "final/en_US/en_US.news.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

twitter <- readLines(
  con = "final/en_US/en_US.twitter.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
# Count whitespace-delimited words across all elements of a character vector.
#
# x: character vector (for example, the lines of a text file).
# Returns the total number of tokens over all elements as a single integer;
# 0L for zero-length input.
word_count <- function(x) {
  # strsplit() emits a leading "" token when a string starts with a
  # separator, so strip leading whitespace first; otherwise indented or
  # whitespace-only lines would each contribute a phantom word.
  trimmed <- sub("^\\s+", "", x)
  # lengths() always yields an integer vector, even for empty input, whereas
  # sapply(list(), length) yields list() and makes sum() fail.
  sum(lengths(strsplit(trimmed, "\\s+")))
}
# Total number of words in each corpus.
blogs_words <- word_count(blogs)
news_words <- word_count(news)
twitter_words <- word_count(twitter)

# Summary table with one row per corpus: its line count and word count.
data_summary <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Lines = lengths(list(blogs, news, twitter)),
  Words = c(blogs_words, news_words, twitter_words)
)

# Print the summary table.
data_summary
##   Dataset   Lines    Words
## 1   Blogs  899288 37334131
## 2    News 1010206 34371031
## 3 Twitter 2360148 30373583
# Number of characters in each blogs line, then a histogram of those counts.
blog_chars <- nchar(blogs, type = "chars")
hist(
  blog_chars,
  main = "Blogs Character Count",
  xlab = "Characters per line"
)