# Module 2 Milestone Report
## 1. Does the link lead to an HTML page describing the exploratory analysis of the training data set?
Loading data from local machine
library(stringi)
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.2.3
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "pcorMatrix" of class "replValueSp"; definition not updated
## Package version: 4.0.2
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
# Loading data from the local machine.
# Folder that holds the three en_US SwiftKey text files.
en_US.data.dir <- "C:/Users/wmwabumba/Documents/R Code/data/Coursera-SwiftKey/final/en_US"

# Read all lines of a UTF-8 text file through a binary-mode connection.
#   path: location of the text file.
# Returns a character vector with one element per line; embedded NUL bytes
# are skipped.
# Binary mode is used so that control characters embedded in the text are
# read as data instead of being interpreted by the platform's text mode.
# NOTE(review): reading en_US.news.txt by file name (text mode) warned
# "incomplete final line found" and produced only 77,259 lines, which looks
# like a read cut short by an embedded SUB (0x1A) character on Windows --
# confirm the news line count after re-running.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

en_US.blogs.data <- read_corpus_lines(file.path(en_US.data.dir, "en_US.blogs.txt"))
en_US.news.data <- read_corpus_lines(file.path(en_US.data.dir, "en_US.news.txt"))
en_US.twitter.data <- read_corpus_lines(file.path(en_US.data.dir, "en_US.twitter.txt"))
## 2. Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?
Performing word counts and line counts, and placing them in a data table:
* counting words
* counting lines
* creating a data table
# Line totals: every element of a dataset vector is one line of its file, so
# the vector length is the number of lines.
en_US.blogs.data.lines <- length(en_US.blogs.data)
en_US.news.data.lines <- length(en_US.news.data)
en_US.twitter.data.lines <- length(en_US.twitter.data)

# Word tallies: one word count per line of each dataset.
en_US.blogs.data.words <- stringi::stri_count_words(en_US.blogs.data)
en_US.news.data.words <- stringi::stri_count_words(en_US.news.data)
en_US.twitter.data.words <- stringi::stri_count_words(en_US.twitter.data)
# Summary table with one row per dataset: size, number of lines, number of
# words, and the median and maximum number of characters per line.
# File_size_in_MB comes from object.size(), i.e. the size of the character
# vector held in memory (bytes / 1024^2), not the size of the file on disk.

# Characters per line, computed once per dataset and reused for both the
# median and the maximum.
en_US.twitter.data.chars <- nchar(en_US.twitter.data)
en_US.blogs.data.chars <- nchar(en_US.blogs.data)
en_US.news.data.chars <- nchar(en_US.news.data)

data_table <- data.frame(
  Type = c("Twitter", "Blogs", "News"),
  File_size_in_MB = c(
    object.size(en_US.twitter.data) / (1024^2),
    object.size(en_US.blogs.data) / (1024^2),
    object.size(en_US.news.data) / (1024^2)
  ),
  Number_of_Lines = c(
    en_US.twitter.data.lines,
    en_US.blogs.data.lines,
    en_US.news.data.lines
  ),
  Number_of_Words = c(
    sum(en_US.twitter.data.words),
    sum(en_US.blogs.data.words),
    sum(en_US.news.data.words)
  ),
  Median_Characters = c(
    median(en_US.twitter.data.chars),
    median(en_US.blogs.data.chars),
    median(en_US.news.data.chars)
  ),
  Maximum_Characters = c(
    max(en_US.twitter.data.chars),
    max(en_US.blogs.data.chars),
    max(en_US.news.data.chars)
  )
)

# The per-line character counts were only needed to build the table.
rm(en_US.twitter.data.chars, en_US.blogs.data.chars, en_US.news.data.chars)

knitr::kable(data_table)
| Type | File_size_in_MB | Number_of_Lines | Number_of_Words | Median_Characters | Maximum_Characters |
|---|---|---|---|---|---|
| Twitter | 318.98975 | 2360148 | 30093413 | 64 | 140 |
| Blogs | 255.35453 | 899288 | 37546250 | 156 | 40833 |
| News | 19.76917 | 77259 | 2674536 | 186 | 5760 |
## 3. Has the data scientist made basic plots, such as histograms to illustrate features of the data?
## 4. Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?
# Pre-processing of the training datasets.
# Fix the random seed so the draws below can be repeated.
set.seed(1234)

# Keep a random 20% of the lines of each dataset. The draws are made in the
# order twitter, blogs, news.
en_US.twitter.sample.data <- sample(
  en_US.twitter.data,
  size = 0.2 * length(en_US.twitter.data)
)
en_US.blogs.sample.data <- sample(
  en_US.blogs.data,
  size = 0.2 * length(en_US.blogs.data)
)
en_US.news.sample.data <- sample(
  en_US.news.data,
  size = 0.2 * length(en_US.news.data)
)

# Stack the three samples (twitter, blogs, news) into one character vector.
tbn <- c(
  en_US.twitter.sample.data,
  en_US.blogs.sample.data,
  en_US.news.sample.data
)

# The full datasets are removed from the workspace once the samples exist.
rm(en_US.twitter.data, en_US.blogs.data, en_US.news.data)
# Summarizing the datasets.
# Break every sampled line into tokens at single spaces.
tbn_corpus <- unlist(strsplit(tbn, " "))
# Strip every character that is neither alphanumeric nor a space, i.e.
# punctuation marks and other symbols. Letters and digits are both kept.
tbn_corpus <- gsub("[^[:alnum:] ]", "", tbn_corpus)
# Drop the tokens left empty by the clean-up (tokens that consisted only of
# stripped characters).
tbn_corpus <- tbn_corpus[nzchar(tbn_corpus)]
# Report the 20 most frequent words (unigrams) that are at least 7 characters
# long, together with their frequencies.
# NOTE: the data frame keeps the name top_ten_words although it holds up to
# 20 rows.
top_ten_words <- as.data.frame(table(tbn_corpus[nchar(tbn_corpus) > 6]))
top_ten_words <- top_ten_words[order(top_ten_words$Freq, decreasing = TRUE), ]
# head() returns fewer rows instead of NA-filled rows when there are fewer
# than 20 distinct words.
top_ten_words <- head(top_ten_words, 20)
names(top_ten_words)[1:2] <- c("word", "freq")
# Bar chart of the frequencies; las = 3 draws the word labels vertically.
barplot(
  top_ten_words$freq,
  names.arg = top_ten_words$word,
  las = 3,
  main = "Top 20 used words"
)
# Histograms of the number of words per line for each dataset. The x axis of
# every plot is limited to 0-40 words per line.

# Distribution of the number of words for the twitter dataset
hist(
  en_US.twitter.data.words,
  xlab = "Twitter File Distribution of words per line",
  main = "Twitter Dataset",
  xlim = c(0, 40),
  breaks = 20
)
# Distribution of the number of words for the blogs dataset
hist(
  en_US.blogs.data.words,
  xlab = "Blogs File Distribution of words per line",
  main = "Blogs Dataset",
  xlim = c(0, 40),
  breaks = 500
)
# Distribution of the number of words for the news dataset
hist(
  en_US.news.data.words,
  xlab = "News File Distribution of words per line",
  main = "News Dataset",
  xlim = c(0, 40),
  breaks = 500
)