Capstone Project Task 2

# Module 2 Milestone Report

## 1. Does the link lead to an HTML page describing the exploratory analysis of the training data set?

Loading data from local machine

library(stringi)
library(quanteda)

## Warning: package 'quanteda' was built under R version 4.2.3

## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated

## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "pcorMatrix" of class "replValueSp"; definition not updated

## Package version: 4.0.2
## Unicode version: 13.0
## ICU version: 69.1

## Parallel computing: 12 of 12 threads used.

## See https://quanteda.io for tutorials and examples.

# Loading the blogs and news data from the local machine.
# Each file is read through a connection opened in binary mode ("rb").
# NOTE(review): the plain text-mode readLines() call on the news file warned
# about an "incomplete final line" and returned far fewer lines than the other
# two files. This looks like the read stopping early at an embedded control
# character (text-mode reads on Windows can treat Ctrl-Z as end-of-file).
# Binary mode hands the raw bytes to readLines(), which still accepts LF, CRLF
# and CR line endings. Confirm the news line count after re-running.
data_dir <- "C:/Users/wmwabumba/Documents/R Code/data/Coursera-SwiftKey/final/en_US"

blogs_con <- file(file.path(data_dir, "en_US.blogs.txt"), open = "rb")
en_US.blogs.data <- readLines(blogs_con, encoding = "UTF-8", skipNul = TRUE)
close(blogs_con)

news_con <- file(file.path(data_dir, "en_US.news.txt"), open = "rb")
en_US.news.data <- readLines(news_con, encoding = "UTF-8", skipNul = TRUE)
close(news_con)

## Warning in readLines("C:/Users/wmwabumba/Documents/R
## Code/data/Coursera-SwiftKey/final/en_US/en_US.news.txt", :
## incomplete final line found on 'C:/Users/wmwabumba/Documents/R
## Code/data/Coursera-SwiftKey/final/en_US/en_US.news.txt'

# Loading the twitter data from the local machine.
# The file is read through a binary-mode ("rb") connection so that an embedded
# control character cannot be mistaken for end-of-file and cut the read short;
# readLines() still accepts LF, CRLF and CR line endings in this mode.
twitter_con <- file("C:/Users/wmwabumba/Documents/R Code/data/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", open = "rb")
en_US.twitter.data <- readLines(twitter_con, encoding = "UTF-8", skipNul = TRUE)
close(twitter_con)

## 2. Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?

Performing word counts and line counts, and placing them in a data table:

* counting the words
* counting the lines
* creating a data table

# Word counts per line for each text file.
# Each result is a vector of per-line counts; the totals are obtained later by
# summing these vectors when the summary table is built.

en_US.news.data.words <- stringi::stri_count_words(str = en_US.news.data)
en_US.blogs.data.words <- stringi::stri_count_words(str = en_US.blogs.data)
en_US.twitter.data.words <- stringi::stri_count_words(str = en_US.twitter.data)

# Number of lines in each file: readLines() returned one vector element per
# line, so the vector length is the line count.
en_US.news.data.lines <- length(x = en_US.news.data)
en_US.blogs.data.lines <- length(x = en_US.blogs.data)
en_US.twitter.data.lines <- length(x = en_US.twitter.data)

# Summary table with one row per corpus: size in MB, number of lines, total
# number of words, and the median and maximum number of characters per line.
# Note that the size column is the in-memory size of the R character vector
# (object.size), not the size of the file on disk.
bytes_per_mb <- 1024^2

# Characters per line, computed once per corpus and reused for median and max.
twitter_chars <- nchar(en_US.twitter.data)
blogs_chars <- nchar(en_US.blogs.data)
news_chars <- nchar(en_US.news.data)

data_table <- data.frame(
  Type = c("Twitter", "Blogs", "News"),
  File_size_in_MB = c(
    object.size(en_US.twitter.data) / bytes_per_mb,
    object.size(en_US.blogs.data) / bytes_per_mb,
    object.size(en_US.news.data) / bytes_per_mb
  ),
  Number_of_Lines = c(
    en_US.twitter.data.lines,
    en_US.blogs.data.lines,
    en_US.news.data.lines
  ),
  Number_of_Words = c(
    sum(en_US.twitter.data.words),
    sum(en_US.blogs.data.words),
    sum(en_US.news.data.words)
  ),
  Median_Characters = c(
    median(twitter_chars),
    median(blogs_chars),
    median(news_chars)
  ),
  Maximum_Characters = c(
    max(twitter_chars),
    max(blogs_chars),
    max(news_chars)
  )
)

knitr::kable(data_table)

Type	File_size_in_MB	Number_of_Lines	Number_of_Words	Median_Characters	Maximum_Characters
Twitter	318.98975	2360148	30093413	64	140
Blogs	255.35453	899288	37546250	156	40833
News	19.76917	77259	2674536	186	5760

## 3. Has the data scientist made basic plots, such as histograms to illustrate features of the data?

## 4. Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

Question 4 has further been addressed through comments in the code chunks.

# Pre-processing of the training datasets
# Draw a 20% random sample of the lines of each dataset. The seed makes the
# draw reproducible; the three sample() calls stay in this order because each
# one advances the random number generator.
set.seed(1234)
en_US.twitter.sample.data <- sample(en_US.twitter.data, size = length(en_US.twitter.data) * 0.2)
en_US.blogs.sample.data <- sample(en_US.blogs.data, size = length(en_US.blogs.data) * 0.2)
en_US.news.sample.data <- sample(en_US.news.data, size = length(en_US.news.data) * 0.2)

# Pool the three samples (twitter, blogs, news) into a single character vector,
# then remove the full datasets from the workspace.
tbn <- c(en_US.twitter.sample.data, en_US.blogs.sample.data, en_US.news.sample.data)
rm(en_US.twitter.data, en_US.blogs.data, en_US.news.data)

# Split every sampled line into tokens on single spaces and flatten the result.
tbn_corpus <- unlist(strsplit(tbn, " "))

# Strip every character that is neither alphanumeric nor a space from each
# token (punctuation and other symbols go; letters and digits are kept), then
# split on runs of spaces and flatten again.
tbn_corpus <- gsub("[^[:alnum:] ]", "", tbn_corpus)
tbn_corpus <- unlist(strsplit(tbn_corpus, " +"))

# Report the 20 most common words (unigrams) that are 7 or more characters
# long, together with their frequencies.
# Note: despite its name, top_ten_words holds up to 20 rows; the name is kept
# unchanged so that any other code referring to it keeps working.
long_words <- tbn_corpus[nchar(tbn_corpus) > 6]
top_ten_words <- as.data.frame(table(long_words))
names(top_ten_words) <- c("word", "freq")

# Most frequent first. head() returns fewer rows instead of NA-filled rows
# when there are fewer than 20 distinct words.
top_ten_words <- top_ten_words[order(top_ten_words$freq, decreasing = TRUE), ]
top_ten_words <- head(top_ten_words, 20)

# las = 3 draws the word labels perpendicular to the x-axis.
barplot(top_ten_words$freq, names.arg = top_ten_words$word, las = 3, main = "Top 20 used words")

# Distribution of the number of words per line for the twitter dataset.
# The x-axis is limited to the range 0-40 words.
hist(en_US.twitter.data.words,
     xlab = "Twitter File Distribution of words per line",
     main = "Twitter Dataset",
     xlim = c(0, 40), breaks = 20)

# Distribution of the number of words per line for the blogs dataset.
# The x-axis is limited to the range 0-40 words, so longer lines are not shown.
hist(en_US.blogs.data.words,
     xlab = "Blogs File Distribution of words per line",
     main = "Blogs Dataset",
     xlim = c(0, 40), breaks = 500)

# Distribution of the number of words per line for the news dataset.
# The x-axis is limited to the range 0-40 words, so longer lines are not shown.
hist(en_US.news.data.words,
     xlab = "News File Distribution of words per line",
     main = "News Dataset",
     xlim = c(0, 40), breaks = 500)