Milestone Report

What was I asked for?

The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to: 1. Demonstrate that you’ve downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you have amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Loading libraries and reading the data

First of all, I load the required libraries.

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.2

library(stringi)

## Warning: package 'stringi' was built under R version 4.2.2

library(knitr)

## Warning: package 'knitr' was built under R version 4.2.2

library(tm)

## Warning: package 'tm' was built under R version 4.2.3

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.2.3

## Loading required package: RColorBrewer

The data were downloaded from the given link and stored locally. The files corresponding to US English were then read in as “blogs”, “news” and “twitter”.

# Locations of the three US English source files inside the local copy of the
# SwiftKey data set.
data_dir <- file.path(".", "Coursera-SwiftKey", "final", "en_US")
path_blogs <- file.path(data_dir, "en_US.blogs.txt")
path_news <- file.path(data_dir, "en_US.news.txt")
path_twitter <- file.path(data_dir, "en_US.twitter.txt")

# Read all lines of a UTF-8 text file through a binary-mode connection.
#
# A text-mode read of the news file ended early: it raised an "incomplete
# final line" warning and returned only 77,259 lines for a file of roughly
# 196 MB. Presumably an embedded control character was taken as end-of-file
# (TODO confirm on the affected platform); a binary-mode connection makes
# readLines() continue to the real end of the file.
#
# path: path of the file to read.
# Returns a character vector with one element per line; embedded NUL bytes
# are skipped.
read_all_lines <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_all_lines(path_blogs)
news <- read_all_lines(path_news)

## Warning in readLines(path_news, encoding = "UTF-8", skipNul = TRUE): incomplete
## final line found on './Coursera-SwiftKey/final/en_US/en_US.news.txt'

# Load the tweets as UTF-8 text, one element per line, skipping embedded NULs.
twitter <- readLines(con = path_twitter, skipNul = TRUE, encoding = "UTF-8")

Summary of the data

To summarize the data, the following table is made. It shows the size of each file together with its line, character and word counts.

# Per-file summary: size on disk plus line, character and word counts.
# Every column lists the sources in the order blogs, news, twitter.

# Size of each file in bytes.
file_bytes <- file.size(c(path_blogs, path_news, path_twitter))

# One column per source; rows are the statistics from stri_stats_general().
general_counts <- sapply(list(blogs, news, twitter), stri_stats_general)

# Row 4 of the stri_stats_latex() result is the one reported as WordCount.
word_counts <- sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]

stats <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  FileSizeinMB = file_bytes / 1024^2,
  # Transpose so that each source becomes a row of the table.
  t(rbind(general_counts, WordCount = word_counts))
)
kable(stats)

FileName	FileSizeinMB	Lines	LinesNEmpty	Chars	CharsNWhite	WordCount
en_US.blogs	200.4242	899288	899288	206824382	170389539	37570839
en_US.news	196.2775	77259	77259	15639408	13072698	2651432
en_US.twitter	159.3641	2360148	2360148	162096241	134082806	30451170

Sample data

Given the size of the files, only a 1% random sample of each one is used, to keep the code efficient.

# Fix the RNG seed so that the same sample is drawn on every run.
set.seed(660067)

# Fraction of the lines kept from each source.
sampleSize <- 0.01

# Draw a random sample of lines without replacement and drop non-ASCII text.
#
# lines:    character vector with one line of text per element.
# fraction: share of the lines to keep, between 0 and 1.
# Returns the sampled lines with every non-ASCII character removed.
sample_ascii_lines <- function(lines, fraction) {
  # The number of draws must be a whole number, so round down explicitly.
  picked <- sample(lines, floor(length(lines) * fraction), replace = FALSE)
  # The text was read as UTF-8, so that is the source encoding to declare;
  # anything that cannot be represented in ASCII is replaced by "".
  iconv(picked, from = "UTF-8", to = "ASCII", sub = "")
}

# Sample the three sources in a fixed order (blogs, news, twitter) so the
# draws stay reproducible for the seed above.
sampleBlogs <- sample_ascii_lines(blogs, sampleSize)
sampleNews <- sample_ascii_lines(news, sampleSize)
sampleTwitter <- sample_ascii_lines(twitter, sampleSize)

# Combine the three samples into one data set and write it to disk.
# Passing the path lets writeLines() open and close the file itself.
sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)
sampleDataFileName <- "Coursera-SwiftKey/final/en_US/en_US.sample.txt"
writeLines(sampleData, sampleDataFileName)

# Number of lines and words in the combined sample.
sampleDataLines <- length(sampleData)
sampleDataWords <- sum(stri_count_words(sampleData))

# Drop the large objects that are no longer needed to free up memory.
rm(blogs, news, twitter, sampleBlogs, sampleNews, sampleTwitter)

Corpus

# Path of the sample file written in the sampling step.
Sample <- file.path(
  ".", "Coursera-SwiftKey", "final", "en_US", "en_US.sample.txt"
)
stopifnot(file.exists(Sample))

# Build the corpus from the sample file only. The directory also holds the
# full blogs, news and twitter files, so an unfiltered DirSource() would load
# all of them and defeat the purpose of sampling.
corpus <- Corpus(
  DirSource(dirname(Sample), pattern = "^en_US\\.sample\\.txt$"),
  readerControl = list(reader = readPlain)
)

#Create function to transform the data
# Remove web addresses from the elements of x.
#
# A match starts at "http" or "www." and runs to the next whitespace, so the
# "://host/path" part is removed together with the scheme instead of being
# left behind in the text.
#
# x: character vector.
# Returns x with every such token replaced by "".
removeURL <- function(x) {
  gsub("(http|www\\.)[^[:space:]]*", "", x)
}
# Strip every punctuation character from the elements of x.
removeSign <- function(x) {
  gsub(pattern = "[[:punct:]]", replacement = "", x = x)
}
# Strip every digit from the elements of x.
removeNum <- function(x) {
  gsub(pattern = "[[:digit:]]", replacement = "", x = x)
}
# Strip every straight apostrophe (') from the elements of x.
removeapo <- function(x) {
  gsub(pattern = "'", replacement = "", x = x)
}
# Convert the elements of x to ASCII, replacing whatever cannot be converted
# with an empty string.
removeNonASCII <- function(x) {
  iconv(x, from = "latin1", to = "ASCII", sub = "")
}
# Shorten any run of three or more identical letters to exactly two of them.
removerepeat <- function(x) {
  gsub(
    pattern = "([[:alpha:]])\\1{2,}",
    replacement = "\\1\\1",
    x = x
  )
}
# Convert the elements of x to lower case.
#
# tolower() is already vectorized. Calling it directly returns a plain
# character vector of the same length as x, without the input strings
# attached as names and without an empty input turning into a list.
#
# x: character vector.
# Returns x in lower case.
toLowerCase <- function(x) {
  tolower(x)
}
# Collapse every run of whitespace in the elements of x into one space.
removeSpace <- function(x) {
  gsub(pattern = "\\s+", replacement = " ", x = x)
}

# Transform the corpus.
#
# The order of the steps matters:
#   * URLs go first, while their digits and punctuation are still intact.
#   * Stop words are removed while apostrophes are still present, so that
#     contractions in the stop-word list (e.g. "don't") can match.
#   * Whitespace is collapsed last, because the earlier removals leave gaps.
corpus <- tm_map(corpus, content_transformer(removeURL))      # web addresses
corpus <- tm_map(corpus, content_transformer(removeNonASCII)) # non-ASCII text
corpus <- tm_map(corpus, content_transformer(toLowerCase))    # to lower case
corpus <- tm_map(corpus, content_transformer(removerepeat))   # "soooo" -> "soo"
corpus <- tm_map(corpus, removeWords, stopwords("english"))   # common words
corpus <- tm_map(corpus, content_transformer(removeapo))      # apostrophes
corpus <- tm_map(corpus, content_transformer(removeNum))      # digits
corpus <- tm_map(corpus, content_transformer(removeSign))     # all punctuation
corpus <- tm_map(corpus, content_transformer(removeSpace))    # extra whitespace

Word frequency plot

The frequencies with which words appear in the provided data are shown in a frequency plot.

# Count how often each term occurs across the corpus. The matrix is built
# once; repeating the computation would only redo the same work.
corpus_tdm <- TermDocumentMatrix(corpus)
corpus_tdm_m <- as.matrix(corpus_tdm)
corpus_tdm_m_freq <- sort(rowSums(corpus_tdm_m), decreasing = TRUE)

# Plot the 25 most frequent terms. head() returns fewer elements when the
# corpus has fewer than 25 terms, instead of padding the plot with NA bars.
barplot(
  head(corpus_tdm_m_freq, 25),
  las = 2,
  main = "Word Frequency in the data"
)

Wordcloud

A word cloud of the most frequently used terms in the data sets is also shown.

# 32 shades interpolated from the 9-colour "Blues" palette.
blue_shades <- colorRampPalette(brewer.pal(9, "Blues"))(32)

# Draw the word cloud for the corpus, limited to 75 words.
wordcloud(
  corpus,
  scale = c(3, .3),
  max.words = 75,
  random.order = TRUE,
  rot.per = .15,
  colors = blue_shades
)