The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs ( http://rpubs.com/ ) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to: 1. Demonstrate that you’ve downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you have amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
First, the required libraries are loaded.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.2
library(stringi)
## Warning: package 'stringi' was built under R version 4.2.2
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.2
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.3
## Loading required package: RColorBrewer
The data were downloaded from the given link and stored locally; the files corresponding to US English were then read in as “blogs”, “news” and “twitter”.
# Locations of the three US-English source files.
en_us_dir <- file.path(".", "Coursera-SwiftKey", "final", "en_US")
path_blogs <- file.path(en_us_dir, "en_US.blogs.txt")
path_news <- file.path(en_us_dir, "en_US.news.txt")
path_twitter <- file.path(en_us_dir, "en_US.twitter.txt")

# Read every line of a UTF-8 text file.
#
# path: path of the file to read.
# Returns a character vector with one element per line; embedded NULs are
# skipped.
#
# The connection is opened in binary mode on purpose. Reading the news file by
# path produced an "incomplete final line" warning and only 77,259 lines for a
# ~196 MB file, i.e. the read stopped early. This is most likely caused by a
# control character in the file being treated as end-of-file in text mode
# (NOTE(review): confirm on the target platform). In binary mode no byte is
# interpreted as end-of-file.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines(path_blogs)
news <- read_text_lines(path_news)
twitter <- read_text_lines(path_twitter)
The following table summarizes the three files: their size on disk (in MB) together with their line, character, and word counts.
# Summary table: one row per source file with its size on disk and the
# line / character / word counts reported by stringi.
source_texts <- list(blogs, news, twitter)
source_paths <- c(path_blogs, path_news, path_twitter)

# 4 x 3 matrix; rows are Lines, LinesNEmpty, Chars, CharsNWhite.
general_counts <- vapply(source_texts, stri_stats_general, integer(4))

# "Words" is the fourth element returned by stri_stats_latex().
word_counts <- vapply(
  source_texts,
  function(txt) stri_stats_latex(txt)[["Words"]],
  integer(1)
)

stats <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  FileSizeinMB = file.size(source_paths) / 1024^2,
  # Transposed so that each statistic becomes a column.
  t(general_counts),
  WordCount = word_counts
)
kable(stats)
| FileName | FileSizeinMB | Lines | LinesNEmpty | Chars | CharsNWhite | WordCount |
|---|---|---|---|---|---|---|
| en_US.blogs | 200.4242 | 899288 | 899288 | 206824382 | 170389539 | 37570839 |
| en_US.news | 196.2775 | 77259 | 77259 | 15639408 | 13072698 | 2651432 |
| en_US.twitter | 159.3641 | 2360148 | 2360148 | 162096241 | 134082806 | 30451170 |
Given the size of the files, only a 1% random sample of each one is used for the rest of the analysis, to keep the code efficient.
# Fix the RNG state so that the same lines are drawn on every run.
set.seed(660067)
# Fraction of each source that is kept.
sampleSize <- 0.01

# Draw the sample from blogs, news and twitter (in that order, so the random
# draws match the seed), strip every non-ASCII character from the drawn lines,
# and concatenate the three results into one character vector.
sampleData <- unlist(lapply(list(blogs, news, twitter), function(lines) {
  kept <- sample(lines, length(lines) * sampleSize, replace = FALSE)
  iconv(kept, "latin1", "ASCII", sub = "")
}))

# Persist the combined sample next to the source files.
sampleDataFileName <- "Coursera-SwiftKey/final/en_US/en_US.sample.txt"
con <- file(sampleDataFileName, open = "w")
writeLines(sampleData, con)
close(con)

# Size of the sample: number of lines and total number of words.
sampleDataLines <- length(sampleData)
sampleDataWords <- sum(stri_count_words(sampleData))

# The full data sets are no longer needed; drop them to free memory.
rm(blogs, news, twitter)
# Path of the sample file written to disk earlier in this script.
Sample <- "./Coursera-SwiftKey/final/en_US/en_US.sample.txt"
# Build the corpus from the sample file ONLY. The en_US directory also holds
# the full blogs/news/twitter files, so the source is restricted with an
# anchored file-name pattern; otherwise every file in the directory would be
# loaded and the sampling would have no effect.
corpus <- Corpus(
  DirSource(dirname(Sample), pattern = "^en_US\\.sample\\.txt$"),
  readerControl = list(reader = readPlain)
)
# Text-cleaning helpers. Each one takes a character vector `x` and returns a
# character vector of the same length.

# Remove web addresses. The first alternative drops a complete URL
# ("http://", "https://" or "www." followed by everything up to the next
# whitespace); the second keeps the earlier behaviour of dropping any token
# that merely starts with "http" (e.g. a URL whose punctuation was already
# stripped).
removeURL <- function(x) {
  gsub("(https?://|www\\.)[^[:space:]]*|http[[:alnum:]]*", "", x)
}

# Remove every punctuation character.
removeSign <- function(x) gsub("[[:punct:]]", "", x)

# Remove every digit.
removeNum <- function(x) gsub("[[:digit:]]", "", x)

# Remove ASCII apostrophes, so that "don't" becomes "dont".
removeapo <- function(x) gsub("'", "", x)

# Drop every character that cannot be represented in ASCII.
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")

# Collapse a letter repeated three or more times to two ("soooo" -> "soo").
removerepeat <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x)

# Convert to lower case. tolower() is already vectorised, so it is called
# directly: the result is always an unnamed character vector, also for
# zero-length input.
toLowerCase <- function(x) tolower(x)

# Collapse every run of whitespace into a single space.
removeSpace <- function(x) gsub("\\s+", " ", x)
# Transform the corpus.
# The order of the steps matters:
#  * URLs are removed first, while their punctuation is still intact.
#  * Text is lower-cased and stop words are removed BEFORE apostrophes are
#    stripped, because the English stop-word list contains contractions such
#    as "don't" that can no longer match once the apostrophe is gone.
#  * Whitespace is collapsed last, so gaps left by removed words are cleaned.
corpus <- tm_map(corpus, content_transformer(removeURL))      # remove web URLs
corpus <- tm_map(corpus, content_transformer(toLowerCase))    # convert to lower case
corpus <- tm_map(corpus, removeWords, stopwords("english"))   # remove common English words
corpus <- tm_map(corpus, content_transformer(removeapo))      # remove apostrophes
corpus <- tm_map(corpus, content_transformer(removeNum))      # remove digits
corpus <- tm_map(corpus, content_transformer(removeSign))     # remove all punctuation
corpus <- tm_map(corpus, content_transformer(removeNonASCII)) # remove non-ASCII characters
corpus <- tm_map(corpus, content_transformer(removerepeat))   # shorten letters repeated 3+ times
corpus <- tm_map(corpus, content_transformer(removeSpace))    # collapse runs of whitespace
The frequencies with which words appear in the provided data are shown in a frequency plot.
# Build the term-document matrix once (the dense conversion is expensive) and
# derive the total count of every term across all documents.
corpus_tdm <- TermDocumentMatrix(corpus)
corpus_tdm_m <- as.matrix(corpus_tdm)
corpus_tdm_m_freq <- sort(rowSums(corpus_tdm_m), decreasing = TRUE)

# Bar plot of the 25 most frequent terms. head() is used instead of [1:25] so
# that a vocabulary with fewer than 25 terms does not produce NA bars.
barplot(head(corpus_tdm_m_freq, 25), las = 2, main = "Word Frequency in the data")
A word cloud of the most frequently used terms in the data sets is also shown.
# Word cloud of the cleaned corpus, limited to 75 words and coloured with a
# 32-step ramp interpolated from the 9-colour "Blues" Brewer palette.
wordcloud(
  corpus,
  scale = c(3, .3),
  max.words = 75,
  random.order = TRUE,
  rot.per = .15,
  colors = colorRampPalette(brewer.pal(9, "Blues"))(32)
)