This report is a first look at the data science capstone project for the Johns Hopkins specialization. The goal of the project is to create a data product capable of predicting the words that follow the text typed by the user, similar to how predictive typing works on smartphones.
To do this, three data sets are evaluated: first a collection of blog posts, second a collection of news articles and third a collection of Twitter messages. The goal of this first report is to give a basic summary of the data sets and an exploratory data analysis.
First, let's take a look at the three English data sets and review, for each one, the size in MB, the number of lines, the length of the longest line in characters and the number of words.
# Summarise a text file: size on disk, line count, longest line and word count.
#
# Args:
#   filepath: path of the text file to inspect.
#
# Returns:
#   A list with four elements:
#     size    - file size in MB (bytes / 1048576)
#     nlines  - number of lines read from the file
#     maxline - number of characters in the longest line (0 for an empty file)
#     nwords  - total number of words, as counted by stri_count_words()
fileInformation <- function(filepath) {
  size <- file.info(filepath)$size / 1048576

  # Open in binary mode. NOTE(review): in text mode on Windows an embedded
  # Ctrl-Z (0x1A) byte is presumably treated as end-of-file, which would stop
  # the read early and under-count lines and words; readLines() accepts
  # LF / CRLF / CR line endings whatever the open mode.
  conn <- file(filepath, "rb")
  # Release the connection even when reading or counting fails.
  on.exit(close(conn), add = TRUE)

  # skipNul = TRUE drops embedded NUL bytes instead of letting them terminate
  # the line being read (which loses the rest of that line and warns).
  fulltext <- readLines(conn, skipNul = TRUE)
  nlines <- length(fulltext)

  # Vectorised longest-line search; the leading 0L keeps the result defined
  # (0) when the file has no lines at all.
  maxline <- max(0L, nchar(fulltext))

  nwords <- sum(stri_count_words(fulltext))

  list(size = size, nlines = nlines, maxline = maxline, nwords = nwords)
}
# Directory holding the en_US text files (the trailing slash is relied on by
# the paste0() calls below).
data_dir <- 'C:/Dev/Estudio/Coursera/Data Science JH/Data Science Capstone/data/en_US/'

# Gather the summary list (size, nlines, maxline, nwords) for each file.
blog_info <- fileInformation(paste0(data_dir, "en_US.blogs.txt"))
news_info <- fileInformation(paste0(data_dir, "en_US.news.txt"))
## Warning in readLines(conn): incomplete final line found on 'C:/Dev/Estudio/
## Coursera/Data Science JH/Data Science Capstone/data/en_US/en_US.news.txt'
twit_info <- fileInformation(paste0(data_dir, "en_US.twitter.txt"))
## Warning in readLines(conn): line 167155 appears to contain an embedded nul
## Warning in readLines(conn): line 268547 appears to contain an embedded nul
## Warning in readLines(conn): line 1274086 appears to contain an embedded nul
## Warning in readLines(conn): line 1759032 appears to contain an embedded nul

# Print a 3 x 4 table: one row per file, one column per statistic. The cells
# are the first four elements of each summary list, laid out row by row.
print(
  matrix(
    c(blog_info[1:4], news_info[1:4], twit_info[1:4]),
    nrow = 3,
    ncol = 4,
    byrow = TRUE,
    dimnames = list(
      c("Blogs Info:", "News Info:", "Twit Info:"),
      c("Size (MB)", "No of Lines", "Longest Line (No characters)", "No of words")
    )
  )
)
## Size (MB) No of Lines Longest Line (No characters) No of words
## Blogs Info: 200.4242 899288 40835 38154238
## News Info: 196.2775 77259 5760 2693898
## Twit Info: 159.3641 2360148 213 30218125
The next step is to create a corpus, built from a sample of each of the three text files, in order to analyze the data.
After creating the corpus, we convert the text to lowercase, remove punctuation, numbers and stopwords, and strip the extra whitespace.
# Text normalisation, one transformation per step. Every intermediate corpus
# is kept under its own name; the last one feeds the n-gram frequency tables.
# NOTE(review): punctuation is stripped before stopwords are removed, so
# stopwords containing an apostrophe presumably no longer match - confirm
# whether that ordering is intended.

# 1. Lower-case all text.
corpus_lowercase <- tm_map(x = corpus, FUN = content_transformer(tolower))

# 2. Drop punctuation.
corpus_low_punct <- tm_map(x = corpus_lowercase, FUN = removePunctuation)

# 3. Drop digits.
corpus_low_punct_no <- tm_map(x = corpus_low_punct, FUN = removeNumbers)

# 4. Drop English stopwords.
corpus_low_punct_no_stop <- tm_map(
  x = corpus_low_punct_no,
  FUN = removeWords,
  stopwords("english")
)

# 5. Collapse the runs of whitespace left behind by the removals above.
corpus_low_punct_no_stop_white <- tm_map(
  x = corpus_low_punct_no_stop,
  FUN = stripWhitespace
)
With the corpus ready, I chose not to remove bad words, because I want to check, out of personal curiosity, whether they appear with a significant frequency.

## Analysis of the corpus

### A) 1-gram

First, we create a 1-gram frequency data set.
# Unigram frequency table: one row per term with columns `word` and `freq`,
# sorted from most to least frequent (row names are the terms).
frq_data_1 <- local({
  # Dense term-by-document count matrix. It is summed directly; wrapping it
  # in a data frame first only adds another full copy of an already large
  # object and does not change the row sums.
  term_matrix <- as.matrix(TermDocumentMatrix(corpus_low_punct_no_stop_white))

  # Total count of each term across all documents, largest first.
  term_counts <- sort(rowSums(term_matrix), decreasing = TRUE)

  data.frame(word = names(term_counts), freq = term_counts)
})
Secondly, we create a 2-gram frequency data set.
# Tokenizer that emits only 2-grams (min = max = 2).
tok <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# Bigram frequency table: one row per 2-gram with columns `word` and `freq`,
# sorted from most to least frequent (row names are the 2-grams).
frq_data_2 <- local({
  # Dense term-by-document count matrix built with the bigram tokenizer. It is
  # summed directly; an intermediate data frame would only add another full
  # copy of an already large object without changing the row sums.
  term_matrix <- as.matrix(
    TermDocumentMatrix(
      corpus_low_punct_no_stop_white,
      control = list(tokenize = tok)
    )
  )

  # Total count of each 2-gram across all documents, largest first.
  term_counts <- sort(rowSums(term_matrix), decreasing = TRUE)

  data.frame(word = names(term_counts), freq = term_counts)
})
Lastly, we create a 3-gram frequency data set.
# Tokenizer that emits only 3-grams (min = max = 3).
tok3 <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Trigram frequency table: one row per 3-gram with columns `word` and `freq`,
# sorted from most to least frequent (row names are the 3-grams).
frq_data_3 <- local({
  # Dense term-by-document count matrix built with the trigram tokenizer. It
  # is summed directly; an intermediate data frame would only add another full
  # copy of an already large object without changing the row sums.
  term_matrix <- as.matrix(
    TermDocumentMatrix(
      corpus_low_punct_no_stop_white,
      control = list(tokenize = tok3)
    )
  )

  # Total count of each 3-gram across all documents, largest first.
  term_counts <- sort(rowSums(term_matrix), decreasing = TRUE)

  data.frame(word = names(term_counts), freq = term_counts)
})
To visually represent the n-gram data sets, we create three bar charts showing the most frequent terms of each one.
# Bar chart of the first 20 rows of the unigram table; bars are ordered along
# the x axis by increasing frequency.
ggplot(
  data = frq_data_1[1:20, ],
  mapping = aes(x = reorder(word, freq), y = freq)
) +
  geom_col(width = 0.5) +
  labs(title = "Unigrams", x = "Unigrams", y = "Frequency")

# Same chart for the first 10 rows of the bigram table.
ggplot(
  data = frq_data_2[1:10, ],
  mapping = aes(x = reorder(word, freq), y = freq)
) +
  geom_col(width = 0.5) +
  labs(title = "Bigrams", x = "Bigrams", y = "Frequency")

# Same chart for the first 5 rows of the trigram table.
ggplot(
  data = frq_data_3[1:5, ],
  mapping = aes(x = reorder(word, freq), y = freq)
) +
  geom_col(width = 0.5) +
  labs(title = "3-grams", x = "3-grams", y = "Frequency")
Now that the data sets are cleaned and ready, we can proceed to create a prediction model. As we can see, the n-gram data sets will be a good advantage when training the model and creating the Shiny app. This was a good test, and I learned a lot about different technologies while developing it.