# Switch to the capstone project directory; the corpus files are read later
# through paths relative to it ("./final/en_US/...").
# NOTE(review): this path is specific to the author's machine -- adjust it (or
# use a project-relative path) before running elsewhere.
setwd("~/Dropbox/Coursera/DataScienceR/Capstone")
# Download of the SwiftKey corpus archive, left commented out.
# fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip?accessType=DOWNLOAD"
# download.file(fileUrl, destfile = "/Users/administrador/Specialization/capstone/SwiftKey.zip", method = "curl")
# List the contents of the working directory (recorded output follows).
list.files()
## [1] "_df0955b4e78bda9942ab7efd4a49cdc4_logistic-regression-overfitting-annotated.pdf"
## [2] "Capstone.Rmd"
## [3] "Data Science Capstone - Milestone Report_files"
## [4] "Data Science Capstone - Milestone Report.html"
## [5] "Data Science Capstone Project__files"
## [6] "Data Science Capstone Project_.html"
## [7] "final"
## [8] "MilestoneReport_cache"
## [9] "MilestoneReport_files"
## [10] "MilestoneReport.html"
## [11] "MilestoneReport.Rmd"
# unzip("SwiftKey.zip", list =TRUE)
Exploratory Data Analysis # Loading and Cleaning Data * Because the full dataset is very large, each file is reduced to a random sample of 5% (0.05) of its original size.
# Fix the RNG seed so that the random sampling performed later in the script
# is reproducible.
set.seed(1)

# Read the three raw English corpora as UTF-8 text, one vector element per line.
#
# skipNul = TRUE: the Twitter file contains embedded nul characters (readLines
# previously warned about lines 167155, 268547, 1274086 and 1759032). Without
# this flag an embedded nul terminates the line being read, silently dropping
# the rest of that tweet; with it the nuls are skipped and the full line is
# kept. The flag is applied to all three files for consistency.
data_twitter.raw <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
data_news.raw <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
data_blogs.raw <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
# Report how many lines were read from each raw corpus. Each call is evaluated
# at top level, so the formatted string is auto-printed.
sprintf("Number of lines from twitter: %d", length(data_twitter.raw))
## [1] "Number of lines from twitter: 2360148"
sprintf("Number of lines from news: %d", length(data_news.raw))
## [1] "Number of lines from news: 1010242"
sprintf("Number of lines from blog: %d", length(data_blogs.raw))
## [1] "Number of lines from blog: 899288"
# Draw a random subset of `lines` without replacement.
#
# lines:    vector to sample from (here, a character vector of text lines).
# fraction: share of the elements to keep; the subset contains
#           floor(length(lines) * fraction) elements. Defaults to 0.05 (5%).
#
# seq_along() is used instead of 1:length(lines) so the index vector is
# well-formed even for an empty input. The draws consume the RNG stream in
# the same way as sampling from 1:length(lines).
sample_lines <- function(lines, fraction = 0.05) {
  lines[sample(seq_along(lines), floor(length(lines) * fraction))]
}

# Keep a 5% random sample of each corpus (drawn in the order twitter, news,
# blogs) and pool the three samples into a single training vector.
data_twitter <- sample_lines(data_twitter.raw)
data_news <- sample_lines(data_news.raw)
data_blogs <- sample_lines(data_blogs.raw)
data.training <- c(data_twitter, data_news, data_blogs)
suppressWarnings(suppressMessages(library(stringi)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(magrittr)))
suppressWarnings(suppressMessages(library(markdown)))
suppressWarnings(suppressMessages(library(wordcloud)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(NLP)))
suppressWarnings(suppressMessages(library(quanteda)))
suppressWarnings(suppressMessages(library(lattice)))
To explore the data, I extract the words from the sample created above. * Use the quanteda library to remove all punctuation.
# Lower-case the pooled sample, then tokenize it with punctuation and numbers
# removed.
# NOTE(review): with simplify = TRUE the result appears to be one flat vector
# of tokens; the recorded dfm output reports 5,007,148 "documents", which
# suggests every token is being treated as its own document -- confirm whether
# that is intended.
tokens <- data.training %>%
  toLower() %>%
  tokenize(removePunct = TRUE, removeNumbers = TRUE, simplify = TRUE)

# Document-feature matrix over the tokens, with features stemmed.
dfm <- dfm(tokens, stem = TRUE)
##
## ... lowercasing
## ... tokenizing
## ... indexing documents: 5,007,148 documents
## ... indexing features: 145,576 feature types
## ... stemming features (English), trimmed 33691 feature variants
## ... created a 5007148 x 111885 sparse dfm
## ... complete.
## Elapsed time: 133.16 seconds.
# Second document-feature matrix over the same tokens: stemmed as before, but
# with the "SMART" stopword list excluded from the features.
dfm2 <- dfm(tokens, stem = TRUE, ignoredFeatures = stopwords("SMART"))
##
## ... lowercasing
## ... tokenizing
## ... indexing documents: 5,007,148 documents
## ... indexing features: 145,576 feature types
## ... removed 562 features, from 570 supplied (glob) feature types
## ... stemming features (English), trimmed 33322 feature variants
## ... created a 5007148 x 111692 sparse dfm
## ... complete.
## Elapsed time: 124.568 seconds.
# Print the dimension summary of both document-feature matrices
# (with and without stopwords); the recorded output follows each call.
print(dfm)
## Document-feature matrix of: 5,007,148 documents, 111,885 features.
print(dfm2)
## Document-feature matrix of: 5,007,148 documents, 111,692 features.
# Bar charts of the 20 top features in each matrix.
barchart(topfeatures(dfm, 20))
barchart(topfeatures(dfm2, 20))
# Word clouds limited to 100 words, coloured with a 6-colour "Dark2" palette.
# NOTE(review): the recorded output after these calls contains many
# "could not be fit on page" warnings, i.e. some words were not drawn;
# presumably scale = c(8, .5) is too large for the plotting device -- consider
# a smaller maximum scale or a larger device.
plot(dfm, main = "Wordcloud for the top 100 features", max.words=100, colors = brewer.pal(6, "Dark2"), scale=c(8, .5))
plot(dfm2, main = "Wordcloud for the top 100 features (removing stopwords)", max.words=100, colors = brewer.pal(6, "Dark2"), scale=c(8, .5))
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): good could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): year could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): make could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): play could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): love could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): thing could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): time could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): live could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): life could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): god could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): thought
## could not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): peopl could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): support
## could not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): watch could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): offic could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): put could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): plan could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): public could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): back could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): feel could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): made could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): famili could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): morn could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): run could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): long could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): day could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): give could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): report could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): check could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): tonight
## could not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): high could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): season could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): big could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): happi could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): read could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): head could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): end could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): told could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): hous could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): today could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): hour could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): talk could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(features(x), colSums(x), ...): world could
## not be fit on page. It will not be plotted.
Because I had an exam this week, this report is less thorough than I would like. It does, however, present a simple analysis of the sampled dataset. I will update this report later.
The results and plots show that many of the words in the dataset are “unimportant words.” The most frequently used words are, surprisingly, the least useful and carry no information content. As a next step, I filtered out these less informative words and then ran the same analysis again.
From the horizontal bar charts, I noticed that the word frequencies follow an interesting distribution. After ignoring stopwords, the frequencies of the top words decreased dramatically, and the word cloud shows the words we are actually interested in.