capstone project-1

DOWNLOADING & READING DATA

library(RColorBrewer)
library(wordcloud)
library(NLP)
library(tm)
library(stringi)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

# Source archive for the capstone corpora and the directory it unpacks into.
capstoneDatasetUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dataDir <- "final"

# Fetch and unpack the archive only when the data directory is missing, so
# re-running the report does not download the data again.
if (!dir.exists(dataDir)) {
  dataZipFile <- "Coursera-SwiftKey.zip"
  if (!file.exists(dataZipFile)) {
    # Pass the URL variable defined above. The bare name `url` resolves to
    # base R's url() function, which download.file() cannot use as an address.
    # mode = "wb" writes the zip as binary so it is not corrupted on Windows.
    download.file(capstoneDatasetUrl, dataZipFile, method = "auto", mode = "wb")
  }
  unzip(dataZipFile)
  # Delete the archive only once extraction has produced the data directory.
  if (dir.exists(dataDir)) {
    file.remove(dataZipFile)
  }
}
# Read each corpus through a binary-mode connection. The earlier text-mode read
# of the news file stopped at 77,259 lines with an "incomplete final line"
# warning -- the usual symptom of an embedded Ctrl-Z (0x1A) byte being taken as
# end-of-file on Windows. Binary mode reads past such bytes; readLines() still
# accepts LF, CRLF and CR line endings on a binary connection.
blogsCon <- file("./final/en_US/en_US.blogs.txt", open = "rb")
dataBlogs <- readLines(blogsCon, encoding = "UTF-8", skipNul = TRUE)
close(blogsCon)

newsCon <- file("./final/en_US/en_US.news.txt", open = "rb")
dataNews <- readLines(newsCon, encoding = "UTF-8", skipNul = TRUE)
close(newsCon)

twitterCon <- file("./final/en_US/en_US.twitter.txt", open = "rb")
dataTwitter <- readLines(twitterCon, encoding = "UTF-8", skipNul = TRUE)
close(twitterCon)

SUMMARY OF DATA FROM BLOGS, NEWS, AND TWITTER

# Line and character counts for each corpus, computed with stringi.
# The recorded console output follows each call.

# Blogs
stringi::stri_stats_general(dataBlogs)

##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   206824382   170389539

# News
stringi::stri_stats_general(dataNews)

##       Lines LinesNEmpty       Chars CharsNWhite 
##       77259       77259    15639408    13072698

# Twitter
stringi::stri_stats_general(dataTwitter)

##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162096241   134082806

DATA ANALYSIS USING PLOTS

# Words per blog entry, followed by a numeric summary of that distribution.
wordsblogs <- stri_count_words(dataBlogs)
summary(wordsblogs)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00

# Histogram of words per blog entry. qplot() is deprecated as of ggplot2 3.4.0,
# so the plot is built with ggplot() directly. bins = 30 is the value qplot()
# was silently falling back to; stating it keeps the same binning and removes
# the stat_bin() "Pick better value" message.
ggplot(data.frame(wordsblogs = wordsblogs), aes(x = wordsblogs)) +
  geom_histogram(bins = 30)

# Words per news entry, followed by a numeric summary of that distribution.
wordsnews <- stri_count_words(dataNews)
summary(wordsnews)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.62   46.00 1123.00

# Histogram of words per news entry. qplot() is deprecated as of ggplot2 3.4.0,
# so the plot is built with ggplot() directly. bins = 30 is the value qplot()
# was silently falling back to; stating it keeps the same binning and removes
# the stat_bin() "Pick better value" message.
ggplot(data.frame(wordsnews = wordsnews), aes(x = wordsnews)) +
  geom_histogram(bins = 30)

# Words per tweet.
wordstwitter <- stri_count_words(dataTwitter)

# Histogram of words per tweet. qplot() is deprecated as of ggplot2 3.4.0, so
# the plot is built with ggplot() directly. bins = 30 is the value qplot() was
# silently falling back to; stating it keeps the same binning and removes the
# stat_bin() "Pick better value" message.
ggplot(data.frame(wordstwitter = wordstwitter), aes(x = wordstwitter)) +
  geom_histogram(bins = 30)

# Numeric summary of the same distribution.
summary(wordstwitter)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00

EXPLORATORY DATA ANALYSIS - cleaning the data, i.e., removing punctuation.

# Build a cleaned tm corpus from a random sample of 200 tweets.

# Fixed seed so the sample (and anything derived from it) is reproducible.
set.seed(1234)
twitter <- sample(dataTwitter, 200)

# Drop characters that have no ASCII equivalent. sub = "byte" would instead
# insert "<xx>" hex escapes, which fuse with neighbouring words once digits and
# punctuation are stripped below (e.g. "don<e2><80><99>t" becomes "donet").
twitter <- iconv(twitter, from = "UTF-8", to = "ASCII", sub = "")

twitterVec <- VectorSource(twitter)
twitterCorp <- Corpus(twitterVec)

# Wrap base tolower() in content_transformer() so the step also behaves
# correctly when Corpus() yields a VCorpus, where a bare character function
# would replace the documents with plain strings.
twitterCorp <- tm_map(twitterCorp, content_transformer(tolower))
twitterCorp <- tm_map(twitterCorp, removeNumbers)

## Warning in tm_map.SimpleCorpus(twitterCorp, removeNumbers): transformation drops
## documents

twitterCorp <- tm_map(twitterCorp, removePunctuation)

## Warning in tm_map.SimpleCorpus(twitterCorp, removePunctuation): transformation
## drops documents

# The former `twitter2 <- tm_map(twitterCorp, PlainTextDocument)` step was
# removed: its result was never used and it only produced a warning.

WORDCLOUD-TWITTER

# Word cloud of the Twitter corpus, with words plotted in decreasing order of
# frequency (random.order = FALSE) and 35% of them rotated.
# The "Set1" palette defines at most 9 colours; asking for 10 only raised a
# warning and still returned those 9, so request 9 directly.
wordcloud(twitterCorp, max.words = 5000, random.order = FALSE, rot.per = 0.35,
          use.r.layout = FALSE, colors = brewer.pal(9, "Set1"))

#.

WORDCLOUD-NEWS

# Build a cleaned tm corpus from a random sample of 200 news entries and draw
# its word cloud.

# Fixed seed so the sample and the resulting cloud are reproducible.
set.seed(1234)
n <- sample(dataNews, 200)

# Drop characters that have no ASCII equivalent. sub = "byte" would instead
# insert "<xx>" hex escapes, which fuse with neighbouring words once digits and
# punctuation are stripped below (e.g. "don<e2><80><99>t" becomes "donet").
n <- iconv(n, from = "UTF-8", to = "ASCII", sub = "")

nVec <- VectorSource(n)
nCorp <- Corpus(nVec)

# Wrap base tolower() in content_transformer() so the step also behaves
# correctly when Corpus() yields a VCorpus, where a bare character function
# would replace the documents with plain strings.
nCorp <- tm_map(nCorp, content_transformer(tolower))
nCorp <- tm_map(nCorp, removeNumbers)

## Warning in tm_map.SimpleCorpus(nCorp, removeNumbers): transformation drops
## documents

nCorp <- tm_map(nCorp, removePunctuation)

## Warning in tm_map.SimpleCorpus(nCorp, removePunctuation): transformation drops
## documents

# The former `n2 <- tm_map(nCorp, PlainTextDocument)` step was removed: its
# result was never used and it only produced a warning.

# Words are plotted in decreasing order of frequency (random.order = FALSE).
# The "Set1" palette defines at most 9 colours; asking for 10 only raised a
# warning and still returned those 9, so request 9 directly.
wordcloud(nCorp, max.words = 5000, random.order = FALSE, rot.per = 0.35,
          use.r.layout = FALSE, colors = brewer.pal(9, "Set1"))

#.

WORDCLOUD-BLOGS

# Build a cleaned tm corpus from a random sample of 200 blog entries and draw
# its word cloud.

# Fixed seed so the sample and the resulting cloud are reproducible.
set.seed(1234)
b <- sample(dataBlogs, 200)

# Drop characters that have no ASCII equivalent. sub = "byte" would instead
# insert "<xx>" hex escapes, which fuse with neighbouring words once digits and
# punctuation are stripped below (e.g. "don<e2><80><99>t" becomes "donet").
b <- iconv(b, from = "UTF-8", to = "ASCII", sub = "")

bVec <- VectorSource(b)
bCorp <- Corpus(bVec)

# Wrap base tolower() in content_transformer() so the step also behaves
# correctly when Corpus() yields a VCorpus, where a bare character function
# would replace the documents with plain strings.
bCorp <- tm_map(bCorp, content_transformer(tolower))
bCorp <- tm_map(bCorp, removeNumbers)

## Warning in tm_map.SimpleCorpus(bCorp, removeNumbers): transformation drops
## documents

bCorp <- tm_map(bCorp, removePunctuation)

## Warning in tm_map.SimpleCorpus(bCorp, removePunctuation): transformation drops
## documents

# The former `b2 <- tm_map(bCorp, PlainTextDocument)` step was removed: its
# result was never used and it only produced a warning.

# Words are plotted in decreasing order of frequency (random.order = FALSE).
# The "Set1" palette defines at most 9 colours; asking for 10 only raised a
# warning and still returned those 9, so request 9 directly.
wordcloud(bCorp, max.words = 5000, random.order = FALSE, rot.per = 0.35,
          use.r.layout = FALSE, colors = brewer.pal(9, "Set1"))

#.

INFERENCE

- Histograms and word clouds were used to gain a basic understanding of the corpus data from the required sources.

SOLUTION PLAN

- N-gram and other statistical models will be incorporated to obtain a holistic understanding of the corpus data. - I plan to familiarize myself with NLP frameworks and machine learning models, such as the hidden Markov model, to develop a deeper understanding of NLP, which will help me incorporate the required frameworks for the development of the Shiny app.