Rowen Remis R. Iral
Friday, March 20, 2015
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set 2"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set 2/data set"
## [1] "Coursera-SwiftKey.zip" "final"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set 2/data set/final"
## [1] "de_DE" "en_US" "fi_FI" "ru_RU"
##        size isdir mode               mtime               ctime               atime exe
## de_DE     0  TRUE  777 2014-07-22 10:10:12 2015-03-19 21:57:05 2015-03-19 21:57:21  no
## en_US     0  TRUE  777 2014-07-22 10:10:12 2015-03-19 21:58:09 2015-03-19 21:58:52  no
## fi_FI     0  TRUE  777 2014-07-22 10:10:12 2015-03-19 21:59:13 2015-03-19 21:59:40  no
## ru_RU     0  TRUE  777 2014-07-22 10:10:12 2015-03-19 21:57:35 2015-03-19 21:57:59  no
## [1] "de_DE.blogs.txt" "de_DE.news.txt" "de_DE.twitter.txt"
## [4] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
## [7] "fi_FI.blogs.txt" "fi_FI.news.txt" "fi_FI.twitter.txt"
## [10] "ru_RU.blogs.txt" "ru_RU.news.txt" "ru_RU.twitter.txt"
##                              size
## de_DE/de_DE.blogs.txt    85459666
## de_DE/de_DE.news.txt     95591959
## de_DE/de_DE.twitter.txt  75578341
## en_US/en_US.blogs.txt   210160014
## en_US/en_US.news.txt    205811889
## en_US/en_US.twitter.txt 167105338
## fi_FI/fi_FI.blogs.txt   108503595
## fi_FI/fi_FI.news.txt     94234350
## fi_FI/fi_FI.twitter.txt  25331142
## ru_RU/ru_RU.blogs.txt   116855835
## ru_RU/ru_RU.news.txt    118996424
## ru_RU/ru_RU.twitter.txt 105182346
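These sizes are in bytes. A minimal sketch of how such a listing can be produced with file.info(); the loop itself is an assumption, only the printed sizes above come from the report:
# Assumed reconstruction: print the size, in bytes, of every corpus file.
for (lang in c("de_DE", "en_US", "fi_FI", "ru_RU")) {
  for (f in list.files(lang, full.names = TRUE)) {
    print(file.info(f)["size"])
  }
}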
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
## [1] "length of en_US Twitter: 2360148"
## [1] 213
## [1] "length of en_US Blog: 899288"
## [1] 40835
## Warning in readLines(con3): incomplete final line found on
## 'en_US/en_US.news.txt'
## [1] "length of en_US News: 77259"
## [1] 5760
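Each pair of numbers above is the line count of a file followed by what appears to be its longest line, in characters. A minimal sketch of computing both for one file; skipNul and the variable name are assumptions:
# Assumed sketch: line count and longest-line length for the Twitter file.
twitter_lines <- readLines("en_US/en_US.twitter.txt", skipNul = TRUE)
print(paste("length of en_US Twitter:", length(twitter_lines)))
print(max(nchar(twitter_lines)))  # length of the longest line, in characters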
The SwiftKey data set was downloaded as a zip file. Three English-language files are used:

1. en_US.twitter.txt
2. en_US.blogs.txt
3. en_US.news.txt
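For reproducibility, a minimal sketch of fetching and unpacking the archive; the URL is the one distributed with the capstone course and should be treated as an assumption here:
# Assumed sketch: download and unpack the SwiftKey corpus (the zip and the
# final/ directory match the listing shown earlier).
zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(zip_url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
unzip("Coursera-SwiftKey.zip")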
Text mining is done with the tm package for R.
library(tm)
## Warning: package 'tm' was built under R version 3.1.2
## Loading required package: NLP
set_dir <- function() {
  # Walk from the project root down into the final/ data directory,
  # printing each directory along the way.
  setwd("E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/")
  prj_dir <- getwd()
  #load("nlp_dscap.RData")
  print(prj_dir)
  data_dir <- paste(prj_dir, "/data set/", sep = "")
  setwd(data_dir)
  print(getwd())
  final_dir <- paste(data_dir, "final/", sep = "")  # data_dir already ends in "/"
  setwd(final_dir)
  print(getwd())
}
set_dir()
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set/final"
txtLocation <- paste(getwd(), "/en_US", sep = "")
# Build a corpus from the three en_US files, read as UTF-8 plain text.
docs <- Corpus(DirSource(txtLocation, encoding = "UTF-8"), readerControl = list(reader = readPlain, language = "en_US"))
gc()  # run garbage collection and report memory use after building the corpus
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3908257 104.4 5860551 156.5 4033177 107.7
## Vcells 75151808 573.4 84159387 642.1 79894116 609.6
# Print the metadata tm recorded for each of the three documents.
for (i in seq_along(docs)) {
  print(meta(docs[[i]]))
  print("----------------")
}
## Metadata:
## author : character(0)
## datetimestamp: 2015-03-21 00:09:01
## description : character(0)
## heading : character(0)
## id : en_US.blogs.txt
## language : en_US
## origin : character(0)
## [1] "----------------"
## Metadata:
## author : character(0)
## datetimestamp: 2015-03-21 00:09:25
## description : character(0)
## heading : character(0)
## id : en_US.news.txt
## language : en_US
## origin : character(0)
## [1] "----------------"
## Metadata:
## author : character(0)
## datetimestamp: 2015-03-21 00:09:25
## description : character(0)
## heading : character(0)
## id : en_US.twitter.txt
## language : en_US
## origin : character(0)
## [1] "----------------"
rm(i)
print(paste("Lines in en_US.blogs.txt:", length(docs[[1]]$content)))
## [1] "Lines in en_US.blogs.txt: 899288"
print(paste("Lines in en_US.news.txt:", length(docs[[2]]$content)))
## [1] "Lines in en_US.news.txt: 77259"
print(paste("Lines in en_US.twitter.txt:", length(docs[[3]]$content)))
## [1] "Lines in en_US.twitter.txt: 2360148"
RWeka, openNLP, and ggplot2 are loaded next (the library() calls are not echoed); their startup messages follow.
## Warning: package 'RWeka' was built under R version 3.1.2
## Warning: package 'openNLP' was built under R version 3.1.2
## Warning: package 'ggplot2' was built under R version 3.1.2
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set/final"
## [1] "File size of TWitter in MB"
## [1] 167.1053
## [1] "File size of Blogs in MB"
## [1] 210.16
## [1] "File size of News in MB"
## [1] 205.8119
set_dir()
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set"
## [1] "E:/Other Business/Data Scientist/Data Science Track/Data Science Capstone/data set/final"
# con/con2/con3 were opened in an unechoed chunk (Twitter, blogs, news, matching the warnings above).
con <- file("en_US/en_US.twitter.txt", "r"); con2 <- file("en_US/en_US.blogs.txt", "r"); con3 <- file("en_US/en_US.news.txt", "r")
twitter <- readLines(con, 5000)
blog <- readLines(con2, 5000)
news <- readLines(con3, 5000)
close(con); close(con2); close(con3)
sample_text <- paste(twitter, blog, news)  # element-wise: each sample string joins one line from each source
corpus <- VCorpus(VectorSource(sample_text))
# Basic cleaning: drop numbers, collapse whitespace, lowercase, strip punctuation.
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
# Pull the cleaned text back out of the corpus into a plain data frame.
cleantext <- data.frame(text = unlist(sapply(corpus, `[`, "content")), stringsAsFactors = FALSE)
head(cleantext, 3)
## text
## 1 how are you btw thanks for the rt you gonna be in dc anytime soon love to see you been way way too long in the years thereafter most of the oil fields and platforms were named after pagan gods he wasnt home alone apparently
## 2 when you meet someone special youll know your heart will beat more rapidly and youll smile for no reason we love you mr brown the st louis plant had to close it would die of old age workers had been making cars there since the onset of mass automotive production in the s
## 3 theyve decided its more fun if i dont chad has been awesome with the kids and holding down the fort while i work later than usual the kids have been busy together playing skylander on the xbox together after kyan cashed in his from his piggy bank he wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it he never taps into that thing either that is how we know he wanted it so bad we made him count all of his money to make sure that he had enough it was very cute to watch his reaction when he realized he did he also does a very good job of letting lola feel like she is playing too by letting her switch out the characters she loves it almost as much as him wsus plans quickly became a hot topic on local online sites though most people applauded plans for the new biomedical center many deplored the potential loss of the building
# Tokenize with RWeka; pass the character column, not the whole data frame.
one_token <- NGramTokenizer(cleantext$text, Weka_control(min = 1, max = 1))
two_token <- NGramTokenizer(cleantext$text, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
three_token <- NGramTokenizer(cleantext$text, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
# Tabulate frequencies, then sort each n-gram table in decreasing order.
one <- data.frame(table(one_token))
two <- data.frame(table(two_token))
tri <- data.frame(table(three_token))
one_s <- one[order(one$Freq, decreasing = TRUE), ]
two_s <- two[order(two$Freq, decreasing = TRUE), ]
tri_s <- tri[order(tri$Freq, decreasing = TRUE), ]
one30 <- one_s[1:30,]
colnames(one30) <- c("Word","Frequency")
head(one30)
## Word Frequency
## 33220 the 21786
## 33718 to 11877
## 1854 and 11254
## 488 a 10542
## 23257 of 9278
## 16673 in 7241
two30 <- two_s[1:30,]
colnames(two30) <- c("Word","Frequency")
head(two30)
## Word Frequency
## 142859 of the 2006
## 102449 in the 1896
## 214069 to the 976
## 145441 on the 848
## 77445 for the 836
## 211873 to be 748
tri30 <- tri_s[1:30,]
colnames(tri30) <- c("Word","Frequency")
head(tri30)
## Word Frequency
## 232445 one of the 153
## 4327 a lot of 149
## 127004 going to be 77
## 335335 to be a 77
## 172598 it was a 67
## 152910 i want to 65
# Bar chart of the 30 most frequent unigrams, counts printed above the bars.
ggplot(one30, aes(x = Word, y = Frequency)) + geom_bar(stat = "identity", fill = "blue") + geom_text(aes(label = Frequency), vjust = -0.2)
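The same call can be reused for the bigram and trigram tables; in the sketch below the rotated axis labels are an addition so the multi-word n-grams stay readable:
# Reuse the unigram plot for bigrams; swap in tri30 for trigrams.
ggplot(two30, aes(x = Word, y = Frequency)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = Frequency), vjust = -0.2) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # rotated labels (added)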
Work on the n-gram prediction algorithm is currently under way.
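As an illustration of the direction, not the report's final algorithm: given the last one or two words typed, look them up in the sorted trigram and bigram tables and suggest the most frequent continuation, backing off to shorter n-grams when nothing matches. A minimal sketch over the tables built above (predict_next is a hypothetical helper):
predict_next <- function(phrase) {
  # Hypothetical sketch: frequency lookup with simple backoff over the
  # one_s/two_s/tri_s tables built above. Not the final algorithm.
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  if (length(words) == 2) {
    hits <- tri_s[grepl(paste0("^", words[1], " ", words[2], " "), tri_s$three_token), ]
    if (nrow(hits) > 0) {
      return(sub("^.* ", "", as.character(hits$three_token[1])))  # last word of top trigram
    }
  }
  hits <- two_s[grepl(paste0("^", tail(words, 1), " "), two_s$two_token), ]
  if (nrow(hits) > 0) {
    return(sub("^.* ", "", as.character(hits$two_token[1])))
  }
  as.character(one_s$one_token[1])  # fall back to the most frequent unigram
}
predict_next("one of")  # the trigram counts above suggest "the"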