rm(list = ls(all=TRUE))
setwd("C:\\Users\\C5215696\\Desktop\\Data Science\\Text-Mining-BoW")
load required library for handling text
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
library(magrittr)
## Warning: package 'magrittr' was built under R version 3.3.3
library(NLP)
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
Create a corpus - a collection of text documents
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"
sci.med = Corpus(DirSource("sci.med"),
readerControl = list(language='en_US'))
sci.space = Corpus(DirSource("sci.space"),
readerControl = list(language='en_US'))
examine the file
sci.med[[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 1846
as.character(sci.med[[1]])
## [1] "Newsgroups: sci.med\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!darwin.sura.net!sgiblab!sdd.hp.com!decwrl!decwrl!uunet!utcsri!utnut!utzoo!telly!problem!intacc!bed\nFrom: bed@intacc.uucp (Deb Waddington)\nSubject: INFO NEEDED: Gaucher's Disease\nMessage-ID: <1993Mar18.002149.1111@intacc.uucp>\nDate: Thu, 18 Mar 1993 00:21:49 GMT\nDistribution: Everywhere\nExpires: 01 Jun 93\nReply-To: bed@intacc.UUCP (Deb Waddington)\nOrganization: Matrix Artists' Network\nLines: 33\n\n\nI have a 42 yr old male friend, misdiagnosed as having\n osteopporosis for two years, who recently found out that his\n illness is the rare Gaucher's disease. \n\nGaucher's disease symptoms include: brittle bones (he lost 9 \n inches off his hieght); enlarged liver and spleen; internal\n bleeding; and fatigue (all the time). The problem (in Type 1) is\n attributed to a genetic mutation where there is a lack of the\n enzyme glucocerebroside in macrophages so the cells swell up.\n This will eventually cause death.\n\nEnyzme replacement therapy has been successfully developed and\n approved by the FDA in the last few years so that those patients\n administered with this drug (called Ceredase) report a remarkable\n improvement in their condition. Ceredase, which is manufactured\n by biotech biggy company--Genzyme--costs the patient $380,000\n per year. Gaucher's disease has justifyably been called \"the most\n expensive disease in the world\".\n\nNEED INFO:\nI have researched Gaucher's disease at the library but am relying\n on netlanders to provide me with any additional information:\n**news, stories, reports\n**people you know with this disease\n**ideas, articles about Genzyme Corp, how to get a hold of\n enough money to buy some, programs available to help with\n costs.\n**Basically ANY HELP YOU CAN OFFER\n\nThanks so very much!\n\nDeborah "
length(sci.med)
## [1] 1000
inspect(sci.med[1])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## 57110
## Newsgroups: sci.med\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!darwin.sura.net!sgiblab!sdd.hp.com!decwrl!decwrl!uunet!utcsri!utnut!utzoo!telly!problem!intacc!bed\nFrom: bed@intacc.uucp (Deb Waddington)\nSubject: INFO NEEDED: Gaucher's Disease\nMessage-ID: <1993Mar18.002149.1111@intacc.uucp>\nDate: Thu, 18 Mar 1993 00:21:49 GMT\nDistribution: Everywhere\nExpires: 01 Jun 93\nReply-To: bed@intacc.UUCP (Deb Waddington)\nOrganization: Matrix Artists' Network\nLines: 33\n\n\nI have a 42 yr old male friend, misdiagnosed as having\n osteopporosis for two years, who recently found out that his\n illness is the rare Gaucher's disease. \n\nGaucher's disease symptoms include: brittle bones (he lost 9 \n inches off his hieght); enlarged liver and spleen; internal\n bleeding; and fatigue (all the time). The problem (in Type 1) is\n attributed to a genetic mutation where there is a lack of the\n enzyme glucocerebroside in macrophages so the cells swell up.\n This will eventually cause death.\n\nEnyzme replacement therapy has been successfully developed and\n approved by the FDA in the last few years so that those patients\n administered with this drug (called Ceredase) report a remarkable\n improvement in their condition. Ceredase, which is manufactured\n by biotech biggy company--Genzyme--costs the patient $380,000\n per year. Gaucher's disease has justifyably been called "the most\n expensive disease in the world".\n\nNEED INFO:\nI have researched Gaucher's disease at the library but am relying\n on netlanders to provide me with any additional information:\n**news, stories, reports\n**people you know with this disease\n**ideas, articles about Genzyme Corp, how to get a hold of\n enough money to buy some, programs available to help with\n costs.\n**Basically ANY HELP YOU CAN OFFER\n\nThanks so very much!\n\nDeborah
meta(sci.med[[1]])
## author : character(0)
## datetimestamp: 2017-08-11 17:37:21
## description : character(0)
## heading : character(0)
## id : 57110
## language : en_US
## origin : character(0)
Create a common corpus
corpus_total = c(sci.med,sci.space)
corpus_total = Corpus(VectorSource(unlist(corpus_total)))
rm(sci.med,sci.space)
Necessary preprocessing
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(corpus_total, toSpace, "/")
docs <- tm_map(corpus_total, toSpace, "@")
docs <- tm_map(corpus_total, toSpace, "\\|")
corpus_total = tm_map(corpus_total, removePunctuation)
corpus_total = tm_map(corpus_total, removeNumbers)
corpus_total = tm_map(corpus_total, tolower)
corpus_total = tm_map(corpus_total, removeWords, stopwords("english"))
corpus_total = tm_map(corpus_total, stemDocument, language="english")
corpus_total = tm_map(corpus_total, stripWhitespace)
as.character(corpus_total[[1]])
## [1] "newsgroup scime path cantaloupesrvcscmueducrabapplesrvcscmuedufsececmuedueuropaenggtefsdcomdarwinsuranetsgiblabsddhpcomdecwrldecwrluunetutcsriutnututzootellyproblemintaccb bedintaccuucp deb waddington subject info need gaucher diseas messageid marintaccuucp date thu mar gmt distribut everywher expir jun replyto bedintaccuucp deb waddington organ matrix artist network line yr old male friend misdiagnos osteopporosi two year recent found ill rare gaucher diseas gaucher diseas symptom includ brittl bone lost inch hieght enlarg liver spleen intern bleed fatigu time problem type attribut genet mutat lack enzym glucocerebrosid macrophag cell swell will eventu caus death enyzm replac therapi success develop approv fda last year patient administ drug call ceredas report remark improv condit ceredas manufactur biotech biggi companygenzymecost patient per year gaucher diseas justify call expens diseas world need info research gaucher diseas librari reli netland provid addit inform news stori report peopl know diseas idea articl genzym corp get hold enough money buy program avail help cost basic help can offer thank much deborah"
Constructs or coerces to a document-term matrix
dt_matrix <-DocumentTermMatrix(corpus_total,
control=list(weighting=weightTfIdf,
minWordLength=2,
minDocFreq=5))
dt_matrix
## <<DocumentTermMatrix (documents: 2002, terms: 26949)>>
## Non-/sparse entries: 245109/53706789
## Sparsity : 100%
## Maximal term length: 248
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
inspect(dt_matrix[1:4,1:5])
## <<DocumentTermMatrix (documents: 4, terms: 5)>>
## Non-/sparse entries: 7/13
## Sparsity : 65%
## Maximal term length: 8
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs addit administ approv articl artist
## content.57110 0.028778 0.04937842 0.02942611 0.006554607 0.05272619
## content.58043 0.000000 0.00000000 0.00000000 0.012672240 0.00000000
## content.58044 0.000000 0.00000000 0.00000000 0.000000000 0.00000000
## content.58045 0.000000 0.00000000 0.00000000 0.004670359 0.00000000
Remove sparse terms from a document-term matrix
dt_matrix <- removeSparseTerms(dt_matrix, 0.75)
Applying SVD on Document-Term Matrix
svd = svd(as.matrix(dt_matrix))
matrix = svd$u
View(matrix)
Convert to dataframe
data = as.data.frame(matrix)
data <- data[apply(data, 1, function(x) !all(x==0)),]
Attach Class label
target = as.factor(c(rep('med',1000), rep('space',1000)))
data <- cbind(data,target)
# split the data into train and test
set.seed(586)
train_rows = createDataPartition(data$target, p = 0.8, list = F)
train_target = data[train_rows, names(data) %in% c("target")]
train = data[train_rows,]
test = data[-train_rows,]
Classification Task
Using Decision Tree
library(rpart)
rpart_model = rpart(train_target~.,train)
table(test$target,predict(rpart_model,test,type = "class"))
##
## med space
## med 200 0
## space 0 200
Using Naive Bayes
library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
nb_model = naiveBayes(x = train,y = train$target)
table(test$target,predict(nb_model,test[,!names(test) %in% c("target")],type = "class"))
##
## med space
## med 155 45
## space 8 192
Using knn classifier
Verifying the ratio of med and space
prop.table(table(data$target))
##
## med space
## 0.5 0.5
prop.table(table(train$target))
##
## med space
## 0.5 0.5
prop.table(table(test$target))
##
## med space
## 0.5 0.5
# remove the target variable
train_withoutclass = subset(train,select=-c(target))
test_withoutclass = subset(test,select=-c(target))
# N = 1/3/5/7
library(class)
Neigh <-3
pred=knn(train_withoutclass, test_withoutclass, train$target, k = Neigh)
a=table(pred,test$target)
a
##
## pred med space
## med 174 31
## space 26 169
accu= sum(diag(a))/sum(a)
accu
## [1] 0.8575