rm(list = ls(all=TRUE))
setwd("C:\\Users\\C5215696\\Desktop\\Data Science\\Text-Mining-BoW")

load required library for handling text

library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
library(magrittr)         
## Warning: package 'magrittr' was built under R version 3.3.3
library(NLP)
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Create a corpus - a collection of text documents

getSources()
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"
getReaders()
##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"
sci.med = Corpus(DirSource("sci.med"),
                    readerControl = list(language='en_US'))

sci.space = Corpus(DirSource("sci.space"),
                    readerControl = list(language='en_US'))

examine the file

sci.med[[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 1846
as.character(sci.med[[1]])
## [1] "Newsgroups: sci.med\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!darwin.sura.net!sgiblab!sdd.hp.com!decwrl!decwrl!uunet!utcsri!utnut!utzoo!telly!problem!intacc!bed\nFrom: bed@intacc.uucp (Deb Waddington)\nSubject: INFO NEEDED: Gaucher's Disease\nMessage-ID: <1993Mar18.002149.1111@intacc.uucp>\nDate: Thu, 18 Mar 1993 00:21:49 GMT\nDistribution: Everywhere\nExpires: 01 Jun 93\nReply-To: bed@intacc.UUCP (Deb Waddington)\nOrganization: Matrix Artists' Network\nLines: 33\n\n\nI have a 42 yr old male friend, misdiagnosed as having\n osteopporosis for two years, who recently found out that his\n illness is the rare Gaucher's disease. \n\nGaucher's disease symptoms include: brittle bones (he lost 9 \n inches off his hieght); enlarged liver and spleen; internal\n bleeding; and fatigue (all the time). The problem (in Type 1) is\n attributed to a genetic mutation where there is a lack of the\n enzyme glucocerebroside in macrophages so the cells swell up.\n This will eventually cause death.\n\nEnyzme replacement therapy has been successfully developed and\n approved by the FDA in the last few years so that those patients\n administered with this drug (called Ceredase) report a remarkable\n improvement in their condition. Ceredase, which is manufactured\n by biotech biggy company--Genzyme--costs the patient $380,000\n per year. Gaucher's disease has justifyably been called \"the most\n expensive disease in the world\".\n\nNEED INFO:\nI have researched Gaucher's disease at the library but am relying\n on netlanders to provide me with any additional information:\n**news, stories, reports\n**people you know with this disease\n**ideas, articles about Genzyme Corp, how to get a hold of\n   enough money to buy some, programs available to help with\n   costs.\n**Basically ANY HELP YOU CAN OFFER\n\nThanks so very much!\n\nDeborah "
length(sci.med)
## [1] 1000
inspect(sci.med[1])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              57110 
## Newsgroups: sci.med\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!darwin.sura.net!sgiblab!sdd.hp.com!decwrl!decwrl!uunet!utcsri!utnut!utzoo!telly!problem!intacc!bed\nFrom: bed@intacc.uucp (Deb Waddington)\nSubject: INFO NEEDED: Gaucher's Disease\nMessage-ID: <1993Mar18.002149.1111@intacc.uucp>\nDate: Thu, 18 Mar 1993 00:21:49 GMT\nDistribution: Everywhere\nExpires: 01 Jun 93\nReply-To: bed@intacc.UUCP (Deb Waddington)\nOrganization: Matrix Artists' Network\nLines: 33\n\n\nI have a 42 yr old male friend, misdiagnosed as having\n osteopporosis for two years, who recently found out that his\n illness is the rare Gaucher's disease. \n\nGaucher's disease symptoms include: brittle bones (he lost 9 \n inches off his hieght); enlarged liver and spleen; internal\n bleeding; and fatigue (all the time). The problem (in Type 1) is\n attributed to a genetic mutation where there is a lack of the\n enzyme glucocerebroside in macrophages so the cells swell up.\n This will eventually cause death.\n\nEnyzme replacement therapy has been successfully developed and\n approved by the FDA in the last few years so that those patients\n administered with this drug (called Ceredase) report a remarkable\n improvement in their condition. Ceredase, which is manufactured\n by biotech biggy company--Genzyme--costs the patient $380,000\n per year. Gaucher's disease has justifyably been called "the most\n expensive disease in the world".\n\nNEED INFO:\nI have researched Gaucher's disease at the library but am relying\n on netlanders to provide me with any additional information:\n**news, stories, reports\n**people you know with this disease\n**ideas, articles about Genzyme Corp, how to get a hold of\n   enough money to buy some, programs available to help with\n   costs.\n**Basically ANY HELP YOU CAN OFFER\n\nThanks so very much!\n\nDeborah
meta(sci.med[[1]])
##   author       : character(0)
##   datetimestamp: 2017-08-11 17:37:21
##   description  : character(0)
##   heading      : character(0)
##   id           : 57110
##   language     : en_US
##   origin       : character(0)

Create a common corpus

corpus_total = c(sci.med,sci.space)
corpus_total = Corpus(VectorSource(unlist(corpus_total)))
rm(sci.med,sci.space)

Necessary preprocessing

getTransformations()
## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(corpus_total, toSpace, "/")
docs <- tm_map(corpus_total, toSpace, "@")
docs <- tm_map(corpus_total, toSpace, "\\|")

corpus_total = tm_map(corpus_total, removePunctuation)
corpus_total = tm_map(corpus_total, removeNumbers)
corpus_total = tm_map(corpus_total, tolower)
corpus_total = tm_map(corpus_total, removeWords, stopwords("english"))
corpus_total = tm_map(corpus_total, stemDocument, language="english")
corpus_total = tm_map(corpus_total, stripWhitespace)

as.character(corpus_total[[1]])
## [1] "newsgroup scime path cantaloupesrvcscmueducrabapplesrvcscmuedufsececmuedueuropaenggtefsdcomdarwinsuranetsgiblabsddhpcomdecwrldecwrluunetutcsriutnututzootellyproblemintaccb bedintaccuucp deb waddington subject info need gaucher diseas messageid marintaccuucp date thu mar gmt distribut everywher expir jun replyto bedintaccuucp deb waddington organ matrix artist network line yr old male friend misdiagnos osteopporosi two year recent found ill rare gaucher diseas gaucher diseas symptom includ brittl bone lost inch hieght enlarg liver spleen intern bleed fatigu time problem type attribut genet mutat lack enzym glucocerebrosid macrophag cell swell will eventu caus death enyzm replac therapi success develop approv fda last year patient administ drug call ceredas report remark improv condit ceredas manufactur biotech biggi companygenzymecost patient per year gaucher diseas justify call expens diseas world need info research gaucher diseas librari reli netland provid addit inform news stori report peopl know diseas idea articl genzym corp get hold enough money buy program avail help cost basic help can offer thank much deborah"

Constructs or coerces to a document-term matrix

dt_matrix <-DocumentTermMatrix(corpus_total, 
                               control=list(weighting=weightTfIdf, 
                                            minWordLength=2, 
                                            minDocFreq=5)) 

dt_matrix
## <<DocumentTermMatrix (documents: 2002, terms: 26949)>>
## Non-/sparse entries: 245109/53706789
## Sparsity           : 100%
## Maximal term length: 248
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
inspect(dt_matrix[1:4,1:5])
## <<DocumentTermMatrix (documents: 4, terms: 5)>>
## Non-/sparse entries: 7/13
## Sparsity           : 65%
## Maximal term length: 8
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##                Terms
## Docs               addit   administ     approv      articl     artist
##   content.57110 0.028778 0.04937842 0.02942611 0.006554607 0.05272619
##   content.58043 0.000000 0.00000000 0.00000000 0.012672240 0.00000000
##   content.58044 0.000000 0.00000000 0.00000000 0.000000000 0.00000000
##   content.58045 0.000000 0.00000000 0.00000000 0.004670359 0.00000000

Remove sparse terms from a document-term matrix

dt_matrix <- removeSparseTerms(dt_matrix, 0.75)

Display detailed information on a document-term matrix

inspect(dt_matrix[1:4,1:5])
## <<DocumentTermMatrix (documents: 4, terms: 5)>>
## Non-/sparse entries: 16/4
## Sparsity           : 20%
## Maximal term length: 6
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##                Terms
## Docs                 articl         can         date         get
##   content.57110 0.006554607 0.008315000 9.944649e-06 0.011284892
##   content.58043 0.012672240 0.000000000 1.922632e-05 0.000000000
##   content.58044 0.000000000 0.004808275 1.437661e-06 0.000000000
##   content.58045 0.004670359 0.005924693 3.542934e-06 0.008040832
##                Terms
## Docs                     gmt
##   content.57110 0.0020087533
##   content.58043 0.0038835897
##   content.58044 0.0002903980
##   content.58045 0.0007156492
dim(dt_matrix)
## [1] 2002   34

Applying SVD on Document-Term Matrix

svd = svd(as.matrix(dt_matrix))
matrix = svd$u
View(matrix)

Convert to dataframe

data = as.data.frame(matrix)
data <- data[apply(data, 1, function(x) !all(x==0)),]

Attach Class label

target = as.factor(c(rep('med',1000), rep('space',1000)))
data <- cbind(data,target)

# split the data into train and test
set.seed(586)

train_rows = createDataPartition(data$target, p = 0.8, list = F)

train_target = data[train_rows, names(data) %in% c("target")] 

train = data[train_rows,] 
test = data[-train_rows,]

Classification Task

Using Decision Tree

library(rpart)
rpart_model = rpart(train_target~.,train)
table(test$target,predict(rpart_model,test,type = "class"))
##        
##         med space
##   med   200     0
##   space   0   200

Using Naive Bayes

library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
nb_model = naiveBayes(x = train,y = train$target)
table(test$target,predict(nb_model,test[,!names(test) %in% c("target")],type = "class"))
##        
##         med space
##   med   155    45
##   space   8   192

Using knn classifier

Verifying the ratio of med and space

prop.table(table(data$target))
## 
##   med space 
##   0.5   0.5
prop.table(table(train$target))
## 
##   med space 
##   0.5   0.5
prop.table(table(test$target))
## 
##   med space 
##   0.5   0.5
# remove the target variable
train_withoutclass = subset(train,select=-c(target))
test_withoutclass = subset(test,select=-c(target))

# N = 1/3/5/7
library(class)
Neigh <-3
pred=knn(train_withoutclass, test_withoutclass, train$target, k = Neigh)
a=table(pred,test$target)
a
##        
## pred    med space
##   med   174    31
##   space  26   169
accu= sum(diag(a))/sum(a)
accu
## [1] 0.8575