For the Document Classification assignment, I use Craigslist housing listings and classify them into the categories they belong to. I have specifically chosen Staten Island, NY housing listings. Since this is my first experiment with text mining, I followed an approach similar to the one described in Chapter 10 of the class book to get a better understanding of how classification works.

# Packages that will be used later
library(RCurl)
library(XML)
library(stringr)
library(tm)
library(RTextTools)
library(SnowballC)

Web scraping 500 listings and writing each one to a local file.

# collect the links to roughly 500 housing listings from the first 5 result pages
all_links <- character()
new_results <- "https://newyork.craigslist.org/search/stn/hhh"
for(i in 1:5){
  results <- getURL(new_results)
  results_tree <- htmlParse(results)
  # grab every listing link on the current results page
  all_links <- c(all_links, xpathSApply(results_tree, "//a[@class='hdrlnk']", xmlGetAttr, "href"))
  # follow the "next page" link on the following iteration
  new_results <- xpathSApply(results_tree, "//link[@rel='next']", xmlGetAttr, "href")
}

# download each listing page and save it to the local drive
for(i in 1:length(all_links)){
  url <- str_c("https://newyork.craigslist.org", all_links[i])
  tmp <- getURL(url)
  write(tmp, str_c("c://housing//", i, ".html"))
}
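
A quick check I would add here (not part of the original run) is to confirm that every collected link actually produced a saved file before moving on:

# the number of saved pages should match the number of collected links
length(list.files("c://housing//")) == length(all_links)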

Processing the files to extract the textual data needed for the corpus

# parse out the listing text and its category from each saved page
n <- 0
for (i in 1:length(list.files("c://housing//"))){
  tmp <- readLines(str_c("c://housing//", i, ".html"))
  tmp <- str_c(tmp, collapse = " ")
  tmp <- htmlParse(tmp)
  listing <- xpathSApply(tmp, "//section[@id='postingbody']", xmlValue)
  category <- xpathSApply(tmp, "//li[@class='crumb category']//a", xmlValue)

  # keep only pages that actually contain a posting body
  if (length(listing) != 0){
    n <- n + 1
    if (n == 1) {
      # the first valid listing starts the corpus
      listing_c <- Corpus(VectorSource(listing))
    } else {
      # later listings are appended as one-document corpora
      tmp_corpus <- Corpus(VectorSource(listing))
      listing_c <- c(listing_c, tmp_corpus)
    }
    # store the Craigslist category as document metadata
    meta(listing_c[[n]], "category") <- category[1]
  }
}

# Place the categories in a vector. This will be used to label the training data
category_labels <- meta(listing_c[[1]])$category
for (i in 2:length(listing_c)){
  category_labels <- c(category_labels, meta(listing_c[[i]])$category)
}
head(category_labels)
## [1] "apts by owner"           "rooms & shares"         
## [3] "sublets & temporary"     "real estate - by broker"
## [5] "office & commercial"     "real estate - by broker"
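
Before training, it helps to look at how the categories are distributed across the listings, since a skewed distribution makes the raw accuracy numbers later on harder to interpret. A small check I am adding here (output not shown, as it was not part of the original run):

# how many listings fall into each category
table(category_labels)
prop.table(table(category_labels))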

Removing unwanted data from the corpus

# Data cleanup: drop numbers and English stop words, then stem the remaining words
listing_corpus <- listing_c
listing_corpus <- tm_map(listing_corpus, removeNumbers)
listing_corpus <- tm_map(listing_corpus, removeWords, words = stopwords("en"))
listing_corpus <- tm_map(listing_corpus, stemDocument)
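
The cleanup above leaves punctuation and mixed case in place. As a possible extension (my suggestion, not part of the original run), tm also provides content_transformer(tolower), removePunctuation, and stripWhitespace, which could be slotted into the pipeline above before the stemming step:

# optional extra cleaning steps, to be applied before stemDocument
listing_corpus <- tm_map(listing_corpus, content_transformer(tolower))
listing_corpus <- tm_map(listing_corpus, removePunctuation)
listing_corpus <- tm_map(listing_corpus, stripWhitespace)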

Creating a document-term matrix

dtm <- DocumentTermMatrix(listing_corpus)
dtm
## <<DocumentTermMatrix (documents: 500, terms: 5298)>>
## Non-/sparse entries: 30779/2618221
## Sparsity           : 99%
## Maximal term length: 108
## Weighting          : term frequency (tf)

Removing sparse terms: with the threshold below, terms that appear in only a handful of the 500 documents (roughly 10 or fewer) are dropped, since such rare terms are often typos or one-off words that add noise without helping classification.

dtm <- removeSparseTerms(dtm, 1-(10/length(listing_corpus)))
dtm
## <<DocumentTermMatrix (documents: 500, terms: 482)>>
## Non-/sparse entries: 20831/220169
## Sparsity           : 91%
## Maximal term length: 28
## Weighting          : term frequency (tf)
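
To sanity-check what survives the pruning, tm's findFreqTerms() lists the terms above a chosen overall frequency (the threshold of 50 below is an arbitrary value for illustration):

# terms that occur at least 50 times across the corpus
findFreqTerms(dtm, lowfreq = 50)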

Creating a container where the first 250 documents are used for training and the remaining 250 (documents 251 through 500) for testing

container <- create_container(
  dtm,
  labels = category_labels,
  trainSize = 1:250,
  testSize = 251:500,
  virgin = FALSE
)
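
One caveat with this split: documents 1 to 250 versus 251 to 500 follow the order in which the listings were scraped, so the two halves may not contain the same mix of categories. An alternative worth trying (not used for the results below) is to shuffle the documents before building the container:

# randomized train/test split (sketch only; the results below use the ordered split)
set.seed(123)                           # arbitrary seed for reproducibility
idx <- sample(nrow(dtm))                # random permutation of the document indices
container_shuffled <- create_container(
  dtm[idx, ],
  labels = category_labels[idx],
  trainSize = 1:250,
  testSize = 251:500,
  virgin = FALSE
)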

Training three supervised models (a support vector machine, a decision tree, and maximum entropy) and using them to classify the test documents

svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)

head(svm_out)
##                 SVM_LABEL  SVM_PROB
## 1           apts by owner 0.2437859
## 2 real estate - by broker 0.3952220
## 3 real estate - by broker 0.2583396
## 4 real estate - by broker 0.6466964
## 5 real estate - by broker 0.7249674
## 6     office & commercial 0.3026604
head(tree_out)
##                TREE_LABEL TREE_PROB
## 1  real estate - by owner 0.4000000
## 2 real estate - by broker 0.6111111
## 3 real estate - by broker 0.6111111
## 4 real estate - by broker 0.6250000
## 5 real estate - by broker 0.6250000
## 6     office & commercial 0.7777778
head(maxent_out)
##          MAXENTROPY_LABEL MAXENTROPY_PROB
## 1        vacation rentals       1.0000000
## 2     office & commercial       1.0000000
## 3     office & commercial       1.0000000
## 4 real estate - by broker       1.0000000
## 5 real estate - by broker       1.0000000
## 6          rooms & shares       0.9984689
labels_out <- data.frame(
    correct_label = category_labels[251:500],
    svm = as.character(svm_out[,1]),
    tree = as.character(tree_out[,1]),
    maxent = as.character(maxent_out[,1]),
    stringsAsFactors = F)

table(labels_out[,1]==labels_out[,2])
## 
## FALSE  TRUE 
##   139   111
prop.table(table(labels_out[,1]==labels_out[,2]))
## 
## FALSE  TRUE 
## 0.556 0.444
table(labels_out[,1]==labels_out[,3])
## 
## FALSE  TRUE 
##   176    74
prop.table(table(labels_out[,1]==labels_out[,3]))
## 
## FALSE  TRUE 
## 0.704 0.296
table(labels_out[,1]==labels_out[,4])
## 
## FALSE  TRUE 
##   120   130
prop.table(table(labels_out[,1]==labels_out[,4]))
## 
## FALSE  TRUE 
##  0.48  0.52

None of the three models classifies the documents particularly well. The maximum entropy model comes closest at 52% accuracy: out of 250 test listings, it labeled 130 correctly. The SVM followed at about 44% (111 of 250), and the decision tree did worst at about 30% (74 of 250). Several factors could explain the weak results. One is that many listings simply do not contain enough text to analyze or compare, so we might need more (and longer) training documents before running the models again.
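
As a follow-up, RTextTools can also summarize precision, recall, and F-scores per label and per algorithm through create_analytics(), which would show which categories drive most of the errors. A minimal sketch, assuming the container and the three classification results from above are still in memory:

# combine the classifier outputs and summarize their performance
analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))
summary(analytics)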