Exploring the Crowdflower Data

We’ll start by using the readr package to read in the train and test sets.

library(readr)
# Work from the project directory so the relative data paths below resolve.
# NOTE(review): this absolute path is specific to one machine.
setwd("/Volumes/master/Users/shin/dan/projects/kaggle/Crowdflower Search Results Relevance/")
# Read the training and test sets into data frames
train <- read_csv(file = "data/train.csv")
test <- read_csv(file = "data/test.csv")
# Per-column overview of the training set
summary(object = train)
##        id           query           product_title      product_description
##  Min.   :    1   Length:10158       Length:10158       Length:10158       
##  1st Qu.: 8079   Class :character   Class :character   Class :character   
##  Median :16350   Mode  :character   Mode  :character   Mode  :character   
##  Mean   :16353                                                            
##  3rd Qu.:24571                                                            
##  Max.   :32668                                                            
##  median_relevance relevance_variance
##  Min.   :1.00     Min.   :0.0000    
##  1st Qu.:3.00     1st Qu.:0.0000    
##  Median :4.00     Median :0.4710    
##  Mean   :3.31     Mean   :0.3779    
##  3rd Qu.:4.00     3rd Qu.:0.4710    
##  Max.   :4.00     Max.   :1.4700

We can see that train has 6 columns. median_relevance is the one that this competition’s trying to predict.

# Column names of the training set
colnames(train)
## [1] "id"                  "query"               "product_title"      
## [4] "product_description" "median_relevance"    "relevance_variance"

On the other hand, test has 4 columns.

# Column names of the test set
colnames(test)
## [1] "id"                  "query"               "product_title"      
## [4] "product_description"

Test doesn’t have the two columns that provide direct information on the target variable.

# setdiff(A, B) returns the elements of A that are not in B.
# Columns present in train but missing from test
setdiff(colnames(train), colnames(test))
## [1] "median_relevance"   "relevance_variance"
# Number of columns present in test but missing from train
length(setdiff(colnames(test), colnames(train)))
## [1] 0

Let’s look at the number of rows in the training and test sets.

# Number of rows in the training set
dim(train)[1L]
## [1] 10158
# Number of rows in the test set
dim(test)[1L]
## [1] 22513

Here, the training set has about 10k rows, and the test set is about twice as big as the training set.

Let’s look at the queries in the train set now.

# First ten distinct queries, in order of first appearance in the training set
unique(train[["query"]])[seq_len(10)]
##  [1] "bridal shower decorations" "led christmas lights"     
##  [3] "projector"                 "wine rack"                
##  [5] "light bulb"                "oakley polarized radar"   
##  [7] "boyfriend jeans"           "screen protector samsung" 
##  [9] "pots and pans set"         "waffle maker"
# How many distinct queries does the training set contain?
sum(!duplicated(train[["query"]]))
## [1] 261
# How many distinct queries does the test set contain?
sum(!duplicated(test[["query"]]))
## [1] 261
# How many training queries never occur in the test set?
# (setdiff() already de-duplicates its result.)
length(setdiff(train[["query"]], test[["query"]]))
## [1] 0

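It looks like all the queries we see in the training set are also in the test set. Since both sets contain 261 unique queries, the train and test sets share exactly the same queries.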

Now let’s look at the product titles

# First ten distinct product titles, in order of first appearance in the training set
unique(train[["product_title"]])[seq_len(10)]
##  [1] "Accent Pillow with Heart Design - Red/Black"                                                           
##  [2] "Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire"                              
##  [3] "ViewSonic Pro8200 DLP Multimedia Projector"                                                            
##  [4] "Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle"          
##  [5] "Wintergreen Lighting Christmas LED Light Bulb (Pack of 25)"                                            
##  [6] "Oakley Sunglasses - Radar Path Polished Black/Gray Sunglasses"                                         
##  [7] "How To Make An American Quilt (DVD)"                                                                   
##  [8] "ZAGG InvisibleShield Cell Phone Screen Protector for Samsung Galaxy S4 Mini"                           
##  [9] "Cook N Home Stainless Steel 4-Piece Pasta Cooker/ Steamer Multi-pots with Encapsulated Bottom, 8-Quart"
## [10] "Presto FlipSide Electric Waffle Maker- 03510"
# Count the distinct product titles in the training set
sum(!duplicated(train[["product_title"]]))
## [1] 9708
# Count the training-set product titles that never appear in the test set
# (setdiff(A, B) keeps the elements of A that are not in B)
length(setdiff(train[["product_title"]], test[["product_title"]]))
## [1] 8680
# Count the product titles shared by the train and test sets
length(intersect(train[["product_title"]], test[["product_title"]]))
## [1] 1028

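This tells us that most product titles appear only once in the training set (9,708 unique titles across 10,158 rows), and that the titles are mostly different between the two sets: 8,680 of the 9,708 unique training titles never appear in the test set, while only 1,028 are shared.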

Now let’s start with some basic text analysis on the queries. First, we’ll create a helper function that plots the ten most frequent words in a set of documents.

# We'll use the library ggvis for data visualization
# http://ggvis.rstudio.com/
library(ggvis)
# And the library tm to help with text processing
library(tm)

# plot_word_counts: plot counts of word occurrences in a set of documents.
#
# Args:
#   documents: character vector of text documents (e.g. queries or titles).
#
# Returns:
#   A ggvis bar chart of the most frequent words (at most ten), ordered from
#   the most frequent to the least frequent.
plot_word_counts <- function(documents) {
  # Missing values contain no words, so drop them before building the corpus
  documents <- documents[!is.na(documents)]
  # Keep only unique documents and convert them to lowercase
  corpus <- Corpus(VectorSource(tolower(unique(documents))))
  # Remove punctuation from the documents
  corpus <- tm_map(corpus, removePunctuation)
  # Remove English stopwords, such as "the" and "a". Stop words are the most
  # common words in a language; tm_map() forwards the word list to removeWords.
  # Original author's note: there is no stopword list for Japanese or Korean.
  corpus <- tm_map(corpus, removeWords, stopwords("english"))

  # Sum each term's column of the document-term matrix to get its total count.
  # colSums() works on the matrix directly, so no data-frame copy is needed.
  term_totals <- colSums(as.matrix(DocumentTermMatrix(corpus)))
  word_counts <- data.frame(Words = names(term_totals), Counts = term_totals)
  # Sort from the most frequent words to the least frequent words
  word_counts <- word_counts[order(word_counts$Counts, decreasing = TRUE), ]

  # head() returns at most ten rows; indexing with [1:10, ] would pad the
  # result with NA rows whenever fewer than ten distinct words are available
  top_words <- head(word_counts, 10)
  # Fix the factor levels in sorted order so the bars keep that order
  top_words$Words <- factor(top_words$Words, levels = top_words$Words)

  # Plot the most frequent words with ggvis
  top_words %>%
    ggvis(~Words, ~Counts) %>%
    layer_bars(fill := "#20beff")
}

Now, we’ll apply that function to look at the most common terms in the query, the product title, and the product description.

# Most frequent words across all train and test queries
plot_word_counts(c(train[["query"]], test[["query"]]))

# Most frequent words in the product titles, computed on a random sample of
# 1000 titles for computational reasons; the seed fixes which ones are drawn
set.seed(0)
plot_word_counts(
  sample(c(train[["product_title"]], test[["product_title"]]), size = 1000)
)

# Most frequent words in the product descriptions, again on a random sample of
# 1000 for computational reasons. The seed is not reset here, so this draw
# continues the random stream started above.
plot_word_counts(
  sample(
    c(train[["product_description"]], test[["product_description"]]),
    size = 1000
  )
)