CS 698 – Lab 4

Please write a program to compute how many words (separated by spaces) are in the “product_title” for each product, and create a new column “numofwords” to store the results.

library(readr)
# Load the training data from train.csv in the working directory. The code
# below reads its product_title, search_term and relevance columns.
train <- read_csv("train.csv")
# Put the columns of train on the search path so the rest of the script can
# refer to them by bare name (product_title, search_term, relevance).
# NOTE(review): attach() exposes a copy of the columns; later modifications to
# train are not reflected in the attached names.
attach(train)

# Loop version: count the whitespace-separated words in each product title.
# The result vector is preallocated instead of being grown from c(), and
# seq_along() keeps the loop from running when product_title is empty.
numofwords <- integer(length(product_title))
for (i in seq_along(product_title)) {
  # gregexpr() returns the start position of every run of non-space
  # characters, or a single -1 when the title contains none. Counting only
  # the positive positions gives 0 for an empty/blank title (taking length()
  # of the match vector would report 1) and NA for a missing title.
  word_starts <- gregexpr("\\S+", product_title[i])[[1]]
  numofwords[i] <- sum(word_starts > 0L)
}

# Vectorised version: gregexpr() is called once on the whole column and returns
# one vector of match positions per title. Matching "\\S+" (runs of non-space
# characters) rather than single spaces means repeated spaces between words do
# not inflate the count.
# A title with no words yields a single -1 from gregexpr(), so only positive
# positions are counted (0 words, not 1). vapply() pins the result to integer.
numofwords <- vapply(
  gregexpr("\\S+", product_title),
  function(word_starts) sum(word_starts > 0L),
  integer(1)
)
# Store the counts as the new "numofwords" column requested by the exercise.
train$numofwords <- numofwords

# Time the element-by-element loop for comparison with the vectorised
# alternatives below. The result vector is preallocated and seq_along() is
# used so the loop is skipped when product_title is empty.
numofwords <- integer(length(product_title))
system.time(for (i in seq_along(product_title)) {
  # gregexpr() gives a single -1 when the title has no words; counting only
  # positive start positions reports 0 words for such a title instead of 1.
  word_starts <- gregexpr("\\S+", product_title[i])[[1]]
  numofwords[i] <- sum(word_starts > 0L)
})

##    user  system elapsed 
##    6.58    0.02    6.61

# Time the sapply() variant: one gregexpr() call over the whole column, then
# the length of each per-title match vector.
system.time(
  sapply(X = gregexpr(pattern = "\\S+", text = product_title), FUN = length)
)

##    user  system elapsed 
##    1.26    0.00    1.26

# Time the mapply() variant: length is applied to each per-title match vector
# returned by a single gregexpr() call.
system.time(
  mapply(FUN = length, gregexpr(pattern = "\\S+", text = product_title))
)

##    user  system elapsed 
##    1.34    0.00    1.35

# Time the vapply() variant: same computation, with the per-title result
# declared up front as a single numeric value.
system.time(
  vapply(
    X = gregexpr(pattern = "\\S+", text = product_title),
    FUN = length,
    FUN.VALUE = numeric(1)
  )
)

##    user  system elapsed 
##    1.34    0.00    1.34

library(stringi)
# For every row, count the distinct words that appear in both the product
# title and the search term, then average that count per relevance score.
# The result vector is preallocated and seq_along() keeps the loop from
# running when there are no rows.
count <- integer(length(relevance))
for (i in seq_along(relevance)) {
  # One vector of \w+ tokens for the title and one for the search term.
  tokens <- stri_extract_all_regex(
    c(product_title[i], search_term[i]),
    "\\w+"
  )
  shared <- intersect(tokens[[1]], tokens[[2]])
  # stri_extract_all_regex() returns NA for a string with no tokens (or a
  # missing string); drop it so two token-less strings do not count as
  # sharing one "word".
  count[i] <- sum(!is.na(shared))
}
# NOTE(review): the comparison is case-sensitive, so "Angle" and "angle" are
# treated as different words -- confirm whether that is intended.
tapply(count, relevance, mean)

##         1      1.25      1.33       1.5      1.67      1.75         2 
## 0.2517815 0.0000000 0.2867598 0.0000000 0.2464602 0.5555556 0.2734868 
##      2.25      2.33       2.5      2.67      2.75         3 
## 0.2727273 0.2846202 0.1578947 0.2927246 0.0000000 0.2786928

CS 698 – Lab 4

Yalin Zhu

February 25, 2016