Clustering Large Text Factors

#Clustering Text Factors from big data using String Distance
#install.packages("stringdist")
library(stringdist) # string distance metrics (e.g. Jaro-Winkler) for comparing long text values

## Warning: package 'stringdist' was built under R version 3.1.2

library(RCurl)

## Warning: package 'RCurl' was built under R version 3.1.2

## Loading required package: bitops

# Fetch the raw fuel-economy CSV over HTTPS as a single string, then parse
# that string in memory rather than reading from a file on disk.
urlData <- getURL(
  "https://raw.githubusercontent.com/hadley/fueleconomy/master/data-raw/vehicles.csv"
)
vehicles <- read.csv(text = urlData)

# Row count and column names of the downloaded table.
nrow(vehicles)
names(vehicles)

## [1] 34631

##  [1] "barrels08"       "barrelsA08"      "charge120"      
##  [4] "charge240"       "city08"          "city08U"        
##  [7] "cityA08"         "cityA08U"        "cityCD"         
## [10] "cityE"           "cityUF"          "co2"            
## [13] "co2A"            "co2TailpipeAGpm" "co2TailpipeGpm" 
## [16] "comb08"          "comb08U"         "combA08"        
## [19] "combA08U"        "combE"           "combinedCD"     
## [22] "combinedUF"      "cylinders"       "displ"          
## [25] "drive"           "engId"           "eng_dscr"       
## [28] "feScore"         "fuelCost08"      "fuelCostA08"    
## [31] "fuelType"        "fuelType1"       "ghgScore"       
## [34] "ghgScoreA"       "highway08"       "highway08U"     
## [37] "highwayA08"      "highwayA08U"     "highwayCD"      
## [40] "highwayE"        "highwayUF"       "hlv"            
## [43] "hpv"             "id"              "lv2"            
## [46] "lv4"             "make"            "model"          
## [49] "mpgData"         "phevBlended"     "pv2"            
## [52] "pv4"             "range"           "rangeCity"      
## [55] "rangeCityA"      "rangeHwy"        "rangeHwyA"      
## [58] "trany"           "UCity"           "UCityA"         
## [61] "UHighway"        "UHighwayA"       "VClass"         
## [64] "year"            "youSaveSpend"    "guzzler"        
## [67] "trans_dscr"      "tCharger"        "sCharger"       
## [70] "atvType"         "fuelType2"       "rangeA"         
## [73] "evMotor"         "mfrCode"

# Narrow the analysis to a single text feature: the car model name.
# Count how many distinct model names the full data set contains.
length(unique(vehicles[["model"]]))

## [1] 3234

# Sample data: work on the first 2000 rows of the full table.
# head() returns at most 2000 rows, so a shorter table is not padded with
# all-NA rows the way vehicles[1:2000, ] would be.
vehicles_small <- head(vehicles, 2000)

# Count distinct values of the model column. unique() on the whole data frame
# de-duplicates rows, and length() of a data frame is its number of columns,
# so the message would otherwise report the column count (74), not the number
# of model names.
print(paste('we have', length(unique(vehicles_small$model)),
            'unique model names'))

## [1] "we have 481 unique model names"

# Function: stringdistmatrix, for grouping strings based on similarity or
# distance metrics between them.
# 1. Jaro-Winkler distance
# Case 1: 200 groups
uniquemodels <- as.character(vehicles_small$model)
uniquemodels <- unique(uniquemodels)

# Number of distinct model names in the sample, and the first ten of them.
length(uniquemodels)
head(uniquemodels, n = 10)

## [1] 481

##  [1] "Spider Veloce 2000"  "Testarossa"          "Charger"            
##  [4] "B150/B250 Wagon 2WD" "Legacy AWD Turbo"    "Loyale"             
##  [7] "Corolla"             "Golf III / GTI"      "Jetta III"          
## [10] "240"

# Distances (method "jw") between every pair of distinct model names; the rows
# are labelled with the model names so the labels carry through as.dist().
distancemodels <- stringdistmatrix(uniquemodels, uniquemodels, method = "jw")
rownames(distancemodels) <- uniquemodels

# Hierarchical clustering on those distances, cut into 200 groups, stored as
# one row per model name with its cluster id.
hc <- hclust(as.dist(distancemodels))
dfClust <- data.frame(modelname = uniquemodels, cluster = cutree(hc, k = 200))

# Number of models in each cluster.
plot(table(dfClust$cluster))

avg_models_per_cluster <- mean(table(dfClust$cluster))
print(paste('Average number of models per cluster:', avg_models_per_cluster))

## [1] "Average number of models per cluster: 2.405"

# Order to show the biggest clusters at the top.
# Per-cluster model counts, plus each cluster's share of all models. The
# intermediate is not called `t`, which would shadow base::t().
cluster_sizes <- table(dfClust$cluster)
cluster_stats <- cbind(cluster_sizes, cluster_sizes / length(dfClust$cluster))
# drop = FALSE keeps a matrix even if there is only a single cluster.
cluster_stats <- cluster_stats[order(cluster_stats[, 2], decreasing = TRUE), ,
                               drop = FALSE]
t1 <- data.frame(factorName = rownames(cluster_stats),
                 binCount = cluster_stats[, 1],
                 percentFound = cluster_stats[, 2])

# Attach the size columns to every model row, keyed on the cluster id.
dfClust <- merge(x = dfClust, y = t1, by.x = 'cluster', by.y = 'factorName',
                 all.x = TRUE)
# Largest clusters first; rev(order(...)) is kept (rather than
# order(decreasing = TRUE)) so ties come out in the same order as before.
dfClust <- dfClust[rev(order(dfClust$binCount)), ]
# No renaming here: after the merge the frame has four columns (cluster,
# modelname, binCount, percentFound); assigning only two names would leave
# the last two columns named NA.
head(dfClust[c('cluster', 'modelname')], 30)

##     cluster              modelname
## 192      73       K1500 Pickup 4WD
## 191      73         S10 Pickup 2WD
## 190      73        W250 Pickup 4WD
## 189      73        F150 Pickup 2WD
## 188      73         S10 Pickup 4WD
## 187      73   D100/D150 Pickup 2WD
## 186      73        F250 Pickup 2WD
## 185      73       C1500 Pickup 2WD
## 184      73        F150 Pickup 4WD
## 183      73        D250 Pickup 2WD
## 182      73   W100/W150 Pickup 4WD
## 341     123 Postal Cab Chassis 2WD
## 340     123    S10 Cab Chassis 2WD
## 339     123 Dakota Cab Chassis 2WD
## 338     123        Cab/Chassis 2WD
## 337     123        Cab Chassis 2WD
## 336     123    S15 Cab Chassis 2WD
## 335     123  Truck Cab Chassis 2WD
## 334     123   D250 Cab Chassis 2WD
## 236      84         Yukon 1500 4WD
## 235      84       Suburban C10 2WD
## 234      84            SJ 410V 4WD
## 233      84      Suburban 1500 2WD
## 232      84       Suburban K10 4WD
## 231      84        Yukon K1500 4WD
## 230      84             SJ 410 4WD
## 229      84      Suburban 1500 4WD
## 365     130        900 Convertible
## 364     130       318i Convertible
## 363     130            Convertible

Clustering Large Text Factors

SA