#Clustering Text Factors from big data using String Distance
#install.packages("stringdist")
library(stringdist) #feature selection for long text
## Warning: package 'stringdist' was built under R version 3.1.2
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.1.2
## Loading required package: bitops
# Download the raw vehicles CSV as text, then parse that text into a data frame.
urlData <- getURL(
  "https://raw.githubusercontent.com/hadley/fueleconomy/master/data-raw/vehicles.csv"
)
vehicles <- read.csv(text = urlData)
# Quick look at the table: row count first, then the column names.
nrow(vehicles)
names(vehicles)
## [1] 34631
## [1] "barrels08" "barrelsA08" "charge120"
## [4] "charge240" "city08" "city08U"
## [7] "cityA08" "cityA08U" "cityCD"
## [10] "cityE" "cityUF" "co2"
## [13] "co2A" "co2TailpipeAGpm" "co2TailpipeGpm"
## [16] "comb08" "comb08U" "combA08"
## [19] "combA08U" "combE" "combinedCD"
## [22] "combinedUF" "cylinders" "displ"
## [25] "drive" "engId" "eng_dscr"
## [28] "feScore" "fuelCost08" "fuelCostA08"
## [31] "fuelType" "fuelType1" "ghgScore"
## [34] "ghgScoreA" "highway08" "highway08U"
## [37] "highwayA08" "highwayA08U" "highwayCD"
## [40] "highwayE" "highwayUF" "hlv"
## [43] "hpv" "id" "lv2"
## [46] "lv4" "make" "model"
## [49] "mpgData" "phevBlended" "pv2"
## [52] "pv4" "range" "rangeCity"
## [55] "rangeCityA" "rangeHwy" "rangeHwyA"
## [58] "trany" "UCity" "UCityA"
## [61] "UHighway" "UHighwayA" "VClass"
## [64] "year" "youSaveSpend" "guzzler"
## [67] "trans_dscr" "tCharger" "sCharger"
## [70] "atvType" "fuelType2" "rangeA"
## [73] "evMotor" "mfrCode"
# Focus on a single feature: the car model name.
length(unique(vehicles$model))
## [1] 3234
# Work on a sample: the first 2000 rows only.
vehicles_small <- vehicles[1:2000, ]
# Count the distinct values of the `model` column. unique() on a whole data
# frame de-duplicates rows, and length() of a data frame is its number of
# columns, so the column has to be selected explicitly to count model names.
print(paste('we have', length(unique(vehicles_small$model)), 'unique model names'))
## [1] "we have 481 unique model names"
#Function: stringdistmatrix for grouping based on similarity or distance metrics between strings
#1 Jaro-Winkler distance (method = "jw"; with stringdist's default p = 0 this is the plain Jaro distance)
#case1: 200 groups
# Distinct model names of the sample, as plain character strings.
uniquemodels <- as.character(unique(vehicles_small$model))
# Show how many there are, then preview the first ten.
length(uniquemodels)
head(uniquemodels, n = 10)
## [1] 481
## [1] "Spider Veloce 2000" "Testarossa" "Charger"
## [4] "B150/B250 Wagon 2WD" "Legacy AWD Turbo" "Loyale"
## [7] "Corolla" "Golf III / GTI" "Jetta III"
## [10] "240"
# Pairwise string distances between all model names (method "jw").
distancemodels <- stringdistmatrix(uniquemodels, uniquemodels, method = "jw")
# Name the rows of the distance matrix after the models.
rownames(distancemodels) <- uniquemodels
# Hierarchical clustering on those distances, using hclust's default settings.
hc <- hclust(as.dist(distancemodels))
# Cut the tree into 200 groups and pair every model name with its group id.
dfClust <- data.frame(modelname = uniquemodels, cluster = cutree(hc, k = 200))
# Plot how many model names fall into each cluster.
plot(table(dfClust$cluster))

# Mean cluster size = number of model names / number of clusters.
print(paste('Average number of models per cluster:', mean(table(dfClust$cluster))))
## [1] "Average number of models per cluster: 2.405"
# Rank the clusters by size so the largest ones are listed first.
# Column 1 of `t` is the number of models in a cluster, column 2 is that
# count divided by the total number of models (a fraction, not a percentage).
t <- table(dfClust$cluster)
t <- cbind(t, t / length(dfClust$cluster))
# drop = FALSE keeps `t` a matrix even if there is only a single cluster.
t <- t[order(t[, 2], decreasing = TRUE), , drop = FALSE]
t1 <- data.frame(factorName = rownames(t), binCount = t[, 1], percentFound = t[, 2])
# Attach each cluster's size and share to every model row, keeping all models.
dfClust <- merge(x = dfClust, y = t1, by.x = 'cluster', by.y = 'factorName', all.x = TRUE)
# Largest clusters first; rev(order()) is kept so tied rows stay in the same
# (reversed) order as before.
dfClust <- dfClust[rev(order(dfClust$binCount)), ]
# Rename only the first two columns. Assigning a 2-element names vector to the
# whole 4-column frame would set the binCount/percentFound names to NA.
names(dfClust)[1:2] <- c('cluster', 'modelname')
head(dfClust[c('cluster', 'modelname')], 30)
## cluster modelname
## 192 73 K1500 Pickup 4WD
## 191 73 S10 Pickup 2WD
## 190 73 W250 Pickup 4WD
## 189 73 F150 Pickup 2WD
## 188 73 S10 Pickup 4WD
## 187 73 D100/D150 Pickup 2WD
## 186 73 F250 Pickup 2WD
## 185 73 C1500 Pickup 2WD
## 184 73 F150 Pickup 4WD
## 183 73 D250 Pickup 2WD
## 182 73 W100/W150 Pickup 4WD
## 341 123 Postal Cab Chassis 2WD
## 340 123 S10 Cab Chassis 2WD
## 339 123 Dakota Cab Chassis 2WD
## 338 123 Cab/Chassis 2WD
## 337 123 Cab Chassis 2WD
## 336 123 S15 Cab Chassis 2WD
## 335 123 Truck Cab Chassis 2WD
## 334 123 D250 Cab Chassis 2WD
## 236 84 Yukon 1500 4WD
## 235 84 Suburban C10 2WD
## 234 84 SJ 410V 4WD
## 233 84 Suburban 1500 2WD
## 232 84 Suburban K10 4WD
## 231 84 Yukon K1500 4WD
## 230 84 SJ 410 4WD
## 229 84 Suburban 1500 4WD
## 365 130 900 Convertible
## 364 130 318i Convertible
## 363 130 Convertible