Cleaning

# Load the price table and coerce every measurement column to numeric.
dfweed <- read.csv("weedprices20062014.csv", header = TRUE)

# The price columns contain "$" characters, which are stripped before the
# numeric conversion (gsub() works on the labels even for factor columns).
price_cols <- c("HighQ", "MedQ", "LowQ")
dfweed[price_cols] <- lapply(
  dfweed[price_cols],
  function(x) as.numeric(gsub("\\$", "", x))
)

# The *N columns: convert via as.character() so that a column that happened
# to be read in as a factor is converted by its labels, not by its integer
# level codes (as.numeric() on a factor returns the codes).
n_cols <- c("HighQN", "MedQN", "LowQN")
dfweed[n_cols] <- lapply(
  dfweed[n_cols],
  function(x) as.numeric(as.character(x))
)

# Print the resulting structure to confirm the column types.
str(dfweed)
## 'data.frame':    51 obs. of  7 variables:
##  $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ HighQ : num  341 289 303 350 245 ...
##  $ HighQN: num  1327 316 2308 717 14406 ...
##  $ MedQ  : num  205 262 209 184 192 ...
##  $ MedQN : num  1184 379 2029 677 15892 ...
##  $ LowQ  : num  148 372 186 126 189 ...
##  $ LowQN : num  152 31 273 134 938 158 102 39 47 614 ...
library(robustHD)
## Loading required package: ggplot2
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
# Standardize every column except the first one (State), which is not numeric.
std <- standardize(dfweed[, -1])

Cluster

# Use the state names as row names of the standardized data.
rownames(std) <- dfweed$State

# Pairwise distance matrix between states, using the Canberra metric
# (Euclidean and Manhattan didn't give me much info).
d <- dist(std, method = "canberra")

# Agglomerative clustering with average linkage, then draw the dendrogram.
hc <- hclust(d, method = "average")
plot(hc)

# Row positions in dfweed that make up the first cluster.
# NOTE(review): presumably read off the dendrogram by hand -- confirm.
cl1_rows <- c(
  3, 5, 6, 10:11, 14:15, 19, 25, 27:28, 30, 34:35, 38:39, 44, 47:48
)

# First cluster: the listed rows, in their original order.
cl1 <- dfweed[cl1_rows, ]

# Second cluster: every remaining row. Deriving it by negative indexing
# (instead of a second hand-typed list) guarantees that cl1 and cl2 never
# overlap and together cover all of dfweed.
cl2 <- dfweed[-cl1_rows, ]

# Per-column summary statistics for the first cluster (cl1).
summary(cl1)
##         State        HighQ           HighQN           MedQ      
##  Arizona   : 1   Min.   :232.3   Min.   : 1312   Min.   :190.0  
##  California: 1   1st Qu.:302.8   1st Qu.: 2676   1st Qu.:213.1  
##  Colorado  : 1   Median :338.2   Median : 3461   Median :244.4  
##  Florida   : 1   Mean   :324.7   Mean   : 4348   Mean   :241.3  
##  Georgia   : 1   3rd Qu.:353.4   3rd Qu.: 4760   3rd Qu.:272.4  
##  Illinois  : 1   Max.   :368.7   Max.   :14406   Max.   :292.9  
##  (Other)   :13                                                  
##      MedQN            LowQ           LowQN       
##  Min.   : 1112   Min.   :138.8   Min.   : 134.0  
##  1st Qu.: 2376   1st Qu.:148.5   1st Qu.: 253.5  
##  Median : 3055   Median :185.7   Median : 273.0  
##  Mean   : 4129   Mean   :182.9   Mean   : 386.4  
##  3rd Qu.: 4279   3rd Qu.:213.7   3rd Qu.: 456.5  
##  Max.   :15892   Max.   :273.4   Max.   :1026.0  
## 
# Per-column summary statistics for the second cluster (cl2).
summary(cl2)
##                   State        HighQ           HighQN      
##  Alabama             : 1   Min.   :209.0   Min.   : 109.0  
##  Alaska              : 1   1st Qu.:316.0   1st Qu.: 455.0  
##  Arkansas            : 1   Median :344.6   Median : 665.5  
##  Connecticut         : 1   Mean   :334.4   Mean   : 889.5  
##  Delaware            : 1   3rd Qu.:358.7   3rd Qu.:1275.2  
##  District of Columbia: 1   Max.   :401.1   Max.   :2403.0  
##  (Other)             :26                                   
##       MedQ           MedQN             LowQ            LowQN       
##  Min.   :146.2   Min.   : 158.0   Min.   : 93.93   Min.   : 12.00  
##  1st Qu.:222.6   1st Qu.: 441.2   1st Qu.:144.34   1st Qu.: 36.25  
##  Median :251.5   Median : 581.5   Median :188.45   Median : 65.50  
##  Mean   :252.6   Mean   : 824.2   Mean   :216.40   Mean   : 82.19  
##  3rd Qu.:276.5   3rd Qu.:1178.0   3rd Qu.:234.45   3rd Qu.:128.00  
##  Max.   :363.3   Max.   :1992.0   Max.   :657.98   Max.   :182.00  
## 
# Compare the two clusters on HighQN and on MedQN, one figure per column.
# Each figure gets a title and group labels; without them the boxes are only
# numbered 1 and 2 and the two figures cannot be told apart.
for (n_col in c("HighQN", "MedQN")) {
  boxplot(
    cl1[[n_col]], cl2[[n_col]],
    names = c("cl1", "cl2"),
    main = paste(n_col, "by cluster")
  )
}

It appears that the clusters were formed mainly by differences in MedQN: the median MedQN is 3055 in the first cluster versus 581.5 in the second.