Cleaning
# Load the price table and coerce every price/count column to numeric.
# stringsAsFactors is pinned so that `State` is a factor on every R version
# (the read.csv default changed in R 4.0); the summaries below count states
# per level and therefore rely on it being a factor.
dfweed <- read.csv("weedprices20062014.csv", header = TRUE,
                   stringsAsFactors = TRUE)

# Price columns carry a "$" sign that has to be stripped before conversion.
price_cols <- c("HighQ", "MedQ", "LowQ")
# Count columns are expected to hold plain numbers already.
count_cols <- c("HighQN", "MedQN", "LowQN")

for (col in price_cols) {
  # gsub() coerces its input with as.character(), so factors are handled.
  dfweed[[col]] <- as.numeric(gsub("\\$", "", dfweed[[col]]))
}
for (col in count_cols) {
  # Convert via character: as.numeric() applied directly to a factor returns
  # the internal level codes rather than the values that were read.
  dfweed[[col]] <- as.numeric(as.character(dfweed[[col]]))
}
str(dfweed)
## 'data.frame': 51 obs. of 7 variables:
## $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ HighQ : num 341 289 303 350 245 ...
## $ HighQN: num 1327 316 2308 717 14406 ...
## $ MedQ : num 205 262 209 184 192 ...
## $ MedQN : num 1184 379 2029 677 15892 ...
## $ LowQ : num 148 372 186 126 189 ...
## $ LowQN : num 152 31 273 134 938 158 102 39 47 614 ...
library(robustHD)
## Loading required package: ggplot2
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
# Standardize every column except the first one (State) with robustHD.
std <- standardize(dfweed[, -1])
Cluster
# Label each standardized row with its state name.
rownames(std) <- dfweed$State

# Pairwise Canberra distances between states; Euclidean and Manhattan were
# tried as well but did not give much information.
# NOTE(review): ?dist describes Canberra as intended for non-negative values,
# while `std` presumably holds centered (signed) values -- confirm this is OK.
d <- dist(x = std, method = "canberra")

# Hierarchical clustering with average linkage, then draw the dendrogram.
hc <- hclust(d = d, method = "average")
plot(hc)
# Split the states into two clusters using hand-picked row numbers of dfweed.
# Each cluster is taken with a single index vector, so rows keep their
# original order and row names.
cl1 <- dfweed[
  c(3, 5:6, 10:11, 14:15, 19, 25, 27:28, 30, 34:35, 38:39, 44, 47:48),
]
cl2 <- dfweed[
  c(1:2, 4, 7:9, 12:13, 16:18, 20:24, 26, 29, 31:33, 36:37, 40:43, 45:46,
    49:51),
]
summary(cl1)
## State HighQ HighQN MedQ
## Arizona : 1 Min. :232.3 Min. : 1312 Min. :190.0
## California: 1 1st Qu.:302.8 1st Qu.: 2676 1st Qu.:213.1
## Colorado : 1 Median :338.2 Median : 3461 Median :244.4
## Florida : 1 Mean :324.7 Mean : 4348 Mean :241.3
## Georgia : 1 3rd Qu.:353.4 3rd Qu.: 4760 3rd Qu.:272.4
## Illinois : 1 Max. :368.7 Max. :14406 Max. :292.9
## (Other) :13
## MedQN LowQ LowQN
## Min. : 1112 Min. :138.8 Min. : 134.0
## 1st Qu.: 2376 1st Qu.:148.5 1st Qu.: 253.5
## Median : 3055 Median :185.7 Median : 273.0
## Mean : 4129 Mean :182.9 Mean : 386.4
## 3rd Qu.: 4279 3rd Qu.:213.7 3rd Qu.: 456.5
## Max. :15892 Max. :273.4 Max. :1026.0
##
# Per-column summary of the second cluster, for comparison with summary(cl1).
summary(cl2)
## State HighQ HighQN
## Alabama : 1 Min. :209.0 Min. : 109.0
## Alaska : 1 1st Qu.:316.0 1st Qu.: 455.0
## Arkansas : 1 Median :344.6 Median : 665.5
## Connecticut : 1 Mean :334.4 Mean : 889.5
## Delaware : 1 3rd Qu.:358.7 3rd Qu.:1275.2
## District of Columbia: 1 Max. :401.1 Max. :2403.0
## (Other) :26
## MedQ MedQN LowQ LowQN
## Min. :146.2 Min. : 158.0 Min. : 93.93 Min. : 12.00
## 1st Qu.:222.6 1st Qu.: 441.2 1st Qu.:144.34 1st Qu.: 36.25
## Median :251.5 Median : 581.5 Median :188.45 Median : 65.50
## Mean :252.6 Mean : 824.2 Mean :216.40 Mean : 82.19
## 3rd Qu.:276.5 3rd Qu.:1178.0 3rd Qu.:234.45 3rd Qu.:128.00
## Max. :363.3 Max. :1992.0 Max. :657.98 Max. :182.00
##
# Compare the two clusters on the HighQN and MedQN columns.
# Group names, titles and y-axis labels are set explicitly: without them both
# plots only show groups "1" and "2" and cannot be told apart.
boxplot(cl1$HighQN, cl2$HighQN,
        names = c("cl1", "cl2"), main = "HighQN by cluster", ylab = "HighQN")
boxplot(cl1$MedQN, cl2$MedQN,
        names = c("cl1", "cl2"), main = "MedQN by cluster", ylab = "MedQN")
It appears that the clusters were formed mainly from differences in MedQN: the median MedQN is 3055 in cluster 1 versus 581.5 in cluster 2.