library(DT)
library(knitr)
library(ISLR)
library(corrplot)
library(plotrix)
library(reshape)
library(sqldf)
library(MASS)
library(lsr)
library(flexclust)
library(proxy)
library(lattice)
library(latticeExtra)
library(RColorBrewer)
Arguments:
minmaxNorm = function(trData, teData = NULL, minV, maxV)
{
if(is.null(trData))
return("trData Can't Be Null")
checkTest = 0
nameTes = NULL
if(is.data.frame(trData) == FALSE)
trData = as.data.frame(trData)
if(!is.null(teData))
{
warning("It is expected that both trData and teData have the same colomn arrangement")
checkTest = 1
nameTes = names(teData)
if(is.data.frame(teData) == FALSE)
teData = as.data.frame(teData)
a = ncol(trData)
b = ncol(teData)
if(a != b)
return("Equal number of column is expected for both trData and teData in the same order")
}
nameTr = names(trData)
if(is.null(minV))
minV = 0
if(is.null(maxV))
maxV = 1
diffNew = maxV - minV
for(i in 1:ncol(trData))
{
col = trData[, i]
max = max(col)
min = min(col)
diff = max - min
for(j in 1:nrow(trData))
{
vi = trData[j, i]
if(diff == 0)
trData[j, i] = minV
else
trData[j, i] = (vi - min)/diff * (diffNew) + minV
}
maxAfterNorm = max(trData[, i])
minAfterNorm = min(trData[, i])
if(checkTest == 1)
{
for(k in 1:nrow(teData))
{
ci = teData[k, i]
if(diff == 0)
result = minV
else
result = (ci - min)/diff * (diffNew) + minV
if(result > maxAfterNorm)
{
teData[k, i] = maxAfterNorm
}
else if (result < minAfterNorm)
{
teData[k, i] = minAfterNorm
}
else
{
teData[k, i] = result
}
}
}
}
if(!is.null(nameTr))
names(trData) = nameTr
if(!is.null(nameTes))
names(teData) = nameTes
if(!is.null(trData) && !is.null(teData))
return(list("trainData" = trData, "testData" = teData))
else if(!is.null(trData))
return(list("trainData" = trData))
else
return(list("testData" = testData))
}
Arguments:
zscoreNorm = function(trData, teData = NULL, madFlag = FALSE)
{
if(is.null(trData))
return("trData Can't Be Null")
checkTest = 0
nameTes = NULL
if(is.data.frame(trData) == FALSE)
trData = as.data.frame(trData)
if(!is.null(teData))
{
warning("It is expected that both trData and teData have the same colomn arrangement")
checkTest = 1
nameTes = names(teData)
if(is.data.frame(teData) == FALSE)
teData = as.data.frame(teData)
a = ncol(trData)
b = ncol(teData)
if(a != b)
return("Equal number of column is expected for both trData and teData in the same order")
}
nameTr = names(trData)
for(i in 1:ncol(trData))
{
col = trData[, i]
mean = mean(col)
if(madFlag == TRUE)
{
std = aad(col)
}
else
{
std = sd(col)
}
for(j in 1:nrow(trData))
{
vi = trData[j, i]
if(std == 0)
trData[j, i] = vi
else
trData[j, i] = (vi - mean)/std
}
maxAfterNorm = max(trData[, i])
minAfterNorm = min(trData[, i])
if(checkTest == 1)
{
for(k in 1:nrow(teData))
{
ci = teData[k, i]
if(std == 0)
result = ci
else
result = (ci - mean)/std
if(result > maxAfterNorm)
{
teData[k, i] = maxAfterNorm
}
else if (result < minAfterNorm)
{
teData[k, i] = minAfterNorm
}
else
{
teData[k, i] = result
}
}
}
}
if(!is.null(nameTr))
names(trData) = nameTr
if(!is.null(nameTes))
names(teData) = nameTes
if(!is.null(trData) && !is.null(teData))
return(list("trainData" = trData, "testData" = teData))
else if(!is.null(trData))
return(list("trainData" = trData))
else
return(list("testData" = testData))
}
data2 = c(200, 300, 400, 600, 1000)
data2
## [1] 200 300 400 600 1000
data2a = as.data.frame(minmaxNorm(data2, minV = 0, maxV = 1))
names(data2a) = "minmaxNorm"
data2b = as.data.frame(zscoreNorm(data2))
names(data2b) = "zscoreNorm"
data2c = as.data.frame(zscoreNorm(data2, NULL, TRUE))
names(data2c) = "zscoreNorm-MAD"
kable(cbind(data2a, data2b, data2c), row.names = F)
| minmaxNorm | zscoreNorm | zscoreNorm-MAD |
|---|---|---|
| 0.000 | -0.9486833 | -1.2500000 |
| 0.125 | -0.6324555 | -0.8333333 |
| 0.250 | -0.3162278 | -0.4166667 |
| 0.500 | 0.3162278 | 0.4166667 |
| 1.000 | 1.5811388 | 2.0833333 |
name3 = c("sepLeng", "sepWid", "petLeng", "petWid", "class")
data3 = read.table(file = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
sep = ",", strip.white = T, col.names = name3)
v1 = data3$petLeng
v1
## [1] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3
## [18] 1.4 1.7 1.5 1.7 1.5 1.0 1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4
## [35] 1.5 1.2 1.3 1.5 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7
## [52] 4.5 4.9 4.0 4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4.0 4.7 3.6 4.4 4.5 4.1
## [69] 4.5 3.9 4.8 4.0 4.9 4.7 4.3 4.4 4.8 5.0 4.5 3.5 3.8 3.7 3.9 5.1 4.5
## [86] 4.5 4.7 4.4 4.1 4.0 4.4 4.6 4.0 3.3 4.2 4.2 4.2 4.3 3.0 4.1 6.0 5.1
## [103] 5.9 5.6 5.8 6.6 4.5 6.3 5.8 6.1 5.1 5.3 5.5 5.0 5.1 5.3 5.5 6.7 6.9
## [120] 5.0 5.7 4.9 6.7 4.9 5.7 6.0 4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1
## [137] 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9 5.7 5.2 5.0 5.2 5.4 5.1
v2 = c(1.95, 3.1, 5.68, 6.2)
result3a = minmaxNorm(trData = v1, teData = v2, minV = -1, maxV = 1)
result3a$testData
## teData
## 1 -0.6779661
## 2 -0.2881356
## 3 0.5864407
## 4 0.7627119
result3b = zscoreNorm(trData = v1, teData = v2)
result3b$testData
## teData
## 1 -1.0250769
## 2 -0.3733048
## 3 1.0889317
## 4 1.3836460
par(mfrow = c(2,1))
hist(v1, main = "Histogram of Petal Length")
qqnorm(v1, main = "Normal Q-Q of Petal Length")
#boxplot(v1, main = "Histogram of Petal Length")
The visualizations above show that the underlying distribution of the data is not normal. Thus, it might not be adequate to use zscore normalization. Hence, we think min-max normalization is better.
x1 = c(A = 1.4, B = 1.3, C = 2.9)
x2 = c(A = 1.8, B = 1.4, C = 3.2)
x3 = c(A = 1.3, B = 1.2, C = 2.9)
x4 = c(A = 0.9, B = 3.5, C = 3.1)
x5 = c(A = 1.5, B = 2.1, C = 3.3)
data4 = rbind(x1, x2, x3, x4, x5)
newPoint = c(A = 1.25, B = 1.78, C = 3.01)
data4; newPoint
## A B C
## x1 1.4 1.3 2.9
## x2 1.8 1.4 3.2
## x3 1.3 1.2 2.9
## x4 0.9 3.5 3.1
## x5 1.5 2.1 3.3
## A B C
## 1.25 1.78 3.01
1.25, 1.78, 3.01) and points x1, x2, x3, x4, x5.dt4Manhat = as.data.frame(dist2(data4, newPoint, "manhattan"))
names(dt4Manhat) = "manhattan"
#dist(rbind(data4, newPoint), method="manhattan") #This is an alternative
dt4Eucl = as.data.frame(dist2(data4, newPoint, "euclidean"))
names(dt4Eucl) = "euclidean"
dt4Mink = as.data.frame(dist2(data4, newPoint, "minkowski", 3))
names(dt4Mink) = "minkowski"
dt4Supr = as.data.frame(dist2(data4, newPoint, "maximum"))
names(dt4Supr) = "supremum"
#dist2 didnt work for cosine similarity, thus using dist
#dt4Cos = dist(rbind(data4, newPoint), method="cosine") #This give a weird result
x1 <- c(1.4,1.3,2.9)
x2 <- c(1.8,1.4,3.2)
x3 <- c(1.3,1.2,2.9)
x4 <- c(0.9,3.5,3.1)
x5 <- c(1.5,2.1,3.3)
#New Data Point
data1 <- c(1.25,1.78,3.01)
cosine <- function(data1, data2)
{
((data1[1]*data2[1])+(data1[2]*data2[2])+(data1[3]*data2[3]))/(sqrt(data1[1]^2 + data1[2]^2 + data1[3]^2)
*sqrt(data2[1]^2 + data2[2]^2 + data2[3]^2))
}
cosine = c(cosine(x1, data1), cosine(x2, data1), cosine(x3, data1), cosine(x4, data1), cosine(x5, data1))
dt4Cos = as.matrix(cosine)
dt4Cos = as.data.frame(dt4Cos)
names(dt4Cos) = "cosine"
kable(cbind(dt4Manhat, dt4Eucl, dt4Mink, dt4Supr, dt4Cos))
| manhattan | euclidean | minkowski | supremum | cosine | |
|---|---|---|---|---|---|
| x1 | 0.74 | 0.5147815 | 0.4867141 | 0.48 | 0.9919753 |
| x2 | 1.12 | 0.6949820 | 0.6110061 | 0.55 | 0.9850451 |
| x3 | 0.74 | 0.5924525 | 0.5814392 | 0.58 | 0.9900603 |
| x4 | 2.16 | 1.7575551 | 1.7248990 | 1.72 | 0.9436884 |
| x5 | 0.86 | 0.4989990 | 0.4175175 | 0.32 | 0.9992686 |
data4b = rbind(data4, newPoint)
data4b_Norm = minmaxNorm(trData = data4b, minV = 0, maxV = 1)
newpoint_Norm = as.matrix(data4b_Norm$trainData)["newPoint", ]
new_data4b = as.matrix(data4b_Norm$trainData)[1:5, ]
The new data point is x = (0.3888889, 0.2521739, 0.275) and the the data set is
new_data4b
## A B C
## x1 0.5555556 0.04347826 0.00
## x2 1.0000000 0.08695652 0.75
## x3 0.4444444 0.00000000 0.00
## x4 0.0000000 1.00000000 0.50
## x5 0.6666667 0.39130435 1.00
dt4bEucl = as.data.frame(dist2(new_data4b, newpoint_Norm, "euclidean"))
names(dt4bEucl) = "euclidean"
kable(dt4bEucl)
| euclidean | |
|---|---|
| x1 | 0.3833493 |
| x2 | 0.7914408 |
| x3 | 0.3772308 |
| x4 | 0.8724124 |
| x5 | 0.7887603 |
K-means Clustering
#Prepare Data
sampe <- data.frame("Sample"= 1:8, "X1" = c(1,1,0,2,5,6,4,5), "X2" = c(4,3,4,5,1,2,0,2)
, "Initial.Groups" = c(1,1,2,2,1,2,1,2))
#X1 on x-axis, X2 on y-axis
plot(sampe$X1, sampe$X2, xlab = "X1", ylab = "X2")
#Plot of X1, X2 colored by the initial grouping
xyplot(X2~X1, sampe, groups=sampe$Initial.Groups)
Centroid 1 (2.75, 2)
Centroid 2 (3.25, 3.25)
C1 = Centroid Group 1
C2 = Centroid Group 2
| Sample | Centroid Cluster (Euclidean Distance) |
|---|---|
| 1 | C2 |
| 2 | C1 |
| 3 | C2 |
| 4 | C2 |
| 5 | C1 |
| 6 | C2 |
| 7 | C1 |
| 8 | C2 |
sampe$First.Groups <- c(2,1,2,2,1,2,1,2) #Groupings from part c)
New Centroid 1 (3.3333333,1.3333333)
New Centroid 2 (2.8,3.4)
| Sample | New Centroid Clusters (Euclidean Distance) |
|---|---|
| 1 | C2 |
| 2 | C2 |
| 3 | C2 |
| 4 | C2 |
| 5 | C1 |
| 6 | C1 |
| 7 | C1 |
| 8 | C1 |
sampe$Second.Groups <- c(2,2,2,2,1,1,1,1) #New Groups
New Centroid 1 (5,1.25)
New Centroid 2 (1,4)
Clusters remain the same, but centroids appear more centered and stable.
| Sample | New Centroid Clusters (Euclidean Distance) |
|---|---|
| 1 | C2 |
| 2 | C2 |
| 3 | C2 |
| 4 | C2 |
| 5 | C1 |
| 6 | C1 |
| 7 | C1 |
| 8 | C1 |
#Groups didn't change
groupings <- xyplot(X2~X1, sampe, groups=sampe$Second.Groups)
#Plot with Centroid Points Added
centers <- data.frame("CX"= c(5,1), "CY"=c(1.25,4))
layover <- xyplot(CY~CX, centers, col="black")
groupings + as.layer(layover)
Hierarchical Clustering
Manually Cluster
Complete Linkage Dendrogram Clusters: (C1,C2)(C1,C3)(C4)
Single Linkage Dendrogram Clusters: (C1, C2, C5)(C4)(C3)