Question 1: General-Purpose Functions

Q1a: The Function min-max (linear) Normalization

Arguments:

trData: The train-dataset.
minV: The minimum value of the new range.
maxV: The maximum value of the new range.
teData : The test-dataset, if it is available. Its default value is null.

minmaxNorm = function(trData, teData = NULL, minV, maxV)
{
  if(is.null(trData))
    return("trData Can't Be Null")
  
  checkTest = 0
  nameTes = NULL
  
  if(is.data.frame(trData) == FALSE)
      trData = as.data.frame(trData)
  
  if(!is.null(teData))
  {
    warning("It is expected that both trData and teData have the same colomn arrangement")
    checkTest = 1
    nameTes = names(teData)
    
    if(is.data.frame(teData) == FALSE)
      teData = as.data.frame(teData)
    
    a = ncol(trData)
    b = ncol(teData)
    if(a != b)
      return("Equal number of column is expected for both trData and teData in the same order")
  }

  nameTr = names(trData)
    
  if(is.null(minV))
    minV = 0
    
  if(is.null(maxV))
    maxV = 1
    
  diffNew = maxV - minV
    
  for(i in 1:ncol(trData))
  {
    col = trData[, i]
    max = max(col)
    min = min(col)
    diff = max - min
      
    for(j in 1:nrow(trData))
    {
      vi = trData[j, i]
      
      if(diff == 0)
        trData[j, i] = minV
      else
        trData[j, i] = (vi - min)/diff * (diffNew) + minV
    }
    
    maxAfterNorm = max(trData[, i])
    minAfterNorm = min(trData[, i])
    
    if(checkTest == 1)
    {
      for(k in 1:nrow(teData))
      {
        ci = teData[k, i]
        
        if(diff == 0)
          result = minV
        else
          result = (ci - min)/diff * (diffNew) + minV
        
        if(result > maxAfterNorm)
        {
          teData[k, i] = maxAfterNorm
        }
        else if (result < minAfterNorm)
        {
          teData[k, i] = minAfterNorm
        }
        else
        {
          teData[k, i] = result
        }
      }
    }
  }
    
  if(!is.null(nameTr))
    names(trData) = nameTr
  
  if(!is.null(nameTes))
      names(teData) = nameTes
  
  if(!is.null(trData) && !is.null(teData))
    return(list("trainData" = trData, "testData" = teData))
  else if(!is.null(trData))
    return(list("trainData" = trData))
  else
    return(list("testData" = testData))
}

Q1b: The Function z-score (Gaussian) Normalization

Arguments:

trData: The train-dataset.
madFlag: Mean absolute deviation flag. If true mean absolute deviation will be used in place of standard deviation. Its default value is false.
teData : The test-dataset, if it is available. Its default value is null.

zscoreNorm = function(trData, teData = NULL, madFlag = FALSE)
{
  if(is.null(trData))
    return("trData Can't Be Null")
  
  checkTest = 0
  nameTes = NULL
  
  if(is.data.frame(trData) == FALSE)
      trData = as.data.frame(trData)
  
  if(!is.null(teData))
  {
    warning("It is expected that both trData and teData have the same colomn arrangement")
    checkTest = 1
    nameTes = names(teData)
    
    if(is.data.frame(teData) == FALSE)
      teData = as.data.frame(teData)
    
    a = ncol(trData)
    b = ncol(teData)
    if(a != b)
      return("Equal number of column is expected for both trData and teData in the same order")
  }

  nameTr = names(trData)
    
  for(i in 1:ncol(trData))
  {
    col = trData[, i]
    mean = mean(col)
      
    if(madFlag == TRUE)
      {
      std = aad(col)
      }
    else
      {
      std = sd(col)
      }
      
    for(j in 1:nrow(trData))
    {
      vi = trData[j, i]
      
      if(std == 0)
        trData[j, i] = vi
      else
        trData[j, i] = (vi - mean)/std
    }
    
    maxAfterNorm = max(trData[, i])
    minAfterNorm = min(trData[, i])
    
    if(checkTest == 1)
    {
      for(k in 1:nrow(teData))
      {
        ci = teData[k, i]
        
        if(std == 0)
          result = ci
        else
          result = (ci - mean)/std
        
        if(result > maxAfterNorm)
        {
          teData[k, i] = maxAfterNorm
        }
        else if (result < minAfterNorm)
        {
          teData[k, i] = minAfterNorm
        }
        else
        {
          teData[k, i] = result
        }
      }
    }
  }
    
  if(!is.null(nameTr))
    names(trData) = nameTr
  
  if(!is.null(nameTes))
      names(teData) = nameTes
  
  if(!is.null(trData) && !is.null(teData))
    return(list("trainData" = trData, "testData" = teData))
  else if(!is.null(trData))
    return(list("trainData" = trData))
  else
    return(list("testData" = testData))
}

Question 2: Normalization (Testing the functions above)

data2 = c(200, 300, 400, 600, 1000)
data2

## [1]  200  300  400  600 1000

data2a = as.data.frame(minmaxNorm(data2, minV = 0, maxV = 1))
names(data2a) = "minmaxNorm"

data2b = as.data.frame(zscoreNorm(data2))
names(data2b) = "zscoreNorm"

data2c = as.data.frame(zscoreNorm(data2, NULL, TRUE))
names(data2c) = "zscoreNorm-MAD"

kable(cbind(data2a, data2b, data2c), row.names = F)

minmaxNorm	zscoreNorm	zscoreNorm-MAD
0.000	-0.9486833	-1.2500000
0.125	-0.6324555	-0.8333333
0.250	-0.3162278	-0.4166667
0.500	0.3162278	0.4166667
1.000	1.5811388	2.0833333

Question 3: Normalization (Petal length data)

name3 = c("sepLeng", "sepWid", "petLeng", "petWid", "class")
data3 = read.table(file = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", 
                   sep = ",", strip.white = T, col.names = name3)

v1 = data3$petLeng
v1

##   [1] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3
##  [18] 1.4 1.7 1.5 1.7 1.5 1.0 1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4
##  [35] 1.5 1.2 1.3 1.5 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7
##  [52] 4.5 4.9 4.0 4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4.0 4.7 3.6 4.4 4.5 4.1
##  [69] 4.5 3.9 4.8 4.0 4.9 4.7 4.3 4.4 4.8 5.0 4.5 3.5 3.8 3.7 3.9 5.1 4.5
##  [86] 4.5 4.7 4.4 4.1 4.0 4.4 4.6 4.0 3.3 4.2 4.2 4.2 4.3 3.0 4.1 6.0 5.1
## [103] 5.9 5.6 5.8 6.6 4.5 6.3 5.8 6.1 5.1 5.3 5.5 5.0 5.1 5.3 5.5 6.7 6.9
## [120] 5.0 5.7 4.9 6.7 4.9 5.7 6.0 4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1
## [137] 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9 5.7 5.2 5.0 5.2 5.4 5.1

v2 = c(1.95, 3.1, 5.68, 6.2)

Q3a

result3a = minmaxNorm(trData = v1, teData = v2, minV = -1, maxV = 1)
result3a$testData

##       teData
## 1 -0.6779661
## 2 -0.2881356
## 3  0.5864407
## 4  0.7627119

Q3b

result3b = zscoreNorm(trData = v1, teData = v2)
result3b$testData

##       teData
## 1 -1.0250769
## 2 -0.3733048
## 3  1.0889317
## 4  1.3836460

Q3c

par(mfrow = c(2,1))
hist(v1, main = "Histogram of Petal Length")
qqnorm(v1, main = "Normal Q-Q of Petal Length")

#boxplot(v1, main = "Histogram of Petal Length")

The visualizations above show that the underlying distribution of the data is not normal. Thus, it might not be adequate to use zscore normalization. Hence, we think min-max normalization is better.

Question 4: Distance Calculation

x1 = c(A = 1.4, B = 1.3, C = 2.9)
x2 = c(A = 1.8, B = 1.4, C = 3.2)
x3 = c(A = 1.3, B = 1.2, C = 2.9)
x4 = c(A = 0.9, B = 3.5, C = 3.1)
x5 = c(A = 1.5, B = 2.1, C = 3.3)

data4 = rbind(x1, x2, x3, x4, x5)
newPoint = c(A = 1.25, B = 1.78, C = 3.01)
data4; newPoint

##      A   B   C
## x1 1.4 1.3 2.9
## x2 1.8 1.4 3.2
## x3 1.3 1.2 2.9
## x4 0.9 3.5 3.1
## x5 1.5 2.1 3.3

##    A    B    C 
## 1.25 1.78 3.01

Q4a

Distance between point (`1.25, 1.78, 3.01`) and points `x1, x2, x3, x4, x5`.

dt4Manhat = as.data.frame(dist2(data4, newPoint, "manhattan"))
names(dt4Manhat) = "manhattan"

#dist(rbind(data4, newPoint), method="manhattan") #This is an alternative

dt4Eucl = as.data.frame(dist2(data4, newPoint, "euclidean"))
names(dt4Eucl) = "euclidean"

dt4Mink = as.data.frame(dist2(data4, newPoint, "minkowski", 3))
names(dt4Mink) = "minkowski"

dt4Supr = as.data.frame(dist2(data4, newPoint, "maximum"))
names(dt4Supr) = "supremum"

#dist2 didnt work for cosine similarity, thus using dist
#dt4Cos = dist(rbind(data4, newPoint), method="cosine") #This give a weird result
x1 <- c(1.4,1.3,2.9)
x2 <- c(1.8,1.4,3.2)
x3 <- c(1.3,1.2,2.9)
x4 <- c(0.9,3.5,3.1)
x5 <- c(1.5,2.1,3.3)
#New Data Point
data1 <- c(1.25,1.78,3.01)

cosine <- function(data1, data2)
{
  ((data1[1]*data2[1])+(data1[2]*data2[2])+(data1[3]*data2[3]))/(sqrt(data1[1]^2 + data1[2]^2 + data1[3]^2)
    *sqrt(data2[1]^2 + data2[2]^2 + data2[3]^2))
}

cosine = c(cosine(x1, data1), cosine(x2, data1), cosine(x3, data1), cosine(x4, data1), cosine(x5, data1)) 


dt4Cos = as.matrix(cosine)
dt4Cos = as.data.frame(dt4Cos)
names(dt4Cos) = "cosine"

kable(cbind(dt4Manhat, dt4Eucl, dt4Mink, dt4Supr, dt4Cos))

	manhattan	euclidean	minkowski	supremum	cosine
x1	0.74	0.5147815	0.4867141	0.48	0.9919753
x2	1.12	0.6949820	0.6110061	0.55	0.9850451
x3	0.74	0.5924525	0.5814392	0.58	0.9900603
x4	2.16	1.7575551	1.7248990	1.72	0.9436884
x5	0.86	0.4989990	0.4175175	0.32	0.9992686

Q4b

data4b = rbind(data4, newPoint)
data4b_Norm = minmaxNorm(trData = data4b, minV = 0, maxV = 1)
newpoint_Norm = as.matrix(data4b_Norm$trainData)["newPoint", ]
new_data4b = as.matrix(data4b_Norm$trainData)[1:5, ]

The new data point is x = (0.3888889, 0.2521739, 0.275) and the the data set is

new_data4b

##            A          B    C
## x1 0.5555556 0.04347826 0.00
## x2 1.0000000 0.08695652 0.75
## x3 0.4444444 0.00000000 0.00
## x4 0.0000000 1.00000000 0.50
## x5 0.6666667 0.39130435 1.00

dt4bEucl = as.data.frame(dist2(new_data4b, newpoint_Norm, "euclidean"))
names(dt4bEucl) = "euclidean"
kable(dt4bEucl)

	euclidean
x1	0.3833493
x2	0.7914408
x3	0.3772308
x4	0.8724124
x5	0.7887603

Question 5

K-means Clustering

Q5a

#Prepare Data
sampe <- data.frame("Sample"= 1:8, "X1" = c(1,1,0,2,5,6,4,5), "X2" = c(4,3,4,5,1,2,0,2)
              , "Initial.Groups" = c(1,1,2,2,1,2,1,2))

#X1 on x-axis, X2 on y-axis
plot(sampe$X1, sampe$X2, xlab = "X1", ylab = "X2")

Q5b

#Plot of X1, X2 colored by the initial grouping
xyplot(X2~X1, sampe, groups=sampe$Initial.Groups)

Centroid 1 (2.75, 2)

Centroid 2 (3.25, 3.25)

Q5c

C1 = Centroid Group 1
C2 = Centroid Group 2

Sample	Centroid Cluster (Euclidean Distance)
1	C2
2	C1
3	C2
4	C2
5	C1
6	C2
7	C1
8	C2

Q5d

sampe$First.Groups <- c(2,1,2,2,1,2,1,2) #Groupings from part c)

New Centroid 1 (3.3333333,1.3333333)

New Centroid 2 (2.8,3.4)

Sample	New Centroid Clusters (Euclidean Distance)
1	C2
2	C2
3	C2
4	C2
5	C1
6	C1
7	C1
8	C1

sampe$Second.Groups <- c(2,2,2,2,1,1,1,1) #New Groups

New Centroid 1 (5,1.25)

New Centroid 2 (1,4)

Clusters remain the same, but centroids appear more centered and stable.

Sample	New Centroid Clusters (Euclidean Distance)
1	C2
2	C2
3	C2
4	C2
5	C1
6	C1
7	C1
8	C1

Q5e

#Groups didn't change
groupings <- xyplot(X2~X1, sampe, groups=sampe$Second.Groups)
#Plot with Centroid Points Added
centers <- data.frame("CX"= c(5,1), "CY"=c(1.25,4))
layover <- xyplot(CY~CX, centers, col="black")
groupings + as.layer(layover)

Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-11

Question 1: General-Purpose Functions

Q1a: The Function min-max (linear) Normalization

Q1b: The Function z-score (Gaussian) Normalization

Question 2: Normalization (Testing the functions above)

Question 3: Normalization (Petal length data)

Q3a

Q3b

Q3c

Question 4: Distance Calculation

Q4a

Distance between point (`1.25, 1.78, 3.01`) and points `x1, x2, x3, x4, x5`.

Q4b

Question 5

Q5a

Q5b

Q5c

Q5d

Q5e

The centroid for the pink cluster is overlaying one of the data points because they have the same coordinates.

Question 6

Q6(a,b)

Q6c

Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-11

Question 1: General-Purpose Functions

Q1a: The Function min-max (linear) Normalization

Q1b: The Function z-score (Gaussian) Normalization

Question 2: Normalization (Testing the functions above)

Question 3: Normalization (Petal length data)

Q3a

Q3b

Q3c

Question 4: Distance Calculation

Q4a

Distance between point (1.25, 1.78, 3.01) and points x1, x2, x3, x4, x5.

Q4b

Question 5

Q5a

Q5b

Q5c

Q5d

Q5e

The centroid for the pink cluster is overlaying one of the data points because they have the same coordinates.

Question 6

Q6(a,b)

Q6c

Distance between point (`1.25, 1.78, 3.01`) and points `x1, x2, x3, x4, x5`.