Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-11

library(DT)
library(knitr)
library(ISLR)
library(corrplot)
library(plotrix)
library(reshape)
library(sqldf)
library(MASS)
library(lsr)
library(flexclust)
library(proxy)
library(lattice)
library(latticeExtra)
library(RColorBrewer)

Question 1: General-Purpose Functions

Q1a: The Function min-max (linear) Normalization

Arguments:

minmaxNorm = function(trData, teData = NULL, minV, maxV)
{
  if(is.null(trData))
    return("trData Can't Be Null")
  
  checkTest = 0
  nameTes = NULL
  
  if(is.data.frame(trData) == FALSE)
      trData = as.data.frame(trData)
  
  if(!is.null(teData))
  {
    warning("It is expected that both trData and teData have the same colomn arrangement")
    checkTest = 1
    nameTes = names(teData)
    
    if(is.data.frame(teData) == FALSE)
      teData = as.data.frame(teData)
    
    a = ncol(trData)
    b = ncol(teData)
    if(a != b)
      return("Equal number of column is expected for both trData and teData in the same order")
  }

  nameTr = names(trData)
    
  if(is.null(minV))
    minV = 0
    
  if(is.null(maxV))
    maxV = 1
    
  diffNew = maxV - minV
    
  for(i in 1:ncol(trData))
  {
    col = trData[, i]
    max = max(col)
    min = min(col)
    diff = max - min
      
    for(j in 1:nrow(trData))
    {
      vi = trData[j, i]
      
      if(diff == 0)
        trData[j, i] = minV
      else
        trData[j, i] = (vi - min)/diff * (diffNew) + minV
    }
    
    maxAfterNorm = max(trData[, i])
    minAfterNorm = min(trData[, i])
    
    if(checkTest == 1)
    {
      for(k in 1:nrow(teData))
      {
        ci = teData[k, i]
        
        if(diff == 0)
          result = minV
        else
          result = (ci - min)/diff * (diffNew) + minV
        
        if(result > maxAfterNorm)
        {
          teData[k, i] = maxAfterNorm
        }
        else if (result < minAfterNorm)
        {
          teData[k, i] = minAfterNorm
        }
        else
        {
          teData[k, i] = result
        }
      }
    }
  }
    
  if(!is.null(nameTr))
    names(trData) = nameTr
  
  if(!is.null(nameTes))
      names(teData) = nameTes
  
  if(!is.null(trData) && !is.null(teData))
    return(list("trainData" = trData, "testData" = teData))
  else if(!is.null(trData))
    return(list("trainData" = trData))
  else
    return(list("testData" = testData))
}

Q1b: The Function z-score (Gaussian) Normalization

Arguments:

zscoreNorm = function(trData, teData = NULL, madFlag = FALSE)
{
  if(is.null(trData))
    return("trData Can't Be Null")
  
  checkTest = 0
  nameTes = NULL
  
  if(is.data.frame(trData) == FALSE)
      trData = as.data.frame(trData)
  
  if(!is.null(teData))
  {
    warning("It is expected that both trData and teData have the same colomn arrangement")
    checkTest = 1
    nameTes = names(teData)
    
    if(is.data.frame(teData) == FALSE)
      teData = as.data.frame(teData)
    
    a = ncol(trData)
    b = ncol(teData)
    if(a != b)
      return("Equal number of column is expected for both trData and teData in the same order")
  }

  nameTr = names(trData)
    
  for(i in 1:ncol(trData))
  {
    col = trData[, i]
    mean = mean(col)
      
    if(madFlag == TRUE)
      {
      std = aad(col)
      }
    else
      {
      std = sd(col)
      }
      
    for(j in 1:nrow(trData))
    {
      vi = trData[j, i]
      
      if(std == 0)
        trData[j, i] = vi
      else
        trData[j, i] = (vi - mean)/std
    }
    
    maxAfterNorm = max(trData[, i])
    minAfterNorm = min(trData[, i])
    
    if(checkTest == 1)
    {
      for(k in 1:nrow(teData))
      {
        ci = teData[k, i]
        
        if(std == 0)
          result = ci
        else
          result = (ci - mean)/std
        
        if(result > maxAfterNorm)
        {
          teData[k, i] = maxAfterNorm
        }
        else if (result < minAfterNorm)
        {
          teData[k, i] = minAfterNorm
        }
        else
        {
          teData[k, i] = result
        }
      }
    }
  }
    
  if(!is.null(nameTr))
    names(trData) = nameTr
  
  if(!is.null(nameTes))
      names(teData) = nameTes
  
  if(!is.null(trData) && !is.null(teData))
    return(list("trainData" = trData, "testData" = teData))
  else if(!is.null(trData))
    return(list("trainData" = trData))
  else
    return(list("testData" = testData))
}

Question 2: Normalization (Testing the functions above)

data2 = c(200, 300, 400, 600, 1000)
data2
## [1]  200  300  400  600 1000
data2a = as.data.frame(minmaxNorm(data2, minV = 0, maxV = 1))
names(data2a) = "minmaxNorm"

data2b = as.data.frame(zscoreNorm(data2))
names(data2b) = "zscoreNorm"

data2c = as.data.frame(zscoreNorm(data2, NULL, TRUE))
names(data2c) = "zscoreNorm-MAD"

kable(cbind(data2a, data2b, data2c), row.names = F)
minmaxNorm zscoreNorm zscoreNorm-MAD
0.000 -0.9486833 -1.2500000
0.125 -0.6324555 -0.8333333
0.250 -0.3162278 -0.4166667
0.500 0.3162278 0.4166667
1.000 1.5811388 2.0833333

Question 3: Normalization (Petal length data)

name3 = c("sepLeng", "sepWid", "petLeng", "petWid", "class")
data3 = read.table(file = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", 
                   sep = ",", strip.white = T, col.names = name3)

v1 = data3$petLeng
v1
##   [1] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3
##  [18] 1.4 1.7 1.5 1.7 1.5 1.0 1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4
##  [35] 1.5 1.2 1.3 1.5 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7
##  [52] 4.5 4.9 4.0 4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4.0 4.7 3.6 4.4 4.5 4.1
##  [69] 4.5 3.9 4.8 4.0 4.9 4.7 4.3 4.4 4.8 5.0 4.5 3.5 3.8 3.7 3.9 5.1 4.5
##  [86] 4.5 4.7 4.4 4.1 4.0 4.4 4.6 4.0 3.3 4.2 4.2 4.2 4.3 3.0 4.1 6.0 5.1
## [103] 5.9 5.6 5.8 6.6 4.5 6.3 5.8 6.1 5.1 5.3 5.5 5.0 5.1 5.3 5.5 6.7 6.9
## [120] 5.0 5.7 4.9 6.7 4.9 5.7 6.0 4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1
## [137] 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9 5.7 5.2 5.0 5.2 5.4 5.1
v2 = c(1.95, 3.1, 5.68, 6.2)

Q3a

result3a = minmaxNorm(trData = v1, teData = v2, minV = -1, maxV = 1)
result3a$testData
##       teData
## 1 -0.6779661
## 2 -0.2881356
## 3  0.5864407
## 4  0.7627119

Q3b

result3b = zscoreNorm(trData = v1, teData = v2)
result3b$testData
##       teData
## 1 -1.0250769
## 2 -0.3733048
## 3  1.0889317
## 4  1.3836460

Q3c

par(mfrow = c(2,1))
hist(v1, main = "Histogram of Petal Length")
qqnorm(v1, main = "Normal Q-Q of Petal Length")

#boxplot(v1, main = "Histogram of Petal Length")

The visualizations above show that the underlying distribution of the data is not normal. Thus, it might not be adequate to use zscore normalization. Hence, we think min-max normalization is better.


Question 4: Distance Calculation

x1 = c(A = 1.4, B = 1.3, C = 2.9)
x2 = c(A = 1.8, B = 1.4, C = 3.2)
x3 = c(A = 1.3, B = 1.2, C = 2.9)
x4 = c(A = 0.9, B = 3.5, C = 3.1)
x5 = c(A = 1.5, B = 2.1, C = 3.3)

data4 = rbind(x1, x2, x3, x4, x5)
newPoint = c(A = 1.25, B = 1.78, C = 3.01)
data4; newPoint
##      A   B   C
## x1 1.4 1.3 2.9
## x2 1.8 1.4 3.2
## x3 1.3 1.2 2.9
## x4 0.9 3.5 3.1
## x5 1.5 2.1 3.3
##    A    B    C 
## 1.25 1.78 3.01

Q4a

Distance between point (1.25, 1.78, 3.01) and points x1, x2, x3, x4, x5.

dt4Manhat = as.data.frame(dist2(data4, newPoint, "manhattan"))
names(dt4Manhat) = "manhattan"

#dist(rbind(data4, newPoint), method="manhattan") #This is an alternative

dt4Eucl = as.data.frame(dist2(data4, newPoint, "euclidean"))
names(dt4Eucl) = "euclidean"

dt4Mink = as.data.frame(dist2(data4, newPoint, "minkowski", 3))
names(dt4Mink) = "minkowski"

dt4Supr = as.data.frame(dist2(data4, newPoint, "maximum"))
names(dt4Supr) = "supremum"

#dist2 didnt work for cosine similarity, thus using dist
#dt4Cos = dist(rbind(data4, newPoint), method="cosine") #This give a weird result
x1 <- c(1.4,1.3,2.9)
x2 <- c(1.8,1.4,3.2)
x3 <- c(1.3,1.2,2.9)
x4 <- c(0.9,3.5,3.1)
x5 <- c(1.5,2.1,3.3)
#New Data Point
data1 <- c(1.25,1.78,3.01)

cosine <- function(data1, data2)
{
  ((data1[1]*data2[1])+(data1[2]*data2[2])+(data1[3]*data2[3]))/(sqrt(data1[1]^2 + data1[2]^2 + data1[3]^2)
    *sqrt(data2[1]^2 + data2[2]^2 + data2[3]^2))
}

cosine = c(cosine(x1, data1), cosine(x2, data1), cosine(x3, data1), cosine(x4, data1), cosine(x5, data1)) 


dt4Cos = as.matrix(cosine)
dt4Cos = as.data.frame(dt4Cos)
names(dt4Cos) = "cosine"

kable(cbind(dt4Manhat, dt4Eucl, dt4Mink, dt4Supr, dt4Cos))
manhattan euclidean minkowski supremum cosine
x1 0.74 0.5147815 0.4867141 0.48 0.9919753
x2 1.12 0.6949820 0.6110061 0.55 0.9850451
x3 0.74 0.5924525 0.5814392 0.58 0.9900603
x4 2.16 1.7575551 1.7248990 1.72 0.9436884
x5 0.86 0.4989990 0.4175175 0.32 0.9992686

Q4b

data4b = rbind(data4, newPoint)
data4b_Norm = minmaxNorm(trData = data4b, minV = 0, maxV = 1)
newpoint_Norm = as.matrix(data4b_Norm$trainData)["newPoint", ]
new_data4b = as.matrix(data4b_Norm$trainData)[1:5, ]

The new data point is x = (0.3888889, 0.2521739, 0.275) and the the data set is

new_data4b
##            A          B    C
## x1 0.5555556 0.04347826 0.00
## x2 1.0000000 0.08695652 0.75
## x3 0.4444444 0.00000000 0.00
## x4 0.0000000 1.00000000 0.50
## x5 0.6666667 0.39130435 1.00
dt4bEucl = as.data.frame(dist2(new_data4b, newpoint_Norm, "euclidean"))
names(dt4bEucl) = "euclidean"
kable(dt4bEucl)
euclidean
x1 0.3833493
x2 0.7914408
x3 0.3772308
x4 0.8724124
x5 0.7887603

Question 5

K-means Clustering

Q5a

#Prepare Data
sampe <- data.frame("Sample"= 1:8, "X1" = c(1,1,0,2,5,6,4,5), "X2" = c(4,3,4,5,1,2,0,2)
              , "Initial.Groups" = c(1,1,2,2,1,2,1,2))

#X1 on x-axis, X2 on y-axis
plot(sampe$X1, sampe$X2, xlab = "X1", ylab = "X2")

Q5b

#Plot of X1, X2 colored by the initial grouping
xyplot(X2~X1, sampe, groups=sampe$Initial.Groups)

Centroid 1 (2.75, 2)

Centroid 2 (3.25, 3.25)

Q5c

C1 = Centroid Group 1
C2 = Centroid Group 2

Sample Centroid Cluster (Euclidean Distance)
1 C2
2 C1
3 C2
4 C2
5 C1
6 C2
7 C1
8 C2

Q5d

sampe$First.Groups <- c(2,1,2,2,1,2,1,2) #Groupings from part c)

New Centroid 1 (3.3333333,1.3333333)

New Centroid 2 (2.8,3.4)

Sample New Centroid Clusters (Euclidean Distance)
1 C2
2 C2
3 C2
4 C2
5 C1
6 C1
7 C1
8 C1
sampe$Second.Groups <- c(2,2,2,2,1,1,1,1) #New Groups

New Centroid 1 (5,1.25)

New Centroid 2 (1,4)

Clusters remain the same, but centroids appear more centered and stable.

Sample New Centroid Clusters (Euclidean Distance)
1 C2
2 C2
3 C2
4 C2
5 C1
6 C1
7 C1
8 C1

Q5e

#Groups didn't change
groupings <- xyplot(X2~X1, sampe, groups=sampe$Second.Groups)
#Plot with Centroid Points Added
centers <- data.frame("CX"= c(5,1), "CY"=c(1.25,4))
layover <- xyplot(CY~CX, centers, col="black")
groupings + as.layer(layover)


The centroid for the pink cluster is overlaying one of the data points because they have the same coordinates.

Question 6

Hierarchical Clustering

Q6(a,b)

Manually Cluster

Manually Cluster

Q6c

Complete Linkage Dendrogram Clusters: (C1,C2)(C1,C3)(C4)
Single Linkage Dendrogram Clusters: (C1, C2, C5)(C4)(C3)