data(cars, package = "caret")

df <- cars

str(df)
## 'data.frame':    804 obs. of  18 variables:
##  $ Price      : num  22661 21725 29143 30732 33359 ...
##  $ Mileage    : int  20105 13457 31655 22479 17590 23635 17381 27558 25049 17319 ...
##  $ Cylinder   : int  6 6 4 4 4 4 4 4 4 4 ...
##  $ Doors      : int  4 2 2 2 2 2 2 2 2 4 ...
##  $ Cruise     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Sound      : int  0 1 1 0 1 0 1 0 0 0 ...
##  $ Leather    : int  0 0 1 0 1 0 1 1 0 1 ...
##  $ Buick      : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Cadillac   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Chevy      : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Pontiac    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Saab       : int  0 0 1 1 1 1 1 1 1 1 ...
##  $ Saturn     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ convertible: int  0 0 1 1 1 1 1 1 1 0 ...
##  $ coupe      : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ hatchback  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sedan      : int  1 0 0 0 0 0 0 0 0 1 ...
##  $ wagon      : int  0 0 0 0 0 0 0 0 0 0 ...

Correlation between the data

df %>% cor() %>% round(.,digits=ndig) %>% mykable()
Price Mileage Cylinder Doors Cruise Sound Leather Buick Cadillac Chevy Pontiac Saab Saturn convertible coupe hatchback sedan wagon
Price 1.00 -0.14 0.57 -0.14 0.43 -0.12 0.16 -0.02 0.66 -0.40 -0.14 0.34 -0.21 0.51 -0.17 -0.21 -0.03 0.05
Mileage -0.14 1.00 -0.03 -0.02 0.03 -0.03 0.00 0.02 -0.04 -0.02 -0.03 0.06 0.02 0.03 0.00 -0.03 -0.02 0.03
Cylinder 0.57 -0.03 1.00 0.00 0.35 -0.09 0.08 0.18 0.53 -0.16 0.11 -0.37 -0.19 0.06 -0.04 -0.06 0.18 -0.27
Doors -0.14 -0.02 0.00 1.00 -0.05 -0.06 -0.06 0.18 0.09 -0.15 0.04 -0.03 -0.06 -0.46 -0.83 0.16 0.69 0.16
Cruise 0.43 0.03 0.35 -0.05 1.00 -0.09 -0.07 0.19 0.19 -0.29 0.00 0.23 -0.20 0.15 -0.04 -0.26 0.13 -0.04
Sound -0.12 -0.03 -0.09 -0.06 -0.09 1.00 0.17 -0.01 -0.09 0.26 -0.07 -0.09 -0.14 -0.04 0.10 0.07 -0.02 -0.14
Leather 0.16 0.00 0.08 -0.06 -0.07 0.17 1.00 -0.21 0.21 0.16 -0.09 0.00 -0.15 0.01 0.06 0.09 -0.10 0.00
Buick -0.02 0.02 0.18 0.18 0.19 -0.01 -0.21 1.00 -0.11 -0.27 -0.16 -0.14 -0.09 -0.09 -0.15 -0.09 0.27 -0.10
Cadillac 0.66 -0.04 0.53 0.09 0.19 -0.09 0.21 -0.11 1.00 -0.27 -0.16 -0.14 -0.09 0.09 -0.15 -0.09 0.18 -0.10
Chevy -0.40 -0.02 -0.16 -0.15 -0.29 0.26 0.16 -0.27 -0.27 1.00 -0.39 -0.33 -0.23 -0.10 0.23 0.35 -0.18 -0.24
Pontiac -0.14 -0.03 0.11 0.04 0.00 -0.07 -0.09 -0.16 -0.16 -0.39 1.00 -0.19 -0.14 -0.12 0.03 -0.14 -0.01 0.21
Saab 0.34 0.06 -0.37 -0.03 0.23 -0.09 0.00 -0.14 -0.14 -0.33 -0.19 1.00 -0.12 0.34 -0.19 -0.12 -0.14 0.33
Saturn -0.21 0.02 -0.19 -0.06 -0.20 -0.14 -0.15 -0.09 -0.09 -0.23 -0.14 -0.12 1.00 -0.07 0.12 -0.08 0.03 -0.08
convertible 0.51 0.03 0.06 -0.46 0.15 -0.04 0.01 -0.09 0.09 -0.10 -0.12 0.34 -0.07 1.00 -0.12 -0.07 -0.32 -0.08
coupe -0.17 0.00 -0.04 -0.83 -0.04 0.10 0.06 -0.15 -0.15 0.23 0.03 -0.19 0.12 -0.12 1.00 -0.13 -0.57 -0.14
hatchback -0.21 -0.03 -0.06 0.16 -0.26 0.07 0.09 -0.09 -0.09 0.35 -0.14 -0.12 -0.08 -0.07 -0.13 1.00 -0.35 -0.08
sedan -0.03 -0.02 0.18 0.69 0.13 -0.02 -0.10 0.27 0.18 -0.18 -0.01 -0.14 0.03 -0.32 -0.57 -0.35 1.00 -0.37
wagon 0.05 0.03 -0.27 0.16 -0.04 -0.14 0.00 -0.10 -0.10 -0.24 0.21 0.33 -0.08 -0.08 -0.14 -0.08 -0.37 1.00
mx <- cor(df)
corr <- round(mx,ndig)
ggcorrplot(corr)

Transform the data

dftrans <- df %>% mutate(Pricel=log(Price),Mileagel=log(Mileage))
dftrans <- dftrans %>% mutate(PriceMile=Price/Mileage,PriceMilel=log(PriceMile))

The new data set transformed

summary(dftrans)
##      Price          Mileage         Cylinder         Doors      
##  Min.   : 8639   Min.   :  266   Min.   :4.000   Min.   :2.000  
##  1st Qu.:14273   1st Qu.:14624   1st Qu.:4.000   1st Qu.:4.000  
##  Median :18025   Median :20914   Median :6.000   Median :4.000  
##  Mean   :21343   Mean   :19832   Mean   :5.269   Mean   :3.527  
##  3rd Qu.:26717   3rd Qu.:25213   3rd Qu.:6.000   3rd Qu.:4.000  
##  Max.   :70755   Max.   :50387   Max.   :8.000   Max.   :4.000  
##      Cruise           Sound           Leather           Buick       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.7525   Mean   :0.6791   Mean   :0.7239   Mean   :0.0995  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     Cadillac          Chevy          Pontiac            Saab       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.0995   Mean   :0.398   Mean   :0.1866   Mean   :0.1418  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##      Saturn         convertible          coupe          hatchback      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.07463   Mean   :0.06219   Mean   :0.1741   Mean   :0.07463  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##      sedan            wagon            Pricel          Mileagel     
##  Min.   :0.0000   Min.   :0.0000   Min.   : 9.064   Min.   : 5.583  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 9.566   1st Qu.: 9.590  
##  Median :1.0000   Median :0.0000   Median : 9.800   Median : 9.948  
##  Mean   :0.6095   Mean   :0.0796   Mean   : 9.879   Mean   : 9.754  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:10.193   3rd Qu.:10.135  
##  Max.   :1.0000   Max.   :1.0000   Max.   :11.167   Max.   :10.827  
##    PriceMile          PriceMilel      
##  Min.   :  0.2484   Min.   :-1.39263  
##  1st Qu.:  0.6575   1st Qu.:-0.41933  
##  Median :  0.9588   Median :-0.04204  
##  Mean   :  1.9734   Mean   : 0.12533  
##  3rd Qu.:  1.6604   3rd Qu.: 0.50705  
##  Max.   :121.3644   Max.   : 4.79880

Pick only the right Variables

dft1 <- dftrans %>% select(Cylinder:Mileagel,PriceMilel)
# Scale Data
dft1s <- dft1 %>% scale()
dft1s <- as.data.frame(dft1s)

Check the new scaled data set

dft1s %>% summarise_all(list(mean,sd))

Questions to Solve: Assignment

For hierarchical Clustering try to explain how to characterize the groups build

set.seed(1350)

hc <- dft1s %>% dist(method = "euclidean") %>% hclust(method = "ward.D2")
fviz_dend(hc,k=15,rect=TRUE)

We now cut the tree and add the new labels to our data set

corte <- cutree(hc,k=15)
dfhc <- df %>% mutate(label=corte)
dfhc %>% group_by(label) %>% summarise(count=n())

We can now check the structure of the label

dfhc %>% head()

Lets use the mean from every group to see the most relevant parameters in each cluster

dfhc %>% group_by(label) %>% summarise_all(mean)
  • For group 1 we can see that the relevant parameters are { 6 cylinder, 4 door(sedans) with cruise and most of them are Buick } and the other parameters are just in the mean or below
  • For group 2 we can see that the relevant parameters are { 2 doors(coupes), above average cruise, sound and leather and most of them are Chevys } and the other parameters are just in the mean or below
  • For group 3 we can see that the relevant parameters are { above average Price, 4 cylinder and 2 doors(convertibles) with cruise and most of them are Saabs} and the other parameters are just in the mean or below
  • For group 4 we can see that the relevant parameters are { 4 cylinder, 4 doors(sedans) with cruise and most of them are Saabs} and the other parameters are just in the mean or below
  • For group 5 we can see that the relevant parameters are { 4 cylinder, 4 doors(wagons) with cruise and most of them with leather and Saabs} and the other parameters are just in the mean or below
  • For group 6 we can see that the relevant parameters are { below averge Price, 4 doors (wagons) and most of them are Pontiacs } and the other parameters are just in the mean or below
  • For group 7 we can see that the relevant parameters are { above average Price, 8 cylinders, with cruise and leather and 2 doors (convertibles) } and the other parameters are just in the mean or below
  • For group 8 we can see that the relevant parameters are { 6 cylinders, 2 doors(coupe), many with sound and leather, and most of them are Pontiacs } and the other parameters are just in the mean or below
  • For group 9 we can see that the relevant parameters are { above average Price, many of them more than 6 Cylinders, with cruise and leather, 4 doors(sedan) and most of them are Cadillacs } and the other parameters are just in the mean or below
  • For group 10 we can see that the relevant parameters are { 6 cylinders, most with cruise, 4 doors(sedan) and Pontiacs } and the other parameters are just in the mean or below
  • For group 11 we can see that the relevant parameters are { low mileage, more than 4 cylinders most, and also with cruise, sound, leather, 4 doors(sedan), some Chevys but most Pontiacs } and the other parameters are just in the mean or below
  • For group 12 we can see that the relevant parameters are { below average price, 4 doors(sedans) and most of them are Saturns } and the other parameters are just in the mean or below
  • For group 13 we can see that the relevant parameters are { below average price, above average cylinders, most with sound, leather and Chevys that are hatchback } and the other parameters are just in the mean or below
  • For group 14 we can see that the relevant parameters are { below average price, many with sound and leather, and most of them sedan Chevys } and the other parameters are just in the mean or below
  • For group 15 we can see that the relevant parameters are { below average price, Saturn coupes } and the other parameters are just in the mean or below

For the K-means Clustering repeat the same problem

set.seed(1350)

kmns1 <- kmeans(dft1s, 15, nstart = 20)

library(cluster)
library(factoextra)

sil <- silhouette(kmns1$cluster, dist(dft1s))
fviz_silhouette(sil)
##    cluster size ave.sil.width
## 1        1   88          0.34
## 2        2   69          0.58
## 3        3   28          0.21
## 4        4   34          0.63
## 5        5   82          0.40
## 6        6   49          0.50
## 7        7  122          0.37
## 8        8   58          0.23
## 9        9   78          0.53
## 10      10   20          0.33
## 11      11   59          0.46
## 12      12   30          0.59
## 13      13   28          0.04
## 14      14   30          0.44
## 15      15   29          0.24

fviz_cluster(kmns1, data = dft1s)

Lets use the mean from every group to see the most relevant parameters in each cluster

dfkm <- df %>% mutate(label=kmns1$cluster)

dfkm %>% group_by(label) %>% summarise_all(mean)
  • For group 1 we can see that the relevant parameters are { below average price, many with cruise, sound and leather, 2 doors(coupe) and most are Chevys } and the other parameters are just in the mean or below
  • For group 2 we can see that the relevant parameters are { above average price, above average cylinders, with cruise and leather, and most are sedan Cadillacs} and the other parameters are just in the mean or below
  • For group 3 we can see that the relevant parameters are { below average price, sedan Chevys } and the other parameters are just in the mean or below
  • For group 4 we can see that the relevant parameters are { with cruise and leather, wagon Saabs} and the other parameters are just in the mean or below
  • For group 5 we can see that the relevant parameters are { above average cylinders, with cruise, and most are Pontiac sedans} and the other parameters are just in the mean or below
  • For group 6 we can see that the relevant parameters are { with cruise, sedan Saabs } and the other parameters are just in the mean or below
  • For group 7 we can see that the relevant parameters are { below average price, most with sound and leather, sedan Chevys} and the other parameters are just in the mean or below
  • For group 8 we can see that the relevant parameters are { below average price, most Saturn sedans, some also coupes } and the other parameters are just in the mean or below
  • For group 9 we can see that the relevant parameters are { above average cylinders, with cruise, sound and most are Buick sedans } and the other parameters are just in the mean or below
  • For group 10 we can see that the relevant parameters are { above average price, above average cylinders, most with cruise, sound, leather, some are Cadillacs some are Chevys all convertibles } and the other parameters are just in the mean or below
  • For group 11 we can see that the relevant parameters are { below average price, many with sound and leather, most are hatchback Chevys } and the other parameters are just in the mean or below
  • For group 12 we can see that the relevant parameters are { above average price, many with cruise, most convertible Saabs} and the other parameters are just in the mean or below
  • For group 13 we can see that the relevant parameters are { below average mileage, most sedans Chevys and Pontiacs} and the other parameters are just in the mean or below
  • For group 14 we can see that the relevant parameters are { below average price, most are wagon Pontiacs } and the other parameters are just in the mean or below
  • For group 15 we can see that the relevant parameters are { above average cylinders, most with cruise, sound and leather, coupes some Chevys but most Pontiacs } and the other parameters are just in the mean or below

For the spectral Clustering repeat the process

set.seed(1350)

scM <- specc(dft1s %>% as.matrix(),centers=15)

labscM <- as.data.frame(factor(scM)) #

labelSp <- as.integer(labscM$`factor(scM)`)

fviz_cluster(list(data = dft1s, cluster = labelSp))+ggtitle("Spectral Clustering")

Lets use the mean from every group to see the most relevant parameters in each cluster

dfsp <- df %>% mutate(label=labelSp)

dfsp %>% group_by(label) %>% summarise_all(mean)

Group Characterization

  • For group 1 we can see that the relevant parameters are { most are Saturn coupes } and the other parameters are just in the mean or below
  • For group 2 we can see that the relevant parameters are { Pontiacs wagons } and the other parameters are just in the mean or below
  • For group 3 we can see that the relevant parameters are { above average price, with cruise, Saab convertibles } and the other parameters are just in the mean or below
  • For group 4 we can see that the relevant parameters are { above average cylinders, many with cruise, sound and leather, most of them coupe Pontiacs } and the other parameters are just in the mean or below
  • For group 5 we can see that the relevant parameters are { above average price, most with 8 cylinders, with cruise and leather, most are sedan Chevys } and the other parameters are just in the mean or below
  • For group 6 we can see that the relevant parameters are { below average price, most sedan Chevys } and the other parameters are just in the mean or below
  • For group 7 we can see that the relevant parameters are { below average price, coupe chevys } and the other parameters are just in the mean or below
  • For group 8 we can see that the relevant parameters are { Saab sedans and wagons, with cruise } and the other parameters are just in the mean or below
  • For group 9 we can see that the relevant parameters are { below average price, hatchbag Chevys } and the other parameters are just in the mean or below
  • For group 10 we can see that the relevant parameters are { above average cylinders, above average cylinders, with cruise, most Pontiac sedans } and the other parameters are just in the mean or below
  • For group 11 we can see that the relevant parameters are { below average mileage, sedan Chevys } and the other parameters are just in the mean or below
  • For group 12 we can see that the relevant parameters are { above average cylinders , sedan Cadillacs } and the other parameters are just in the mean or below
  • For group 13 we can see that the relevant parameters are { most with 6 cylinders , sedan Buicks} and the other parameters are just in the mean or below
  • For group 14 we can see that the relevant parameters are { above average cylinders, with cruise, sedan Chevys } and the other parameters are just in the mean or below
  • For group 15 we can see that the relevant parameters are { below average price, sedan Saturns } and the other parameters are just in the mean or below

For density-based clustering repeat the process

dbM1 <- dbscan(dft1s, eps = 2.5, MinPts = 5)
## Warning in dbscan(dft1s, eps = 2.5, MinPts = 5): converting argument MinPts
## (fpc) to minPts (dbscan)!
fviz_cluster(dbM1, data = dft1s, stand = FALSE,
             ellipse = FALSE, show.clust.cent = FALSE,
             geom = "point",palette = "jco", ggtheme = theme_bw())
## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 15.
## Warning: Removed 278 rows containing missing values (geom_point).

dfdb <- df %>% mutate(label=dbM1$cluster)

dfdb %>% group_by(label) %>% summarise_all(mean)
  • For group 0 (others) we can see that the relevant parameters are { below average mileage, above average cylinders, most sedan Chevys } and the other parameters are just in the mean or below
  • For group 1 we can see that the relevant parameters are { above average cylinders, with cruise, most are sedan Buicks } and the other parameters are just in the mean or below
  • For group 2 we can see that the relevant parameters are { coupe chevys} and the other parameters are just in the mean or below
  • For group 3 we can see that the relevant parameters are { above average price, with cruise, Saab convertibles } and the other parameters are just in the mean or below
  • For group 4 we can see that the relevant parameters are { with cruise, Saab sedans } and the other parameters are just in the mean or below
  • For group 5 we can see that the relevant parameters are { with cruise and leather, Saab wagons } and the other parameters are just in the mean or below
  • For group 6 we can see that the relevant parameters are { Pontiac wagons} and the other parameters are just in the mean or below
  • For group 7 we can see that the relevant parameters are { above average price, most with 8 cylinders, with cruise and leather, Chevy convertibles } and the other parameters are just in the mean or below
  • For group 8 we can see that the relevant parameters are { most 6 clylinders with leather, Pontiac coupes } and the other parameters are just in the mean or below
  • For group 9 we can see that the relevant parameters are { above average cylinder with cruise and leather, Cadillac sedans } and the other parameters are just in the mean or below
  • For group 10 we can see that the relevant parameters are { most 6 clylinders with cruise, Pontiac sedans } and the other parameters are just in the mean or below
  • For group 11 we can see that the relevant parameters are { above average price, most with 8 cylinders, cruise, sound and leather, Cadillac convertibles } and the other parameters are just in the mean or below
  • For group 12 we can see that the relevant parameters are { Saturn sedans } and the other parameters are just in the mean or below
  • For group 13 we can see that the relevant parameters are { Chevy hatchbacks } and the other parameters are just in the mean or below
  • For group 14 we can see that the relevant parameters are { Chevy sedans } and the other parameters are just in the mean or below
  • For group 15 we can see that the relevant parameters are { Saturn coupes } and the other parameters are just in the mean or below

Questions to Solve: Project

Take the results for the spectral Clustering with 2 groups

Test some classification technique to predict this label

DEEP LEARNING WITH 2 GROUPS

SHOW THE SPECTRAL CLUSTERING MODEL WITH 15 GROUPS

scM1 <- specc(dft1s %>% as.matrix(),centers=2)

labscM1 <- as.data.frame(factor(scM1)) 

labelsCM1 <- as.integer(labscM1$`factor(scM1)`)

fviz_cluster(list(data = dft1s, cluster = labelsCM1))+ggtitle("Spectral Clustering")

LETS USE DEEP LEARNING TO CLASSIFY THE LABELS

# We set the label to -1 so the numbers start at 0 for the model
labelDL <- as.integer(labscM1$`factor(scM1)`)-1

df2 <- dft1s %>% mutate(group = labelDL)

ixTrain <- createDataPartition(df2$group,p=0.6,list=FALSE)
training <- df2[ixTrain,]
testing <- df2[-ixTrain,]

xtrain <- training %>% dplyr::select(-group)
ylabtrain <- training %>% dplyr::select(group)

xtest <- testing %>% dplyr::select(-group)
ylabtest <- testing %>% dplyr::select(group)

xtrainM <- as.matrix(xtrain)
ylabtrainM <- as.matrix(ylabtrain)

xtestM <- as.matrix(xtest)
ylabtestM <- as.matrix(ylabtest)
mod2 <-  keras_model_sequential()
mod2 %>% layer_dense(units = 8, activation = 'relu', input_shape = c(19)) %>% 
        layer_dense(units = 3, activation = 'softmax')

mod2 %>% compile(
     loss = 'sparse_categorical_crossentropy',
     optimizer = 'adam',
     metrics = 'accuracy'
)

history <- mod2 %>% fit(
     xtrainM, 
     ylabtrainM, 
     epochs = 20, 
     batch_size = 10, 
     validation_split = 0.2
)
print(history)
## Trained on 386 samples (batch_size=10, epochs=20)
## Final epoch (plot to see history):
##         loss: 0.0159
##     accuracy: 1
##     val_loss: 0.01461
## val_accuracy: 1
plot(history)
## `geom_smooth()` using formula 'y ~ x'

perf <- mod2 %>% evaluate(xtestM,ylabtestM)
perf
## $loss
## [1] 0.01693813
## 
## $accuracy
## [1] 1
dim <- length(ylabtestM)
class <- mod2 %>% predict_classes(xtestM[1:dim,])

real <- ylabtestM

confmat <- confusionMatrix(as.factor(class),as.factor(real))

dtmodv1 <- confmat$table
dtmodv1 %>% kable(caption = "Confusion Matrix") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% 
  column_spec(1, bold = T,color = "white", background = "#ff6666") %>% 
  row_spec(0, bold = T,color = "white", background = "#ff4a4a") %>% 
  add_header_above(c(" "=1, "PREDICTED - REAL" = 2))
Confusion Matrix
PREDICTED - REAL
0 1
0 27 0
1 0 294

DEEP LEARNING WITH DBMS WITH 15 GROUPS

SHOW THE DBMS MODEL WITH 15 GROUPS

dbM2 <- dbscan(dft1s, eps = 2.5, MinPts = 5)
## Warning in dbscan(dft1s, eps = 2.5, MinPts = 5): converting argument MinPts
## (fpc) to minPts (dbscan)!
fviz_cluster(dbM2, data = dft1s, stand = FALSE,
             ellipse = FALSE, show.clust.cent = FALSE,
             geom = "point",palette = "jco", ggtheme = theme_bw())
## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 15.
## Warning: Removed 278 rows containing missing values (geom_point).

LETS USE DEEP LEARNING TO CLASSIFY THE LABELS

labelDBMS <- dbM2$cluster

df3 <- dft1s %>% mutate(group = labelDBMS)

ixTrain <- createDataPartition(df3$group,p=0.6,list=FALSE)
training <- df3[ixTrain,]
testing <- df3[-ixTrain,]

xtrain <- training %>% dplyr::select(-group)
ylabtrain <- training %>% dplyr::select(group)

xtest <- testing %>% dplyr::select(-group)
ylabtest <- testing %>% dplyr::select(group)

xtrainM <- as.matrix(xtrain)
ylabtrainM <- as.matrix(ylabtrain)

xtestM <- as.matrix(xtest)
ylabtestM <- as.matrix(ylabtest)
mod2 <-  keras_model_sequential()
mod2 %>% layer_dense(units = 32, activation = 'relu', input_shape = c(19)) %>% 
        layer_dense(units = 16, activation = 'softmax')

mod2 %>% compile(
     loss = 'sparse_categorical_crossentropy',
     optimizer = 'adam',
     metrics = 'accuracy'
)

history <- mod2 %>% fit(
     xtrainM, 
     ylabtrainM, 
     epochs = 20, 
     batch_size = 10, 
     validation_split = 0.2
)
print(history)
## Trained on 387 samples (batch_size=10, epochs=20)
## Final epoch (plot to see history):
##         loss: 0.05427
##     accuracy: 0.9948
##     val_loss: 0.1755
## val_accuracy: 0.9691
plot(history)
## `geom_smooth()` using formula 'y ~ x'

perf <- mod2 %>% evaluate(xtestM,ylabtestM)
perf
## $loss
## [1] 0.09083202
## 
## $accuracy
## [1] 0.978125
dim <- length(ylabtestM)
class <- mod2 %>% predict_classes(xtestM[1:dim,])

real <- ylabtestM

confmat <- confusionMatrix(as.factor(class),as.factor(real))

dtmodv1 <- confmat$table
dtmodv1 %>% kable(caption = "Confusion Matrix") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% 
  column_spec(1, bold = T,color = "white", background = "#ff6666") %>% 
  row_spec(0, bold = T,color = "white", background = "#ff4a4a") %>% 
  add_header_above(c(" "=1, "PREDICTED  - REAL" = 16))
Confusion Matrix
PREDICTED - REAL
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 4 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 1 0 40 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0
4 1 0 0 0 22 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 13 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 9 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0
8 1 0 0 0 0 0 0 0 11 0 0 0 0 0 0 0
9 2 0 0 0 0 0 0 0 0 27 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 34 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 0 0 15 0 0 0
13 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 0
14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 62 0
15 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7