data(cars, package = "caret")

df <- cars

str(df)

## 'data.frame':    804 obs. of  18 variables:
##  $ Price      : num  22661 21725 29143 30732 33359 ...
##  $ Mileage    : int  20105 13457 31655 22479 17590 23635 17381 27558 25049 17319 ...
##  $ Cylinder   : int  6 6 4 4 4 4 4 4 4 4 ...
##  $ Doors      : int  4 2 2 2 2 2 2 2 2 4 ...
##  $ Cruise     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Sound      : int  0 1 1 0 1 0 1 0 0 0 ...
##  $ Leather    : int  0 0 1 0 1 0 1 1 0 1 ...
##  $ Buick      : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Cadillac   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Chevy      : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Pontiac    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Saab       : int  0 0 1 1 1 1 1 1 1 1 ...
##  $ Saturn     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ convertible: int  0 0 1 1 1 1 1 1 1 0 ...
##  $ coupe      : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ hatchback  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sedan      : int  1 0 0 0 0 0 0 0 0 1 ...
##  $ wagon      : int  0 0 0 0 0 0 0 0 0 0 ...

Correlation between the data

df %>% cor() %>% round(.,digits=ndig) %>% mykable()

	Price	Mileage	Cylinder	Doors	Cruise	Sound	Leather	Buick	Cadillac	Chevy	Pontiac	Saab	Saturn	convertible	coupe	hatchback	sedan	wagon
Price	1.00	-0.14	0.57	-0.14	0.43	-0.12	0.16	-0.02	0.66	-0.40	-0.14	0.34	-0.21	0.51	-0.17	-0.21	-0.03	0.05
Mileage	-0.14	1.00	-0.03	-0.02	0.03	-0.03	0.00	0.02	-0.04	-0.02	-0.03	0.06	0.02	0.03	0.00	-0.03	-0.02	0.03
Cylinder	0.57	-0.03	1.00	0.00	0.35	-0.09	0.08	0.18	0.53	-0.16	0.11	-0.37	-0.19	0.06	-0.04	-0.06	0.18	-0.27
Doors	-0.14	-0.02	0.00	1.00	-0.05	-0.06	-0.06	0.18	0.09	-0.15	0.04	-0.03	-0.06	-0.46	-0.83	0.16	0.69	0.16
Cruise	0.43	0.03	0.35	-0.05	1.00	-0.09	-0.07	0.19	0.19	-0.29	0.00	0.23	-0.20	0.15	-0.04	-0.26	0.13	-0.04
Sound	-0.12	-0.03	-0.09	-0.06	-0.09	1.00	0.17	-0.01	-0.09	0.26	-0.07	-0.09	-0.14	-0.04	0.10	0.07	-0.02	-0.14
Leather	0.16	0.00	0.08	-0.06	-0.07	0.17	1.00	-0.21	0.21	0.16	-0.09	0.00	-0.15	0.01	0.06	0.09	-0.10	0.00
Buick	-0.02	0.02	0.18	0.18	0.19	-0.01	-0.21	1.00	-0.11	-0.27	-0.16	-0.14	-0.09	-0.09	-0.15	-0.09	0.27	-0.10
Cadillac	0.66	-0.04	0.53	0.09	0.19	-0.09	0.21	-0.11	1.00	-0.27	-0.16	-0.14	-0.09	0.09	-0.15	-0.09	0.18	-0.10
Chevy	-0.40	-0.02	-0.16	-0.15	-0.29	0.26	0.16	-0.27	-0.27	1.00	-0.39	-0.33	-0.23	-0.10	0.23	0.35	-0.18	-0.24
Pontiac	-0.14	-0.03	0.11	0.04	0.00	-0.07	-0.09	-0.16	-0.16	-0.39	1.00	-0.19	-0.14	-0.12	0.03	-0.14	-0.01	0.21
Saab	0.34	0.06	-0.37	-0.03	0.23	-0.09	0.00	-0.14	-0.14	-0.33	-0.19	1.00	-0.12	0.34	-0.19	-0.12	-0.14	0.33
Saturn	-0.21	0.02	-0.19	-0.06	-0.20	-0.14	-0.15	-0.09	-0.09	-0.23	-0.14	-0.12	1.00	-0.07	0.12	-0.08	0.03	-0.08
convertible	0.51	0.03	0.06	-0.46	0.15	-0.04	0.01	-0.09	0.09	-0.10	-0.12	0.34	-0.07	1.00	-0.12	-0.07	-0.32	-0.08
coupe	-0.17	0.00	-0.04	-0.83	-0.04	0.10	0.06	-0.15	-0.15	0.23	0.03	-0.19	0.12	-0.12	1.00	-0.13	-0.57	-0.14
hatchback	-0.21	-0.03	-0.06	0.16	-0.26	0.07	0.09	-0.09	-0.09	0.35	-0.14	-0.12	-0.08	-0.07	-0.13	1.00	-0.35	-0.08
sedan	-0.03	-0.02	0.18	0.69	0.13	-0.02	-0.10	0.27	0.18	-0.18	-0.01	-0.14	0.03	-0.32	-0.57	-0.35	1.00	-0.37
wagon	0.05	0.03	-0.27	0.16	-0.04	-0.14	0.00	-0.10	-0.10	-0.24	0.21	0.33	-0.08	-0.08	-0.14	-0.08	-0.37	1.00

mx <- cor(df)
corr <- round(mx,ndig)
ggcorrplot(corr)

Transform the data

dftrans <- df %>% mutate(Pricel=log(Price),Mileagel=log(Mileage))
dftrans <- dftrans %>% mutate(PriceMile=Price/Mileage,PriceMilel=log(PriceMile))

The new data set transformed

summary(dftrans)

##      Price          Mileage         Cylinder         Doors      
##  Min.   : 8639   Min.   :  266   Min.   :4.000   Min.   :2.000  
##  1st Qu.:14273   1st Qu.:14624   1st Qu.:4.000   1st Qu.:4.000  
##  Median :18025   Median :20914   Median :6.000   Median :4.000  
##  Mean   :21343   Mean   :19832   Mean   :5.269   Mean   :3.527  
##  3rd Qu.:26717   3rd Qu.:25213   3rd Qu.:6.000   3rd Qu.:4.000  
##  Max.   :70755   Max.   :50387   Max.   :8.000   Max.   :4.000  
##      Cruise           Sound           Leather           Buick       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.7525   Mean   :0.6791   Mean   :0.7239   Mean   :0.0995  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     Cadillac          Chevy          Pontiac            Saab       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.0995   Mean   :0.398   Mean   :0.1866   Mean   :0.1418  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##      Saturn         convertible          coupe          hatchback      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.07463   Mean   :0.06219   Mean   :0.1741   Mean   :0.07463  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##      sedan            wagon            Pricel          Mileagel     
##  Min.   :0.0000   Min.   :0.0000   Min.   : 9.064   Min.   : 5.583  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 9.566   1st Qu.: 9.590  
##  Median :1.0000   Median :0.0000   Median : 9.800   Median : 9.948  
##  Mean   :0.6095   Mean   :0.0796   Mean   : 9.879   Mean   : 9.754  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:10.193   3rd Qu.:10.135  
##  Max.   :1.0000   Max.   :1.0000   Max.   :11.167   Max.   :10.827  
##    PriceMile          PriceMilel      
##  Min.   :  0.2484   Min.   :-1.39263  
##  1st Qu.:  0.6575   1st Qu.:-0.41933  
##  Median :  0.9588   Median :-0.04204  
##  Mean   :  1.9734   Mean   : 0.12533  
##  3rd Qu.:  1.6604   3rd Qu.: 0.50705  
##  Max.   :121.3644   Max.   : 4.79880

Pick only the right Variables

dft1 <- dftrans %>% select(Cylinder:Mileagel,PriceMilel)
# Scale Data
dft1s <- dft1 %>% scale()
dft1s <- as.data.frame(dft1s)

Check the new scaled data set

dft1s %>% summarise_all(list(mean,sd))

Questions to Solve: Assignment

For hierarchical Clustering try to explain how to characterize the groups build

set.seed(1350)

hc <- dft1s %>% dist(method = "euclidean") %>% hclust(method = "ward.D2")
fviz_dend(hc,k=15,rect=TRUE)

We now cut the tree and add the new labels to our data set

corte <- cutree(hc,k=15)
dfhc <- df %>% mutate(label=corte)
dfhc %>% group_by(label) %>% summarise(count=n())

We can now check the structure of the label

dfhc %>% head()

Lets use the mean from every group to see the most relevant parameters in each cluster

dfhc %>% group_by(label) %>% summarise_all(mean)

For group 1 we can see that the relevant parameters are { 6 cylinder, 4 door(sedans) with cruise and most of them are Buick } and the other parameters are just in the mean or below
For group 2 we can see that the relevant parameters are { 2 doors(coupes), above average cruise, sound and leather and most of them are Chevys } and the other parameters are just in the mean or below
For group 3 we can see that the relevant parameters are { above average Price, 4 cylinder and 2 doors(convertibles) with cruise and most of them are Saabs} and the other parameters are just in the mean or below
For group 4 we can see that the relevant parameters are { 4 cylinder, 4 doors(sedans) with cruise and most of them are Saabs} and the other parameters are just in the mean or below
For group 5 we can see that the relevant parameters are { 4 cylinder, 4 doors(wagons) with cruise and most of them with leather and Saabs} and the other parameters are just in the mean or below
For group 6 we can see that the relevant parameters are { below averge Price, 4 doors (wagons) and most of them are Pontiacs } and the other parameters are just in the mean or below
For group 7 we can see that the relevant parameters are { above average Price, 8 cylinders, with cruise and leather and 2 doors (convertibles) } and the other parameters are just in the mean or below
For group 8 we can see that the relevant parameters are { 6 cylinders, 2 doors(coupe), many with sound and leather, and most of them are Pontiacs } and the other parameters are just in the mean or below
For group 9 we can see that the relevant parameters are { above average Price, many of them more than 6 Cylinders, with cruise and leather, 4 doors(sedan) and most of them are Cadillacs } and the other parameters are just in the mean or below
For group 10 we can see that the relevant parameters are { 6 cylinders, most with cruise, 4 doors(sedan) and Pontiacs } and the other parameters are just in the mean or below
For group 11 we can see that the relevant parameters are { low mileage, more than 4 cylinders most, and also with cruise, sound, leather, 4 doors(sedan), some Chevys but most Pontiacs } and the other parameters are just in the mean or below
For group 12 we can see that the relevant parameters are { below average price, 4 doors(sedans) and most of them are Saturns } and the other parameters are just in the mean or below
For group 13 we can see that the relevant parameters are { below average price, above average cylinders, most with sound, leather and Chevys that are hatchback } and the other parameters are just in the mean or below
For group 14 we can see that the relevant parameters are { below average price, many with sound and leather, and most of them sedan Chevys } and the other parameters are just in the mean or below
For group 15 we can see that the relevant parameters are { below average price, Saturn coupes } and the other parameters are just in the mean or below

For the K-means Clustering repeat the same problem

set.seed(1350)

kmns1 <- kmeans(dft1s, 15, nstart = 20)

library(cluster)
library(factoextra)

sil <- silhouette(kmns1$cluster, dist(dft1s))
fviz_silhouette(sil)

##    cluster size ave.sil.width
## 1        1   88          0.34
## 2        2   69          0.58
## 3        3   28          0.21
## 4        4   34          0.63
## 5        5   82          0.40
## 6        6   49          0.50
## 7        7  122          0.37
## 8        8   58          0.23
## 9        9   78          0.53
## 10      10   20          0.33
## 11      11   59          0.46
## 12      12   30          0.59
## 13      13   28          0.04
## 14      14   30          0.44
## 15      15   29          0.24

fviz_cluster(kmns1, data = dft1s)

Lets use the mean from every group to see the most relevant parameters in each cluster

dfkm <- df %>% mutate(label=kmns1$cluster)

dfkm %>% group_by(label) %>% summarise_all(mean)

For group 1 we can see that the relevant parameters are { below average price, many with cruise, sound and leather, 2 doors(coupe) and most are Chevys } and the other parameters are just in the mean or below
For group 2 we can see that the relevant parameters are { above average price, above average cylinders, with cruise and leather, and most are sedan Cadillacs} and the other parameters are just in the mean or below
For group 3 we can see that the relevant parameters are { below average price, sedan Chevys } and the other parameters are just in the mean or below
For group 4 we can see that the relevant parameters are { with cruise and leather, wagon Saabs} and the other parameters are just in the mean or below
For group 5 we can see that the relevant parameters are { above average cylinders, with cruise, and most are Pontiac sedans} and the other parameters are just in the mean or below
For group 6 we can see that the relevant parameters are { with cruise, sedan Saabs } and the other parameters are just in the mean or below
For group 7 we can see that the relevant parameters are { below average price, most with sound and leather, sedan Chevys} and the other parameters are just in the mean or below
For group 8 we can see that the relevant parameters are { below average price, most Saturn sedans, some also coupes } and the other parameters are just in the mean or below
For group 9 we can see that the relevant parameters are { above average cylinders, with cruise, sound and most are Buick sedans } and the other parameters are just in the mean or below
For group 10 we can see that the relevant parameters are { above average price, above average cylinders, most with cruise, sound, leather, some are Cadillacs some are Chevys all convertibles } and the other parameters are just in the mean or below
For group 11 we can see that the relevant parameters are { below average price, many with sound and leather, most are hatchback Chevys } and the other parameters are just in the mean or below
For group 12 we can see that the relevant parameters are { above average price, many with cruise, most convertible Saabs} and the other parameters are just in the mean or below
For group 13 we can see that the relevant parameters are { below average mileage, most sedans Chevys and Pontiacs} and the other parameters are just in the mean or below
For group 14 we can see that the relevant parameters are { below average price, most are wagon Pontiacs } and the other parameters are just in the mean or below
For group 15 we can see that the relevant parameters are { above average cylinders, most with cruise, sound and leather, coupes some Chevys but most Pontiacs } and the other parameters are just in the mean or below

For the spectral Clustering repeat the process

set.seed(1350)

scM <- specc(dft1s %>% as.matrix(),centers=15)

labscM <- as.data.frame(factor(scM)) #

labelSp <- as.integer(labscM$`factor(scM)`)

fviz_cluster(list(data = dft1s, cluster = labelSp))+ggtitle("Spectral Clustering")

Lets use the mean from every group to see the most relevant parameters in each cluster

dfsp <- df %>% mutate(label=labelSp)

dfsp %>% group_by(label) %>% summarise_all(mean)

Group Characterization

For group 1 we can see that the relevant parameters are { most are Saturn coupes } and the other parameters are just in the mean or below
For group 2 we can see that the relevant parameters are { Pontiacs wagons } and the other parameters are just in the mean or below
For group 3 we can see that the relevant parameters are { above average price, with cruise, Saab convertibles } and the other parameters are just in the mean or below
For group 4 we can see that the relevant parameters are { above average cylinders, many with cruise, sound and leather, most of them coupe Pontiacs } and the other parameters are just in the mean or below
For group 5 we can see that the relevant parameters are { above average price, most with 8 cylinders, with cruise and leather, most are sedan Chevys } and the other parameters are just in the mean or below
For group 6 we can see that the relevant parameters are { below average price, most sedan Chevys } and the other parameters are just in the mean or below
For group 7 we can see that the relevant parameters are { below average price, coupe chevys } and the other parameters are just in the mean or below
For group 8 we can see that the relevant parameters are { Saab sedans and wagons, with cruise } and the other parameters are just in the mean or below
For group 9 we can see that the relevant parameters are { below average price, hatchbag Chevys } and the other parameters are just in the mean or below
For group 10 we can see that the relevant parameters are { above average cylinders, above average cylinders, with cruise, most Pontiac sedans } and the other parameters are just in the mean or below
For group 11 we can see that the relevant parameters are { below average mileage, sedan Chevys } and the other parameters are just in the mean or below
For group 12 we can see that the relevant parameters are { above average cylinders , sedan Cadillacs } and the other parameters are just in the mean or below
For group 13 we can see that the relevant parameters are { most with 6 cylinders , sedan Buicks} and the other parameters are just in the mean or below
For group 14 we can see that the relevant parameters are { above average cylinders, with cruise, sedan Chevys } and the other parameters are just in the mean or below
For group 15 we can see that the relevant parameters are { below average price, sedan Saturns } and the other parameters are just in the mean or below

For density-based clustering repeat the process

dbM1 <- dbscan(dft1s, eps = 2.5, MinPts = 5)

## Warning in dbscan(dft1s, eps = 2.5, MinPts = 5): converting argument MinPts
## (fpc) to minPts (dbscan)!

fviz_cluster(dbM1, data = dft1s, stand = FALSE,
             ellipse = FALSE, show.clust.cent = FALSE,
             geom = "point",palette = "jco", ggtheme = theme_bw())

## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 15.

## Warning: Removed 278 rows containing missing values (geom_point).

dfdb <- df %>% mutate(label=dbM1$cluster)

dfdb %>% group_by(label) %>% summarise_all(mean)

For group 0 (others) we can see that the relevant parameters are { below average mileage, above average cylinders, most sedan Chevys } and the other parameters are just in the mean or below
For group 1 we can see that the relevant parameters are { above average cylinders, with cruise, most are sedan Buicks } and the other parameters are just in the mean or below
For group 2 we can see that the relevant parameters are { coupe chevys} and the other parameters are just in the mean or below
For group 3 we can see that the relevant parameters are { above average price, with cruise, Saab convertibles } and the other parameters are just in the mean or below
For group 4 we can see that the relevant parameters are { with cruise, Saab sedans } and the other parameters are just in the mean or below
For group 5 we can see that the relevant parameters are { with cruise and leather, Saab wagons } and the other parameters are just in the mean or below
For group 6 we can see that the relevant parameters are { Pontiac wagons} and the other parameters are just in the mean or below
For group 7 we can see that the relevant parameters are { above average price, most with 8 cylinders, with cruise and leather, Chevy convertibles } and the other parameters are just in the mean or below
For group 8 we can see that the relevant parameters are { most 6 clylinders with leather, Pontiac coupes } and the other parameters are just in the mean or below
For group 9 we can see that the relevant parameters are { above average cylinder with cruise and leather, Cadillac sedans } and the other parameters are just in the mean or below
For group 10 we can see that the relevant parameters are { most 6 clylinders with cruise, Pontiac sedans } and the other parameters are just in the mean or below
For group 11 we can see that the relevant parameters are { above average price, most with 8 cylinders, cruise, sound and leather, Cadillac convertibles } and the other parameters are just in the mean or below
For group 12 we can see that the relevant parameters are { Saturn sedans } and the other parameters are just in the mean or below
For group 13 we can see that the relevant parameters are { Chevy hatchbacks } and the other parameters are just in the mean or below
For group 14 we can see that the relevant parameters are { Chevy sedans } and the other parameters are just in the mean or below
For group 15 we can see that the relevant parameters are { Saturn coupes } and the other parameters are just in the mean or below

Questions to Solve: Project

Take the results for the spectral Clustering with 2 groups

Test some classification technique to predict this label

DEEP LEARNING WITH 2 GROUPS

SHOW THE SPECTRAL CLUSTERING MODEL WITH 15 GROUPS

scM1 <- specc(dft1s %>% as.matrix(),centers=2)

labscM1 <- as.data.frame(factor(scM1)) 

labelsCM1 <- as.integer(labscM1$`factor(scM1)`)

fviz_cluster(list(data = dft1s, cluster = labelsCM1))+ggtitle("Spectral Clustering")

LETS USE DEEP LEARNING TO CLASSIFY THE LABELS

# We set the label to -1 so the numbers start at 0 for the model
labelDL <- as.integer(labscM1$`factor(scM1)`)-1

df2 <- dft1s %>% mutate(group = labelDL)

ixTrain <- createDataPartition(df2$group,p=0.6,list=FALSE)
training <- df2[ixTrain,]
testing <- df2[-ixTrain,]

xtrain <- training %>% dplyr::select(-group)
ylabtrain <- training %>% dplyr::select(group)

xtest <- testing %>% dplyr::select(-group)
ylabtest <- testing %>% dplyr::select(group)

xtrainM <- as.matrix(xtrain)
ylabtrainM <- as.matrix(ylabtrain)

xtestM <- as.matrix(xtest)
ylabtestM <- as.matrix(ylabtest)

mod2 <-  keras_model_sequential()
mod2 %>% layer_dense(units = 8, activation = 'relu', input_shape = c(19)) %>% 
        layer_dense(units = 3, activation = 'softmax')

mod2 %>% compile(
     loss = 'sparse_categorical_crossentropy',
     optimizer = 'adam',
     metrics = 'accuracy'
)

history <- mod2 %>% fit(
     xtrainM, 
     ylabtrainM, 
     epochs = 20, 
     batch_size = 10, 
     validation_split = 0.2
)

print(history)

## Trained on 386 samples (batch_size=10, epochs=20)
## Final epoch (plot to see history):
##         loss: 0.0159
##     accuracy: 1
##     val_loss: 0.01461
## val_accuracy: 1

plot(history)

## `geom_smooth()` using formula 'y ~ x'

perf <- mod2 %>% evaluate(xtestM,ylabtestM)
perf

## $loss
## [1] 0.01693813
## 
## $accuracy
## [1] 1

dim <- length(ylabtestM)
class <- mod2 %>% predict_classes(xtestM[1:dim,])

real <- ylabtestM

confmat <- confusionMatrix(as.factor(class),as.factor(real))

dtmodv1 <- confmat$table
dtmodv1 %>% kable(caption = "Confusion Matrix") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% 
  column_spec(1, bold = T,color = "white", background = "#ff6666") %>% 
  row_spec(0, bold = T,color = "white", background = "#ff4a4a") %>% 
  add_header_above(c(" "=1, "PREDICTED - REAL" = 2))

Confusion Matrix
	PREDICTED - REAL
	0	1
0	27	0
1	0	294

DEEP LEARNING WITH DBMS WITH 15 GROUPS

SHOW THE DBMS MODEL WITH 15 GROUPS

dbM2 <- dbscan(dft1s, eps = 2.5, MinPts = 5)

## Warning in dbscan(dft1s, eps = 2.5, MinPts = 5): converting argument MinPts
## (fpc) to minPts (dbscan)!

fviz_cluster(dbM2, data = dft1s, stand = FALSE,
             ellipse = FALSE, show.clust.cent = FALSE,
             geom = "point",palette = "jco", ggtheme = theme_bw())

## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 15.

## Warning: Removed 278 rows containing missing values (geom_point).

LETS USE DEEP LEARNING TO CLASSIFY THE LABELS

labelDBMS <- dbM2$cluster

df3 <- dft1s %>% mutate(group = labelDBMS)

ixTrain <- createDataPartition(df3$group,p=0.6,list=FALSE)
training <- df3[ixTrain,]
testing <- df3[-ixTrain,]

xtrain <- training %>% dplyr::select(-group)
ylabtrain <- training %>% dplyr::select(group)

xtest <- testing %>% dplyr::select(-group)
ylabtest <- testing %>% dplyr::select(group)

xtrainM <- as.matrix(xtrain)
ylabtrainM <- as.matrix(ylabtrain)

xtestM <- as.matrix(xtest)
ylabtestM <- as.matrix(ylabtest)

mod2 <-  keras_model_sequential()
mod2 %>% layer_dense(units = 32, activation = 'relu', input_shape = c(19)) %>% 
        layer_dense(units = 16, activation = 'softmax')

mod2 %>% compile(
     loss = 'sparse_categorical_crossentropy',
     optimizer = 'adam',
     metrics = 'accuracy'
)

history <- mod2 %>% fit(
     xtrainM, 
     ylabtrainM, 
     epochs = 20, 
     batch_size = 10, 
     validation_split = 0.2
)

print(history)

## Trained on 387 samples (batch_size=10, epochs=20)
## Final epoch (plot to see history):
##         loss: 0.05427
##     accuracy: 0.9948
##     val_loss: 0.1755
## val_accuracy: 0.9691

plot(history)

## `geom_smooth()` using formula 'y ~ x'

perf <- mod2 %>% evaluate(xtestM,ylabtestM)
perf

## $loss
## [1] 0.09083202
## 
## $accuracy
## [1] 0.978125

dim <- length(ylabtestM)
class <- mod2 %>% predict_classes(xtestM[1:dim,])

real <- ylabtestM

confmat <- confusionMatrix(as.factor(class),as.factor(real))

dtmodv1 <- confmat$table
dtmodv1 %>% kable(caption = "Confusion Matrix") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% 
  column_spec(1, bold = T,color = "white", background = "#ff6666") %>% 
  row_spec(0, bold = T,color = "white", background = "#ff4a4a") %>% 
  add_header_above(c(" "=1, "PREDICTED  - REAL" = 16))

Confusion Matrix
	PREDICTED - REAL
	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0	4	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0
1	0	28	0	0	0	0	0	0	0	0	0	0	0	0	0	0
2	1	0	40	0	0	0	0	0	0	0	0	0	0	0	0	0
3	0	0	0	8	0	0	0	0	0	0	0	0	0	0	0	0
4	1	0	0	0	22	0	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	13	0	0	0	0	0	0	0	0	0	0
6	0	0	0	0	0	0	9	0	0	0	0	0	0	0	0	0
7	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0
8	1	0	0	0	0	0	0	0	11	0	0	0	0	0	0	0
9	2	0	0	0	0	0	0	0	0	27	0	0	0	0	0	0
10	0	0	0	0	0	0	0	0	0	0	34	0	0	0	0	0
11	0	0	0	0	0	0	0	0	0	0	0	4	0	0	0	0
12	0	0	0	0	0	0	0	0	0	0	0	0	15	0	0	0
13	0	0	0	0	0	0	0	0	0	0	0	0	0	25	0	0
14	0	0	0	0	0	0	0	0	0	0	0	0	0	0	62	0
15	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	7

lab2ErikVarga

Erik Varga

28/6/2020

Correlation between the data

Transform the data

The new data set transformed

Pick only the right Variables

Check the new scaled data set

Questions to Solve: Assignment

For hierarchical Clustering try to explain how to characterize the groups build

We now cut the tree and add the new labels to our data set

We can now check the structure of the label

Lets use the mean from every group to see the most relevant parameters in each cluster

For the K-means Clustering repeat the same problem

Lets use the mean from every group to see the most relevant parameters in each cluster

For the spectral Clustering repeat the process

Lets use the mean from every group to see the most relevant parameters in each cluster

Group Characterization

For density-based clustering repeat the process

Questions to Solve: Project

Take the results for the spectral Clustering with 2 groups

Test some classification technique to predict this label

DEEP LEARNING WITH 2 GROUPS

SHOW THE SPECTRAL CLUSTERING MODEL WITH 15 GROUPS

LETS USE DEEP LEARNING TO CLASSIFY THE LABELS

DEEP LEARNING WITH DBMS WITH 15 GROUPS

SHOW THE DBMS MODEL WITH 15 GROUPS

LETS USE DEEP LEARNING TO CLASSIFY THE LABELS

	PREDICTED - REAL
	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0	4	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0
1	0	28	0	0	0	0	0	0	0	0	0	0	0	0	0	0
2	1	0	40	0	0	0	0	0	0	0	0	0	0	0	0	0
3	0	0	0	8	0	0	0	0	0	0	0	0	0	0	0	0
4	1	0	0	0	22	0	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	13	0	0	0	0	0	0	0	0	0	0
6	0	0	0	0	0	0	9	0	0	0	0	0	0	0	0	0
7	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0
8	1	0	0	0	0	0	0	0	11	0	0	0	0	0	0	0
9	2	0	0	0	0	0	0	0	0	27	0	0	0	0	0	0
10	0	0	0	0	0	0	0	0	0	0	34	0	0	0	0	0
11	0	0	0	0	0	0	0	0	0	0	0	4	0	0	0	0
12	0	0	0	0	0	0	0	0	0	0	0	0	15	0	0	0
13	0	0	0	0	0	0	0	0	0	0	0	0	0	25	0	0
14	0	0	0	0	0	0	0	0	0	0	0	0	0	0	62	0
15	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	7

	PREDICTED - REAL
	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0	4	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0
1	0	28	0	0	0	0	0	0	0	0	0	0	0	0	0	0
2	1	0	40	0	0	0	0	0	0	0	0	0	0	0	0	0
3	0	0	0	8	0	0	0	0	0	0	0	0	0	0	0	0
4	1	0	0	0	22	0	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	13	0	0	0	0	0	0	0	0	0	0
6	0	0	0	0	0	0	9	0	0	0	0	0	0	0	0	0
7	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0
8	1	0	0	0	0	0	0	0	11	0	0	0	0	0	0	0
9	2	0	0	0	0	0	0	0	0	27	0	0	0	0	0	0
10	0	0	0	0	0	0	0	0	0	0	34	0	0	0	0	0
11	0	0	0	0	0	0	0	0	0	0	0	4	0	0	0	0
12	0	0	0	0	0	0	0	0	0	0	0	0	15	0	0	0
13	0	0	0	0	0	0	0	0	0	0	0	0	0	25	0	0
14	0	0	0	0	0	0	0	0	0	0	0	0	0	0	62	0
15	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	7

	PREDICTED - REAL
	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0	4	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0
1	0	28	0	0	0	0	0	0	0	0	0	0	0	0	0	0
2	1	0	40	0	0	0	0	0	0	0	0	0	0	0	0	0
3	0	0	0	8	0	0	0	0	0	0	0	0	0	0	0	0
4	1	0	0	0	22	0	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	13	0	0	0	0	0	0	0	0	0	0
6	0	0	0	0	0	0	9	0	0	0	0	0	0	0	0	0
7	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0
8	1	0	0	0	0	0	0	0	11	0	0	0	0	0	0	0
9	2	0	0	0	0	0	0	0	0	27	0	0	0	0	0	0
10	0	0	0	0	0	0	0	0	0	0	34	0	0	0	0	0
11	0	0	0	0	0	0	0	0	0	0	0	4	0	0	0	0
12	0	0	0	0	0	0	0	0	0	0	0	0	15	0	0	0
13	0	0	0	0	0	0	0	0	0	0	0	0	0	25	0	0
14	0	0	0	0	0	0	0	0	0	0	0	0	0	0	62	0
15	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	7