data(cars, package = "caret")
df <- cars
str(df)
## 'data.frame': 804 obs. of 18 variables:
## $ Price : num 22661 21725 29143 30732 33359 ...
## $ Mileage : int 20105 13457 31655 22479 17590 23635 17381 27558 25049 17319 ...
## $ Cylinder : int 6 6 4 4 4 4 4 4 4 4 ...
## $ Doors : int 4 2 2 2 2 2 2 2 2 4 ...
## $ Cruise : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Sound : int 0 1 1 0 1 0 1 0 0 0 ...
## $ Leather : int 0 0 1 0 1 0 1 1 0 1 ...
## $ Buick : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Cadillac : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Chevy : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Pontiac : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Saab : int 0 0 1 1 1 1 1 1 1 1 ...
## $ Saturn : int 0 0 0 0 0 0 0 0 0 0 ...
## $ convertible: int 0 0 1 1 1 1 1 1 1 0 ...
## $ coupe : int 0 1 0 0 0 0 0 0 0 0 ...
## $ hatchback : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sedan : int 1 0 0 0 0 0 0 0 0 1 ...
## $ wagon : int 0 0 0 0 0 0 0 0 0 0 ...
df %>% cor() %>% round(.,digits=ndig) %>% mykable()
| Price | Mileage | Cylinder | Doors | Cruise | Sound | Leather | Buick | Cadillac | Chevy | Pontiac | Saab | Saturn | convertible | coupe | hatchback | sedan | wagon | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Price | 1.00 | -0.14 | 0.57 | -0.14 | 0.43 | -0.12 | 0.16 | -0.02 | 0.66 | -0.40 | -0.14 | 0.34 | -0.21 | 0.51 | -0.17 | -0.21 | -0.03 | 0.05 |
| Mileage | -0.14 | 1.00 | -0.03 | -0.02 | 0.03 | -0.03 | 0.00 | 0.02 | -0.04 | -0.02 | -0.03 | 0.06 | 0.02 | 0.03 | 0.00 | -0.03 | -0.02 | 0.03 |
| Cylinder | 0.57 | -0.03 | 1.00 | 0.00 | 0.35 | -0.09 | 0.08 | 0.18 | 0.53 | -0.16 | 0.11 | -0.37 | -0.19 | 0.06 | -0.04 | -0.06 | 0.18 | -0.27 |
| Doors | -0.14 | -0.02 | 0.00 | 1.00 | -0.05 | -0.06 | -0.06 | 0.18 | 0.09 | -0.15 | 0.04 | -0.03 | -0.06 | -0.46 | -0.83 | 0.16 | 0.69 | 0.16 |
| Cruise | 0.43 | 0.03 | 0.35 | -0.05 | 1.00 | -0.09 | -0.07 | 0.19 | 0.19 | -0.29 | 0.00 | 0.23 | -0.20 | 0.15 | -0.04 | -0.26 | 0.13 | -0.04 |
| Sound | -0.12 | -0.03 | -0.09 | -0.06 | -0.09 | 1.00 | 0.17 | -0.01 | -0.09 | 0.26 | -0.07 | -0.09 | -0.14 | -0.04 | 0.10 | 0.07 | -0.02 | -0.14 |
| Leather | 0.16 | 0.00 | 0.08 | -0.06 | -0.07 | 0.17 | 1.00 | -0.21 | 0.21 | 0.16 | -0.09 | 0.00 | -0.15 | 0.01 | 0.06 | 0.09 | -0.10 | 0.00 |
| Buick | -0.02 | 0.02 | 0.18 | 0.18 | 0.19 | -0.01 | -0.21 | 1.00 | -0.11 | -0.27 | -0.16 | -0.14 | -0.09 | -0.09 | -0.15 | -0.09 | 0.27 | -0.10 |
| Cadillac | 0.66 | -0.04 | 0.53 | 0.09 | 0.19 | -0.09 | 0.21 | -0.11 | 1.00 | -0.27 | -0.16 | -0.14 | -0.09 | 0.09 | -0.15 | -0.09 | 0.18 | -0.10 |
| Chevy | -0.40 | -0.02 | -0.16 | -0.15 | -0.29 | 0.26 | 0.16 | -0.27 | -0.27 | 1.00 | -0.39 | -0.33 | -0.23 | -0.10 | 0.23 | 0.35 | -0.18 | -0.24 |
| Pontiac | -0.14 | -0.03 | 0.11 | 0.04 | 0.00 | -0.07 | -0.09 | -0.16 | -0.16 | -0.39 | 1.00 | -0.19 | -0.14 | -0.12 | 0.03 | -0.14 | -0.01 | 0.21 |
| Saab | 0.34 | 0.06 | -0.37 | -0.03 | 0.23 | -0.09 | 0.00 | -0.14 | -0.14 | -0.33 | -0.19 | 1.00 | -0.12 | 0.34 | -0.19 | -0.12 | -0.14 | 0.33 |
| Saturn | -0.21 | 0.02 | -0.19 | -0.06 | -0.20 | -0.14 | -0.15 | -0.09 | -0.09 | -0.23 | -0.14 | -0.12 | 1.00 | -0.07 | 0.12 | -0.08 | 0.03 | -0.08 |
| convertible | 0.51 | 0.03 | 0.06 | -0.46 | 0.15 | -0.04 | 0.01 | -0.09 | 0.09 | -0.10 | -0.12 | 0.34 | -0.07 | 1.00 | -0.12 | -0.07 | -0.32 | -0.08 |
| coupe | -0.17 | 0.00 | -0.04 | -0.83 | -0.04 | 0.10 | 0.06 | -0.15 | -0.15 | 0.23 | 0.03 | -0.19 | 0.12 | -0.12 | 1.00 | -0.13 | -0.57 | -0.14 |
| hatchback | -0.21 | -0.03 | -0.06 | 0.16 | -0.26 | 0.07 | 0.09 | -0.09 | -0.09 | 0.35 | -0.14 | -0.12 | -0.08 | -0.07 | -0.13 | 1.00 | -0.35 | -0.08 |
| sedan | -0.03 | -0.02 | 0.18 | 0.69 | 0.13 | -0.02 | -0.10 | 0.27 | 0.18 | -0.18 | -0.01 | -0.14 | 0.03 | -0.32 | -0.57 | -0.35 | 1.00 | -0.37 |
| wagon | 0.05 | 0.03 | -0.27 | 0.16 | -0.04 | -0.14 | 0.00 | -0.10 | -0.10 | -0.24 | 0.21 | 0.33 | -0.08 | -0.08 | -0.14 | -0.08 | -0.37 | 1.00 |
mx <- cor(df)
corr <- round(mx,ndig)
ggcorrplot(corr)
dftrans <- df %>% mutate(Pricel=log(Price),Mileagel=log(Mileage))
dftrans <- dftrans %>% mutate(PriceMile=Price/Mileage,PriceMilel=log(PriceMile))
summary(dftrans)
## Price Mileage Cylinder Doors
## Min. : 8639 Min. : 266 Min. :4.000 Min. :2.000
## 1st Qu.:14273 1st Qu.:14624 1st Qu.:4.000 1st Qu.:4.000
## Median :18025 Median :20914 Median :6.000 Median :4.000
## Mean :21343 Mean :19832 Mean :5.269 Mean :3.527
## 3rd Qu.:26717 3rd Qu.:25213 3rd Qu.:6.000 3rd Qu.:4.000
## Max. :70755 Max. :50387 Max. :8.000 Max. :4.000
## Cruise Sound Leather Buick
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.7525 Mean :0.6791 Mean :0.7239 Mean :0.0995
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Cadillac Chevy Pontiac Saab
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.0995 Mean :0.398 Mean :0.1866 Mean :0.1418
## 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## Saturn convertible coupe hatchback
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.07463 Mean :0.06219 Mean :0.1741 Mean :0.07463
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## sedan wagon Pricel Mileagel
## Min. :0.0000 Min. :0.0000 Min. : 9.064 Min. : 5.583
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 9.566 1st Qu.: 9.590
## Median :1.0000 Median :0.0000 Median : 9.800 Median : 9.948
## Mean :0.6095 Mean :0.0796 Mean : 9.879 Mean : 9.754
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:10.193 3rd Qu.:10.135
## Max. :1.0000 Max. :1.0000 Max. :11.167 Max. :10.827
## PriceMile PriceMilel
## Min. : 0.2484 Min. :-1.39263
## 1st Qu.: 0.6575 1st Qu.:-0.41933
## Median : 0.9588 Median :-0.04204
## Mean : 1.9734 Mean : 0.12533
## 3rd Qu.: 1.6604 3rd Qu.: 0.50705
## Max. :121.3644 Max. : 4.79880
dft1 <- dftrans %>% select(Cylinder:Mileagel,PriceMilel)
# Scale Data
dft1s <- dft1 %>% scale()
dft1s <- as.data.frame(dft1s)
dft1s %>% summarise_all(list(mean,sd))
set.seed(1350)
hc <- dft1s %>% dist(method = "euclidean") %>% hclust(method = "ward.D2")
fviz_dend(hc,k=15,rect=TRUE)
corte <- cutree(hc,k=15)
dfhc <- df %>% mutate(label=corte)
dfhc %>% group_by(label) %>% summarise(count=n())
dfhc %>% head()
dfhc %>% group_by(label) %>% summarise_all(mean)
set.seed(1350)
kmns1 <- kmeans(dft1s, 15, nstart = 20)
library(cluster)
library(factoextra)
sil <- silhouette(kmns1$cluster, dist(dft1s))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 88 0.34
## 2 2 69 0.58
## 3 3 28 0.21
## 4 4 34 0.63
## 5 5 82 0.40
## 6 6 49 0.50
## 7 7 122 0.37
## 8 8 58 0.23
## 9 9 78 0.53
## 10 10 20 0.33
## 11 11 59 0.46
## 12 12 30 0.59
## 13 13 28 0.04
## 14 14 30 0.44
## 15 15 29 0.24
fviz_cluster(kmns1, data = dft1s)
dfkm <- df %>% mutate(label=kmns1$cluster)
dfkm %>% group_by(label) %>% summarise_all(mean)
set.seed(1350)
scM <- specc(dft1s %>% as.matrix(),centers=15)
labscM <- as.data.frame(factor(scM)) #
labelSp <- as.integer(labscM$`factor(scM)`)
fviz_cluster(list(data = dft1s, cluster = labelSp))+ggtitle("Spectral Clustering")
dfsp <- df %>% mutate(label=labelSp)
dfsp %>% group_by(label) %>% summarise_all(mean)
dbM1 <- dbscan(dft1s, eps = 2.5, MinPts = 5)
## Warning in dbscan(dft1s, eps = 2.5, MinPts = 5): converting argument MinPts
## (fpc) to minPts (dbscan)!
fviz_cluster(dbM1, data = dft1s, stand = FALSE,
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point",palette = "jco", ggtheme = theme_bw())
## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 15.
## Warning: Removed 278 rows containing missing values (geom_point).
dfdb <- df %>% mutate(label=dbM1$cluster)
dfdb %>% group_by(label) %>% summarise_all(mean)
scM1 <- specc(dft1s %>% as.matrix(),centers=2)
labscM1 <- as.data.frame(factor(scM1))
labelsCM1 <- as.integer(labscM1$`factor(scM1)`)
fviz_cluster(list(data = dft1s, cluster = labelsCM1))+ggtitle("Spectral Clustering")
# We set the label to -1 so the numbers start at 0 for the model
labelDL <- as.integer(labscM1$`factor(scM1)`)-1
df2 <- dft1s %>% mutate(group = labelDL)
ixTrain <- createDataPartition(df2$group,p=0.6,list=FALSE)
training <- df2[ixTrain,]
testing <- df2[-ixTrain,]
xtrain <- training %>% dplyr::select(-group)
ylabtrain <- training %>% dplyr::select(group)
xtest <- testing %>% dplyr::select(-group)
ylabtest <- testing %>% dplyr::select(group)
xtrainM <- as.matrix(xtrain)
ylabtrainM <- as.matrix(ylabtrain)
xtestM <- as.matrix(xtest)
ylabtestM <- as.matrix(ylabtest)
mod2 <- keras_model_sequential()
mod2 %>% layer_dense(units = 8, activation = 'relu', input_shape = c(19)) %>%
layer_dense(units = 3, activation = 'softmax')
mod2 %>% compile(
loss = 'sparse_categorical_crossentropy',
optimizer = 'adam',
metrics = 'accuracy'
)
history <- mod2 %>% fit(
xtrainM,
ylabtrainM,
epochs = 20,
batch_size = 10,
validation_split = 0.2
)
print(history)
## Trained on 386 samples (batch_size=10, epochs=20)
## Final epoch (plot to see history):
## loss: 0.0159
## accuracy: 1
## val_loss: 0.01461
## val_accuracy: 1
plot(history)
## `geom_smooth()` using formula 'y ~ x'
perf <- mod2 %>% evaluate(xtestM,ylabtestM)
perf
## $loss
## [1] 0.01693813
##
## $accuracy
## [1] 1
dim <- length(ylabtestM)
class <- mod2 %>% predict_classes(xtestM[1:dim,])
real <- ylabtestM
confmat <- confusionMatrix(as.factor(class),as.factor(real))
dtmodv1 <- confmat$table
dtmodv1 %>% kable(caption = "Confusion Matrix") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, bold = T,color = "white", background = "#ff6666") %>%
row_spec(0, bold = T,color = "white", background = "#ff4a4a") %>%
add_header_above(c(" "=1, "PREDICTED - REAL" = 2))
| 0 | 1 | |
|---|---|---|
| 0 | 27 | 0 |
| 1 | 0 | 294 |
dbM2 <- dbscan(dft1s, eps = 2.5, MinPts = 5)
## Warning in dbscan(dft1s, eps = 2.5, MinPts = 5): converting argument MinPts
## (fpc) to minPts (dbscan)!
fviz_cluster(dbM2, data = dft1s, stand = FALSE,
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point",palette = "jco", ggtheme = theme_bw())
## Warning: This manual palette can handle a maximum of 10 values. You have
## supplied 15.
## Warning: Removed 278 rows containing missing values (geom_point).
labelDBMS <- dbM2$cluster
df3 <- dft1s %>% mutate(group = labelDBMS)
ixTrain <- createDataPartition(df3$group,p=0.6,list=FALSE)
training <- df3[ixTrain,]
testing <- df3[-ixTrain,]
xtrain <- training %>% dplyr::select(-group)
ylabtrain <- training %>% dplyr::select(group)
xtest <- testing %>% dplyr::select(-group)
ylabtest <- testing %>% dplyr::select(group)
xtrainM <- as.matrix(xtrain)
ylabtrainM <- as.matrix(ylabtrain)
xtestM <- as.matrix(xtest)
ylabtestM <- as.matrix(ylabtest)
mod2 <- keras_model_sequential()
mod2 %>% layer_dense(units = 32, activation = 'relu', input_shape = c(19)) %>%
layer_dense(units = 16, activation = 'softmax')
mod2 %>% compile(
loss = 'sparse_categorical_crossentropy',
optimizer = 'adam',
metrics = 'accuracy'
)
history <- mod2 %>% fit(
xtrainM,
ylabtrainM,
epochs = 20,
batch_size = 10,
validation_split = 0.2
)
print(history)
## Trained on 387 samples (batch_size=10, epochs=20)
## Final epoch (plot to see history):
## loss: 0.05427
## accuracy: 0.9948
## val_loss: 0.1755
## val_accuracy: 0.9691
plot(history)
## `geom_smooth()` using formula 'y ~ x'
perf <- mod2 %>% evaluate(xtestM,ylabtestM)
perf
## $loss
## [1] 0.09083202
##
## $accuracy
## [1] 0.978125
dim <- length(ylabtestM)
class <- mod2 %>% predict_classes(xtestM[1:dim,])
real <- ylabtestM
confmat <- confusionMatrix(as.factor(class),as.factor(real))
dtmodv1 <- confmat$table
dtmodv1 %>% kable(caption = "Confusion Matrix") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, bold = T,color = "white", background = "#ff6666") %>%
row_spec(0, bold = T,color = "white", background = "#ff4a4a") %>%
add_header_above(c(" "=1, "PREDICTED - REAL" = 16))
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 0 | 0 | 0 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | 0 | 0 | 0 | 0 | 0 |
| 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 |
| 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 0 | 0 | 0 |
| 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | 0 | 0 |
| 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 62 | 0 |
| 15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |