Samoucząca się sieć neuronowa - SOM
Biblioteki
library(kohonen)
library(ggplot2)
library(clusterSim)
library(dplyr)
Zbiór danych
Wczytanie danych
dane <- read.csv("C:/Users/majko/OneDrive/Dokumenty/Sztuczne_scie_neuronowe_R/uzytkownicy_portalu_spolecznosciowego.csv") %>%
glimpse()
## Rows: 5,000
## Columns: 5
## $ user_id <chr> "U001", "U002", "U003", "U004", "U005", "U006", "U007"…
## $ czas_na_stronie <dbl> 4.2172863, 2.8830514, 6.6452743, 0.1578868, 0.2810549,…
## $ liczba_klikniec <int> 8, 12, 12, 8, 14, 10, 7, 10, 6, 7, 10, 2, 8, 7, 11, 10…
## $ liczba_sesji <int> 2, 6, 9, 5, 3, 6, 4, 7, 7, 4, 3, 4, 5, 4, 3, 1, 6, 4, …
## $ konwersja <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
dane=as.data.frame(dane)
Podział na train/test
set.seed(123)
# indeksy
train_idx <- sample(1:nrow(dane), size = 0.7 * nrow(dane))
train_raw <- dane[train_idx, ]
test_raw <- dane[-train_idx, ]
train_scaled <- scale(train_raw[, -1]) # usuń tylko ID
test_scaled <- scale(test_raw[, -1])
Budowa sieci
Struktura sieci:
Liczba neuronów
Kształt sieci:
-arkusz
-cylinder
-toroid
Topologia sieci:
-prostokątna - “rectangular”
-heksagonalna - “hexagonal”
set.seed(100)
ads.grid <- somgrid(xdim = 5, ydim = 5, topo = "rectangular")
Utworzenie modelu SOM
set.seed(100)
ads.model <- som(train_scaled, ads.grid, rlen = 500, radius = 2, keep.data = TRUE,
dist.fcts = "euclidean")
str(ads.model)
## List of 14
## $ data :List of 1
## ..$ : num [1:3500, 1:4] -0.554 -0.881 2.95 -0.974 -0.29 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:3500] "2463" "2511" "2227" "526" ...
## .. .. ..$ : chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## .. ..- attr(*, "scaled:center")= Named num [1:4] 5.059 9.95 5.057 0.297
## .. .. ..- attr(*, "names")= chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## .. ..- attr(*, "scaled:scale")= Named num [1:4] 5.145 3.13 2.259 0.457
## .. .. ..- attr(*, "names")= chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## $ unit.classif : num [1:3500] 15 21 2 22 10 8 3 6 16 24 ...
## $ distances : num [1:3500] 0.307 1.154 0.881 0.785 0.769 ...
## $ grid :List of 6
## ..$ pts : int [1:25, 1:2] 1 2 3 4 5 1 2 3 4 5 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:2] "x" "y"
## ..$ xdim : num 5
## ..$ ydim : num 5
## ..$ topo : chr "rectangular"
## ..$ neighbourhood.fct: Factor w/ 2 levels "bubble","gaussian": 1
## ..$ toroidal : logi FALSE
## ..- attr(*, "class")= chr "somgrid"
## $ codes :List of 1
## ..$ : num [1:25, 1:4] 5.24 2.638 0.889 -0.514 -0.428 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:25] "V1" "V2" "V3" "V4" ...
## .. .. ..$ : chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## $ changes : num [1:500, 1] 0.00983 0.00971 0.00965 0.00962 0.00961 ...
## $ alpha : num [1:2] 0.05 0.01
## $ radius : num [1:2] 2 0
## $ na.rows : int(0)
## $ user.weights : num 1
## $ distance.weights: num 1
## $ whatmap : int 1
## $ maxNA.fraction : int 0
## $ dist.fcts : chr "euclidean"
## - attr(*, "class")= chr "kohonen"
head(ads.model$unit.classif, 10 )
## [1] 15 21 2 22 10 8 3 6 16 24
table(ads.model$unit.classif)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 19 20 21
## 19 87 147 239 125 57 116 260 307 157 140 138 204 166 233 140 200 69 43 93
## 22 23 24 25
## 100 163 193 104
ads.model$grid
## $pts
## x y
## [1,] 1 1
## [2,] 2 1
## [3,] 3 1
## [4,] 4 1
## [5,] 5 1
## [6,] 1 2
## [7,] 2 2
## [8,] 3 2
## [9,] 4 2
## [10,] 5 2
## [11,] 1 3
## [12,] 2 3
## [13,] 3 3
## [14,] 4 3
## [15,] 5 3
## [16,] 1 4
## [17,] 2 4
## [18,] 3 4
## [19,] 4 4
## [20,] 5 4
## [21,] 1 5
## [22,] 2 5
## [23,] 3 5
## [24,] 4 5
## [25,] 5 5
##
## $xdim
## [1] 5
##
## $ydim
## [1] 5
##
## $topo
## [1] "rectangular"
##
## $neighbourhood.fct
## [1] bubble
## Levels: bubble gaussian
##
## $toroidal
## [1] FALSE
##
## attr(,"class")
## [1] "somgrid"
plot(ads.model, type = "mapping", pchs = 19, shape = "round")
plot(ads.model, type = "codes", main = "Codes Plot", palette.name = rainbow)
plot(ads.model, type = "changes")
plot(ads.model, type = "counts")
plot(ads.model, type = "dist.neighbours")
Wkład poszczególnych zmiennych
heatmap.som <- function(model){
for (i in 1:4) {
plot(model, type = "property", property = getCodes(model)[,i],
main = colnames(getCodes(model))[i])
}
}
par(mfrow=c(2,2))
heatmap.som(ads.model)
### Sprawdzenie na zbiorze testowym
test_mapping <- predict(ads.model, newdata = test_scaled)
# przypisanie klastrów
grupy <- kmeans(ads.model$codes[[1]], centers = 3)
train_raw$grupa <- grupy$cluster[ads.model$unit.classif]
test_raw$grupa <- grupy$cluster[test_mapping$unit.classif]
# porównanie
aggregate(konwersja ~ grupa, data = train_raw, mean)
## grupa konwersja
## 1 1 0.1414956
## 2 2 0.4196668
## 3 3 0.2363636
aggregate(konwersja ~ grupa, data = test_raw, mean)
## grupa konwersja
## 1 1 0.1477080
## 2 2 0.4142313
## 3 3 0.1612903
par(mfrow = c(1,2))
plot(ads.model, type = "mapping",
main = "Train")
plot(ads.model, type = "mapping",
bgcol = test_mapping$unit.classif,
main = "Test (projekcja)")
Zadanie 1
Proszę zmieniając liczbę neuronów na 7x7, 10x10 sprawdzić co się zmieni.
KLASYFIKACJA
set.seed(100)
ads.grid <- somgrid(xdim = 10, ydim = 10, topo = "hexagonal")
set.seed(100)
ads.model <- som(train_scaled, ads.grid, rlen = 500, radius = 2, keep.data = TRUE,
dist.fcts = "euclidean")
str(ads.model)
## List of 14
## $ data :List of 1
## ..$ : num [1:3500, 1:4] -0.554 -0.881 2.95 -0.974 -0.29 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:3500] "2463" "2511" "2227" "526" ...
## .. .. ..$ : chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## .. ..- attr(*, "scaled:center")= Named num [1:4] 5.059 9.95 5.057 0.297
## .. .. ..- attr(*, "names")= chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## .. ..- attr(*, "scaled:scale")= Named num [1:4] 5.145 3.13 2.259 0.457
## .. .. ..- attr(*, "names")= chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## $ unit.classif : num [1:3500] 53 80 19 88 71 74 7 29 69 46 ...
## $ distances : num [1:3500] 0.296 0.559 0.794 0.79 0.434 ...
## $ grid :List of 6
## ..$ pts : num [1:100, 1:2] 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:2] "x" "y"
## ..$ xdim : num 10
## ..$ ydim : num 10
## ..$ topo : chr "hexagonal"
## ..$ neighbourhood.fct: Factor w/ 2 levels "bubble","gaussian": 1
## ..$ toroidal : logi FALSE
## ..- attr(*, "class")= chr "somgrid"
## $ codes :List of 1
## ..$ : num [1:100, 1:4] 0.173 1.953 2.15 2.551 0.583 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:100] "V1" "V2" "V3" "V4" ...
## .. .. ..$ : chr [1:4] "czas_na_stronie" "liczba_klikniec" "liczba_sesji" "konwersja"
## $ changes : num [1:500, 1] 0.00764 0.00737 0.00719 0.00705 0.00701 ...
## $ alpha : num [1:2] 0.05 0.01
## $ radius : num [1:2] 2 0
## $ na.rows : int(0)
## $ user.weights : num 1
## $ distance.weights: num 1
## $ whatmap : int 1
## $ maxNA.fraction : int 0
## $ dist.fcts : chr "euclidean"
## - attr(*, "class")= chr "kohonen"
plot(ads.model, type = "mapping", pchs = 19, shape = "round")
neurony<-as.data.frame( ads.model$codes)
dd = (dist(neurony, method = "euclidean"))
fitc <- hclust(dd, method="ward.D")
plot(fitc, hang=-1)
source("http://addictedtor.free.fr/packages/A2R/lastVersion/R/code.R")
op = par(bg = "#EFEFEF")
par(mfrow=c(1,1))
wys<-c(0,fitc$height)
Mojena1<-mean(wys)+1.25* sd(wys)
Mojena1
## [1] 9.354347
A2Rplot(fitc, k = 6, boxes = TRUE, col.up = "gray50",
col.down = c("#FF6B6B","#8470FF","green4","#66CDAA","#8B7E66","grey"), main="Klasyfikacja" )
groupes <- cutree(fitc,k=6)
table(groupes)
## groupes
## 1 2 3 4 5 6
## 16 16 22 21 17 8
plot(ads.model,type="mapping",bgcol=c("steelblue1","sienna1","yellowgreen","grey","pink")[groupes])
add.cluster.boundaries(ads.model,clustering=groupes)
wnk.korelacje.kofenetyczne <- c()
met.dist <- c( "euclidean")
met.hcl <- c("single", "complete", "average","ward.D","centroid")
for(i in 1:1){ for(j in 1:5){ dds = dist(neurony, method = met.dist[i])
hcdds = hclust((dds), method = met.hcl[j])
hcdds.c = cophenetic(hcdds)
korelacja.kofenetyczna = cor(hcdds.c, dds)
wnk.korelacje.kofenetyczne = c(wnk.korelacje.kofenetyczne, korelacja.kofenetyczna)}}
wnk.korelacje.kofenetyczne = matrix(round( wnk.korelacje.kofenetyczne, 5), ncol = 1)
colnames(wnk.korelacje.kofenetyczne)<-"euclidean"
met.hcl1 <- c("single", "complete", "average","ward.D", "centroid")
rownames(wnk.korelacje.kofenetyczne)<-met.hcl1
wnk.korelacje.kofenetyczne
## euclidean
## single 0.68683
## complete 0.58632
## average 0.71043
## ward.D 0.53592
## centroid 0.64653