biopsy = read.csv("C:\\Thach\\VLU workshop (Jun2023)\\Datasets\\biopsy.csv")
dim(biopsy)
## [1] 683 11
head(biopsy)
## ID V1 V2 V3 V4 V5 V6 V7 V8 V9 class.n
## 1 1000025 5 1 1 1 2 1 3 1 1 benign
## 2 1002945 5 4 4 5 7 10 3 2 1 benign
## 3 1015425 3 1 1 1 2 2 3 1 1 benign
## 4 1016277 6 8 8 1 3 4 3 7 1 unknown
## 5 1017023 4 1 1 3 2 1 3 1 1 benign
## 6 1017122 8 10 10 8 7 10 9 7 1 malignant
Chuẩn hóa dữ liệu
biopsy[2:10] = scale(biopsy[2:10])
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(234)
index = createDataPartition(biopsy$class, p = 0.6, list = FALSE)
train = biopsy[index, ]
dim(train)
## [1] 411 11
test = biopsy[-index, ]
dim(test)
## [1] 272 11
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
##
## biopsy
m.lda = lda(class.n ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9, data = train)
m.lda
## Call:
## lda(class.n ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9, data = train)
##
## Prior probabilities of groups:
## benign malignant unknown
## 0.64476886 0.35036496 0.00486618
##
## Group means:
## V1 V2 V3 V4 V5 V6
## benign -0.5554157 -0.6413725 -0.6376916 -0.53219326 -0.5484118 -0.6382764
## malignant 0.9141739 1.1425145 1.1084026 0.97566007 0.9910727 1.0549900
## unknown 0.3750169 1.5820442 1.2664115 0.05928967 0.7942742 1.2227000
## V7 V8 V9
## benign -0.58836739 -0.55808745 -0.2980529
## malignant 1.05145192 1.00950970 0.6177676
## unknown 0.02241291 0.04268644 -0.3481446
##
## Coefficients of linear discriminants:
## LD1 LD2
## V1 0.41593647 -0.27145819
## V2 0.62412386 1.75461339
## V3 0.20961207 0.05880605
## V4 0.13352925 -0.48912666
## V5 0.23447786 -0.28817629
## V6 0.98433350 0.58029247
## V7 0.33880717 -0.89697990
## V8 0.33974836 -0.63026554
## V9 -0.07696674 -0.23431148
##
## Proportion of trace:
## LD1 LD2
## 0.9921 0.0079
predicted = predict(m.lda, test)
names(predicted)
## [1] "class" "posterior" "x"
head(predicted$class)
## [1] malignant benign malignant benign benign benign
## Levels: benign malignant unknown
mean(predicted$class == test$class.n)
## [1] 0.9411765
Đọc dữ liệu vào R
library(MASS)
data(iris)
dim(iris)
## [1] 150 5
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Quan sát dữ liệu
pairs(iris[1:4], pch = 21, bg = c("red", "blue", "orange")[unclass(iris$Species)])
s.iris = scale(iris[1:4], center = TRUE, scale = TRUE)
head(s.iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.8976739 1.01560199 -1.335752 -1.311052
## [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
## [3,] -1.3807271 0.32731751 -1.392399 -1.311052
## [4,] -1.5014904 0.09788935 -1.279104 -1.311052
## [5,] -1.0184372 1.24503015 -1.335752 -1.311052
## [6,] -0.5353840 1.93331463 -1.165809 -1.048667
s.corr = cor(s.iris)
round(s.corr, 2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.00 -0.12 0.87 0.82
## Sepal.Width -0.12 1.00 -0.43 -0.37
## Petal.Length 0.87 -0.43 1.00 0.96
## Petal.Width 0.82 -0.37 0.96 1.00
eigen = eigen(s.corr)
eigen
## eigen() decomposition
## $values
## [1] 2.91849782 0.91403047 0.14675688 0.02071484
##
## $vectors
## [,1] [,2] [,3] [,4]
## [1,] 0.5210659 -0.37741762 0.7195664 0.2612863
## [2,] -0.2693474 -0.92329566 -0.2443818 -0.1235096
## [3,] 0.5804131 -0.02449161 -0.1421264 -0.8014492
## [4,] 0.5648565 -0.06694199 -0.6342727 0.5235971
pca = prcomp(iris[1:4], center = T, scale = T)
pca
## Standard deviations (1, .., p=4):
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Sepal.Length 0.5210659 -0.37741762 0.7195664 0.2612863
## Sepal.Width -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length 0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width 0.5648565 -0.06694199 -0.6342727 0.5235971
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
Vẽ biểu đồ các thành phần
library(devtools)
## Loading required package: usethis
#install.packages("remotes")
#remotes::install_github("vqv/ggbiplot")
library(ggbiplot)
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
ggbiplot(pca, obs.scale = 1, var.scale = 1, groups = iris$Species, ellipse = T, circle = T) + scale_color_discrete(name = '') + theme(legend.direction = 'horizontal', legend.position = "top")
Đọc dữ liệu iris vào R
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Tạo tập dữ liệu mới không có biến Species
dat = iris
dat$Species = NULL
head(dat)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
Xác định các cụm
# Tính khoảng cách (distance)
d = dist(dat)
# Xác định các cụm
hc = hclust(d)
plot(hc)
Đọc dữ liệu USArrests vào R
library(datasets)
data("USArrests")
dim(USArrests)
## [1] 50 4
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
Đọc dữ liệu USArrests
library(datasets)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
library(cluster)
s.USArrests = scale(USArrests)
head(s.USArrests)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
Xác định số cụm tối ưu
fviz_nbclust(s.USArrests, kmeans, nstart = 25, method = "gap_stat", nboot = 50) + labs(subtitle = "Gap statistic method")
# from 'cluster' package
g.stat = clusGap(s.USArrests, FUN = kmeans, nstart = 25, K.max = 10, B = 100)
fviz_gap_stat(g.stat)
km = kmeans(s.USArrests, 3, nstart = 25)
fviz_cluster(km, s.USArrests)
sil = silhouette(km$cluster, dist(s.USArrests))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 17 0.32
## 2 2 13 0.37
## 3 3 20 0.26
hc = eclust(s.USArrests, "hclust")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fviz_dend(hc, rect = TRUE)