library(datasets)

# Data set analysed by this script. The later sections plot the `UrbanPop`
# and `Murder` columns and join lower-cased row names onto map regions, which
# matches the built-in USArrests data (one row per US state, all numeric).
dataset <- USArrests
str(dataset)
row.names(dataset)

## Data preprocessing ----
# Number of rows that contain at least one missing value
sum(!complete.cases(dataset))
summary(dataset)

## Remove (or impute) observations with missing values
df <- na.omit(dataset)

## Rescale (standardize): centre every column and divide by its sd.
## scale() returns a numeric matrix that keeps the row names.
df <- scale(df, center = TRUE, scale = TRUE)
summary(df)

## Check the standardization: column sd / mean before, column sd after
vapply(dataset, sd, numeric(1))
colMeans(dataset)
apply(df, 2, sd)
library(factoextra)

# Pairwise distance matrix between the rows of the standardized data `df`,
# drawn as a heat map (get_dist() uses Euclidean distance by default).
distance <- get_dist(df)
fviz_dist(
  distance,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)

# k-means with 2 centers, 25 random starts and at most 100 iterations each.
km_output <- kmeans(
  df,
  centers = 2,
  nstart = 25,
  iter.max = 100,
  algorithm = "Hartigan-Wong"
)

# Inspect the structure of the fitted object
str(km_output)
names(km_output)
typeof(km_output)
length(km_output)

km_output$cluster   # cluster assigned to every observation
km_output$totss     # total sum of squares
km_output$withinss  # within-cluster sum of squares, one value per cluster
km_output$betweenss # between-cluster sum of squares

# Within-cluster plus between-cluster sums of squares add up to `totss`
sum(c(km_output$withinss, km_output$betweenss))

# Share of the total sum of squares that lies within the clusters
cohesion <- sum(km_output$withinss) / km_output$totss
cohesion

fviz_cluster(km_output, data = df)
library(dplyr)
library(ggplot2)

# Scatter plot of the standardized observations (UrbanPop vs. Murder), each
# point drawn as its row label and coloured by its k-means cluster.
# The labels are taken from the rows being plotted rather than from a
# separate object, so they stay aligned with `km_output$cluster` even when
# na.omit() dropped rows during preprocessing.
df %>%
  as.data.frame() %>%
  mutate(cluster = km_output$cluster, state = row.names(.)) %>%
  ggplot(
    aes(x = UrbanPop, y = Murder, color = factor(cluster), label = state)
  ) +
  geom_text()
# Lookup table from lower-cased row label to k-means cluster. The labels come
# from the names of the cluster vector, so both columns always have the same
# length and order. The column keeps the name `species` because the join
# below refers to it; with state-level data it holds state names.
cluster_df <- data.frame(
  species = tolower(names(km_output$cluster)),
  cluster = unname(km_output$cluster)
)

library(maps)

# Polygon outlines of the US states; `region` holds lower-case state names.
# ("state" is a database shipped with the maps package; "species" is not.)
objects_names <- map_data("state")

# Choropleth: every state polygon filled by the cluster of its state.
objects_names %>%
  left_join(cluster_df, by = c("region" = "species")) %>%
  ggplot() +
  geom_polygon(
    aes(x = long, y = lat, fill = as.factor(cluster), group = group),
    color = "white"
  ) +
  coord_fixed(1.3) +
  guides(fill = "none") + # hide the fill legend
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank()
  )
# Fix the RNG state so the random k-means starts below are reproducible.
set.seed(8)

# Total within-cluster sum of squares of a k-means fit of `df` with `k`
# clusters (25 random starts). Reads `df` from the enclosing environment.
wss <- function(k) {
  fit <- kmeans(df, centers = k, nstart = 25)
  fit$tot.withinss
}
# Elbow method: total within-cluster sum of squares for k = 1, ..., 15,
# plotted against k.
k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)
plot(
  x = k_values,
  y = wss_values,
  type = "b",
  frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of square"
)
# Agglomerative hierarchical clustering with complete linkage on the
# Euclidean distances between the rows of `dataset`. dist() matches its
# `method` argument case-sensitively, so the name must be lower case.
# NOTE(review): this clusters the raw `dataset`, while k-means was fitted on
# the standardized `df` -- confirm that the difference is intended.
hac_output <- hclust(
  dist(dataset, method = "euclidean"),
  method = "complete"
)
plot(hac_output) # dendrogram of the hierarchical clustering

# Cut the tree into two groups
hac_cut <- cutree(hac_output, 2)

# Print the label of every observation whose hierarchical group number
# differs from its k-means cluster number.
# NOTE(review): cluster numbers are arbitrary in both methods, so identical
# partitions with swapped numbering would be reported as all-different.
disagree <- hac_cut != km_output$cluster
for (label in names(hac_cut)[disagree]) {
  print(label)
}