# Load the raw seeds table: eight unnamed columns (V1..V8), no header row.
#
# read.table() is used instead of read.delim() because read.delim() splits on
# every single tab character. The output recorded in this file (221 rows, NAs
# in every column, a class label of 5.439, and na.omit() later discarding rows
# in adjacent pairs such as 8/9 and 37/38) is what that produces when some
# lines separate fields with more than one tab: an empty field is inserted,
# the remaining values shift right, and the overflow wraps onto an extra row.
# read.table()'s default sep = "" treats any run of whitespace as a single
# separator, and its default fill = FALSE makes a genuinely ragged line an
# error instead of silently padding it with NA.
# NOTE(review): presumably the file is whitespace/tab padded as described --
# confirm against the raw file. The `##` output recorded throughout this
# script came from the old parse and will differ (row counts, NA counts,
# cluster sizes) after re-running.
seeds_dataset <- read.table(
  "C:/Users/Ozili Nwokobia/OneDrive/Desktop/seeds_dataset.txt",
  header = FALSE
)

# View() opens a GUI data viewer; only call it in an interactive session so
# batch runs (Rscript, knitting) do not depend on a display.
if (interactive()) {
  View(seeds_dataset)
}
str(seeds_dataset)
## 'data.frame': 221 obs. of 8 variables:
## $ V1: num 15.3 14.9 14.3 13.8 16.1 ...
## $ V2: num 14.8 14.6 14.1 13.9 15 ...
## $ V3: num 0.871 0.881 0.905 0.895 0.903 ...
## $ V4: num 5.76 5.55 5.29 5.32 5.66 ...
## $ V5: num 3.31 3.33 3.34 3.38 3.56 ...
## $ V6: num 2.22 1.02 2.7 2.26 1.35 ...
## $ V7: num 5.22 4.96 4.83 4.8 5.17 ...
## $ V8: num 1 1 1 1 1 1 1 5 NA 1 ...
# Descriptive names for the eight columns, in file order. The spellings
# (including the mixed "_" / "." separators and "asymetry") are kept exactly
# as they were because later code and the recorded output refer to them.
feature_name <- c(
  "area", "perimeter", "compactness", "length_of_kernel",
  "width.of.kernel", "asymetry.coefficient", "length.of.kernel.groove",
  "type.of.seed"
)

# Assigning a names vector of the wrong length to a data frame does not fail
# loudly (missing names become NA), so check the column count first.
stopifnot(length(feature_name) == ncol(seeds_dataset))
colnames(seeds_dataset) <- feature_name

# View() needs a GUI data viewer; skip it outside interactive sessions.
if (interactive()) {
  View(seeds_dataset)
}
str(seeds_dataset)
## 'data.frame': 221 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length_of_kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width.of.kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymetry.coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length.of.kernel.groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type.of.seed : num 1 1 1 1 1 1 1 5 NA 1 ...
# Optional visual inspection; View() needs a GUI data viewer, so it is only
# invoked when the script is run interactively.
if (interactive()) {
  View(seeds_dataset)
}
# Per-column five-number summary, mean and NA count.
summary(seeds_dataset)
## area perimeter compactness length_of_kernel
## Min. : 1.00 Min. : 1.00 Min. :0.8081 Min. :0.8189
## 1st Qu.:12.11 1st Qu.:13.43 1st Qu.:0.8577 1st Qu.:5.2447
## Median :14.13 Median :14.29 Median :0.8735 Median :5.5180
## Mean :14.29 Mean :14.43 Mean :0.8713 Mean :5.5639
## 3rd Qu.:17.09 3rd Qu.:15.69 3rd Qu.:0.8877 3rd Qu.:5.9798
## Max. :21.18 Max. :17.25 Max. :0.9183 Max. :6.6750
## NA's :1 NA's :9 NA's :14 NA's :11
## width.of.kernel asymetry.coefficient length.of.kernel.groove type.of.seed
## Min. :2.630 Min. :0.7651 Min. :3.485 Min. :1.000
## 1st Qu.:2.956 1st Qu.:2.6002 1st Qu.:5.045 1st Qu.:1.000
## Median :3.245 Median :3.5990 Median :5.226 Median :2.000
## Mean :3.281 Mean :3.6935 Mean :5.408 Mean :2.084
## 3rd Qu.:3.566 3rd Qu.:4.7687 3rd Qu.:5.879 3rd Qu.:3.000
## Max. :5.325 Max. :8.4560 Max. :6.735 Max. :5.439
## NA's :12 NA's :11 NA's :15 NA's :15
# Is at least one cell missing? Auto-prints a single TRUE/FALSE.
anyNA(seeds_dataset)
## [1] TRUE
# Keep only complete rows. na.omit() records the indices of the dropped rows
# in the "na.action" attribute, which the str() output below displays.
seeds_dataset <- stats::na.omit(seeds_dataset)
utils::str(seeds_dataset)
## 'data.frame': 199 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length_of_kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width.of.kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymetry.coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length.of.kernel.groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type.of.seed : num 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:22] 8 9 37 38 63 64 72 73 111 112 ...
## ..- attr(*, "names")= chr [1:22] "8" "9" "37" "38" ...
# Remove the eighth column (named type.of.seed above) so that only the seven
# measurement columns remain.
seeds_dataset <- seeds_dataset[, -8L]

# Standardize every remaining column (scale() centers and scales by default)
# and convert the resulting matrix back to a data frame.
seeds_dataset_sc <- local({
  standardized <- scale(seeds_dataset, center = TRUE, scale = TRUE)
  as.data.frame(standardized)
})
summary(seeds_dataset_sc)
## area perimeter compactness length_of_kernel
## Min. :-1.4825 Min. :-1.6680 Min. :-2.6891 Min. :-1.6776
## 1st Qu.:-0.8866 1st Qu.:-0.8591 1st Qu.:-0.5879 1st Qu.:-0.8480
## Median :-0.1674 Median :-0.1723 Median : 0.1110 Median :-0.2303
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8686 3rd Qu.: 0.9227 3rd Qu.: 0.6857 3rd Qu.: 0.8090
## Max. : 2.1443 Max. : 2.0254 Max. : 2.0364 Max. : 2.3261
## width.of.kernel asymetry.coefficient length.of.kernel.groove
## Min. :-1.67987 Min. :-1.99450 Min. :-1.8300
## 1st Qu.:-0.82214 1st Qu.:-0.76760 1st Qu.:-0.7604
## Median :-0.05427 Median :-0.04637 Median :-0.3910
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.79025 3rd Qu.: 0.74759 3rd Qu.: 0.9302
## Max. : 2.02861 Max. : 3.13764 Max. : 2.2921
# Compact structure dump of the standardized data frame.
utils::str(object = seeds_dataset_sc)
## 'data.frame': 199 obs. of 7 variables:
## $ area : num 0.1169 -0.0133 -0.2153 -0.3694 0.4182 ...
## $ perimeter : num 0.1863 -0.0197 -0.386 -0.5005 0.3008 ...
## $ compactness : num 0.00812 0.44123 1.4661 1.05872 1.39749 ...
## $ length_of_kernel : num 0.2702 -0.201 -0.7939 -0.7195 0.0335 ...
## $ width.of.kernel : num 0.123 0.178 0.189 0.3 0.784 ...
## $ asymetry.coefficient : num -1.005 -1.823 -0.68 -0.979 -1.594 ...
## $ length.of.kernel.groove: num -0.407 -0.943 -1.209 -1.25 -0.499 ...
# Pairwise Euclidean distances between the standardized observations.
dist_mat <- dist(seeds_dataset_sc, method = "euclidean")

# Agglomerative hierarchical clustering using average linkage.
hclust_avg <- hclust(dist_mat, method = "average")

# Cut the tree into three groups; the result is one integer cluster id per
# row. stats:: is spelled out because dendextend (attached further down)
# masks cutree, which matters when the script is re-run in the same session.
# The name `cut_age` is kept because later code refers to it.
cut_age <- stats::cutree(hclust_avg, k = 3)

# View() needs a GUI data viewer; skip it outside interactive sessions.
if (interactive()) {
  View(cut_age)
}
# Cluster id plotted against observation index.
plot(cut_age)

# Dendrogram of the full merge tree.
plot(hclust_avg)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree

# Convert the hclust result to a dendrogram so dendextend can decorate it.
avg_dend_obj <- as.dendrogram(hclust_avg)

# Colour the branches by the same three-cluster cut used for cutree(k = 3),
# so the colours are guaranteed to match the cluster ids stored in `cut_age`.
# The previous code cut at height h = 3, which only coincides with three
# clusters if the tree happens to have exactly three branches at that height.
# It also computed an un-cut colouring first whose result was immediately
# overwritten; that dead assignment is removed.
avg_col_bend <- color_branches(avg_dend_obj, k = 3)
plot(avg_col_bend)

library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Append the cluster assignment as a new `cluster` column next to the
# (unscaled) measurement columns.
seeds_df_cl <- dplyr::mutate(.data = seeds_dataset, cluster = cut_age)

# Scatterplot matrix of every column pair, cluster column included.
plot(seeds_df_cl)

# Number of observations per cluster.
dplyr::count(x = seeds_df_cl, cluster)
## cluster n
## 1 1 63
## 2 2 72
## 3 3 64
library(ggplot2)
# Area vs. perimeter, one point per observation, coloured by cluster id
# (converted to a factor so the colour scale is discrete).
ggplot(
  data = seeds_df_cl,
  mapping = aes(x = area, y = perimeter, color = factor(cluster))
) +
  geom_point()
