# Load the seeds measurements; the file has no header row, so the columns
# arrive as V1..V8.
#
# read.table() is used instead of read.delim(). read.delim() splits on every
# single tab and pads short rows (fill = TRUE), and the output recorded below
# from that call shows the symptoms of a mis-split file: 221 rows, NA values
# in every column, an `area` minimum of 1, a `type.of.seed` maximum of 5.439,
# and the rows later dropped by na.omit() arriving in adjacent pairs
# (8/9, 37/38, ...). That is the pattern an extra separator in a row produces:
# an empty field, shifted values, and an overflow row.
# NOTE(review): presumably some rows separate fields with two consecutive
# tabs -- confirm against the raw file.
#
# read.table()'s default sep = "" treats any run of whitespace as a single
# delimiter, and its default fill = FALSE turns a malformed row into an error
# instead of silent NA padding. The recorded output in this file predates this
# change and will differ on re-run.
seeds_dataset <- read.table(
  "C:/Users/Ozili Nwokobia/OneDrive/Desktop/seeds_dataset.txt",
  header = FALSE
)
View(seeds_dataset)  # open the data in the interactive viewer
str(seeds_dataset)   # column types and leading values
## 'data.frame':    221 obs. of  8 variables:
##  $ V1: num  15.3 14.9 14.3 13.8 16.1 ...
##  $ V2: num  14.8 14.6 14.1 13.9 15 ...
##  $ V3: num  0.871 0.881 0.905 0.895 0.903 ...
##  $ V4: num  5.76 5.55 5.29 5.32 5.66 ...
##  $ V5: num  3.31 3.33 3.34 3.38 3.56 ...
##  $ V6: num  2.22 1.02 2.7 2.26 1.35 ...
##  $ V7: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ V8: num  1 1 1 1 1 1 1 5 NA 1 ...
 # Replace the default V1..V8 column names with descriptive ones, then
 # re-inspect the data frame.
 feature_name <- c(
   "area",
   "perimeter",
   "compactness",
   "length_of_kernel",
   "width.of.kernel",
   "asymetry.coefficient",
   "length.of.kernel.groove",
   "type.of.seed"
 )
 names(seeds_dataset) <- feature_name
 View(seeds_dataset)
 str(object = seeds_dataset)
## 'data.frame':    221 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length_of_kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width.of.kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymetry.coefficient   : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length.of.kernel.groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type.of.seed           : num  1 1 1 1 1 1 1 5 NA 1 ...
 # Open the data in the viewer, then print per-column summary statistics
 # (quartiles, mean and NA counts).
 View(seeds_dataset)
 summary(object = seeds_dataset)
##       area         perimeter      compactness     length_of_kernel
##  Min.   : 1.00   Min.   : 1.00   Min.   :0.8081   Min.   :0.8189  
##  1st Qu.:12.11   1st Qu.:13.43   1st Qu.:0.8577   1st Qu.:5.2447  
##  Median :14.13   Median :14.29   Median :0.8735   Median :5.5180  
##  Mean   :14.29   Mean   :14.43   Mean   :0.8713   Mean   :5.5639  
##  3rd Qu.:17.09   3rd Qu.:15.69   3rd Qu.:0.8877   3rd Qu.:5.9798  
##  Max.   :21.18   Max.   :17.25   Max.   :0.9183   Max.   :6.6750  
##  NA's   :1       NA's   :9       NA's   :14       NA's   :11      
##  width.of.kernel asymetry.coefficient length.of.kernel.groove  type.of.seed  
##  Min.   :2.630   Min.   :0.7651       Min.   :3.485           Min.   :1.000  
##  1st Qu.:2.956   1st Qu.:2.6002       1st Qu.:5.045           1st Qu.:1.000  
##  Median :3.245   Median :3.5990       Median :5.226           Median :2.000  
##  Mean   :3.281   Mean   :3.6935       Mean   :5.408           Mean   :2.084  
##  3rd Qu.:3.566   3rd Qu.:4.7687       3rd Qu.:5.879           3rd Qu.:3.000  
##  Max.   :5.325   Max.   :8.4560       Max.   :6.735           Max.   :5.439  
##  NA's   :12      NA's   :11           NA's   :15              NA's   :15
 # Report whether any cell is missing, then keep only the complete rows and
 # inspect what is left.
 anyNA(seeds_dataset)
## [1] TRUE
 seeds_dataset <- na.omit(object = seeds_dataset)
 str(object = seeds_dataset)
## 'data.frame':    199 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length_of_kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width.of.kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymetry.coefficient   : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length.of.kernel.groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type.of.seed           : num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:22] 8 9 37 38 63 64 72 73 111 112 ...
##   ..- attr(*, "names")= chr [1:22] "8" "9" "37" "38" ...
 # Drop the eighth column (type.of.seed) so only the seven measurements
 # remain, standardise every remaining column with scale(), and summarise
 # the standardised values.
 seeds_dataset <- seeds_dataset[-8]
 seeds_dataset_sc <- as.data.frame(
   scale(seeds_dataset)
 )
 summary(object = seeds_dataset_sc)
##       area           perimeter        compactness      length_of_kernel 
##  Min.   :-1.4825   Min.   :-1.6680   Min.   :-2.6891   Min.   :-1.6776  
##  1st Qu.:-0.8866   1st Qu.:-0.8591   1st Qu.:-0.5879   1st Qu.:-0.8480  
##  Median :-0.1674   Median :-0.1723   Median : 0.1110   Median :-0.2303  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.8686   3rd Qu.: 0.9227   3rd Qu.: 0.6857   3rd Qu.: 0.8090  
##  Max.   : 2.1443   Max.   : 2.0254   Max.   : 2.0364   Max.   : 2.3261  
##  width.of.kernel    asymetry.coefficient length.of.kernel.groove
##  Min.   :-1.67987   Min.   :-1.99450     Min.   :-1.8300        
##  1st Qu.:-0.82214   1st Qu.:-0.76760     1st Qu.:-0.7604        
##  Median :-0.05427   Median :-0.04637     Median :-0.3910        
##  Mean   : 0.00000   Mean   : 0.00000     Mean   : 0.0000        
##  3rd Qu.: 0.79025   3rd Qu.: 0.74759     3rd Qu.: 0.9302        
##  Max.   : 2.02861   Max.   : 3.13764     Max.   : 2.2921
 # Show the structure (column types and leading values) of the scaled data.
 str(object = seeds_dataset_sc)
## 'data.frame':    199 obs. of  7 variables:
##  $ area                   : num  0.1169 -0.0133 -0.2153 -0.3694 0.4182 ...
##  $ perimeter              : num  0.1863 -0.0197 -0.386 -0.5005 0.3008 ...
##  $ compactness            : num  0.00812 0.44123 1.4661 1.05872 1.39749 ...
##  $ length_of_kernel       : num  0.2702 -0.201 -0.7939 -0.7195 0.0335 ...
##  $ width.of.kernel        : num  0.123 0.178 0.189 0.3 0.784 ...
##  $ asymetry.coefficient   : num  -1.005 -1.823 -0.68 -0.979 -1.594 ...
##  $ length.of.kernel.groove: num  -0.407 -0.943 -1.209 -1.25 -0.499 ...
 # Hierarchical clustering of the standardised measurements:
 # pairwise Euclidean distances -> average-linkage tree -> cut into 3 groups.
 dist_mat <- dist(x = seeds_dataset_sc, method = "euclidean")
 hclust_avg <- hclust(d = dist_mat, method = "average")
 # stats::cutree is the function in effect at this point; dendextend, which
 # masks it, is attached only further down the script.
 cut_age <- stats::cutree(tree = hclust_avg, k = 3)
 View(cut_age)
 plot(cut_age)  # cluster id against observation index

 plot(hclust_avg)  # dendrogram of the average-linkage tree
 library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree

 # Convert the average-linkage tree to a dendrogram and colour its branches
 # by cluster before plotting.
 #
 # Changes from the previous version:
 #  - the first color_branches() call (no k/h) was overwritten on the very
 #    next line without being used, so it is removed;
 #  - branches are coloured with k = 3 instead of h = 3, so the colouring
 #    follows the same "three clusters" rule as cutree(hclust_avg, k = 3),
 #    which produces the cluster labels, rather than a fixed cut height that
 #    only yields three groups for particular data.
 avg_dend_obj <- as.dendrogram(hclust_avg)
 avg_col_bend <- color_branches(avg_dend_obj, k = 3)
 plot(avg_col_bend)

 library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Attach each observation's cluster id to the unscaled measurements, draw the
# default data-frame plot (a scatterplot matrix), and tabulate cluster sizes.
seeds_df_cl <- dplyr::mutate(seeds_dataset, cluster = cut_age)
plot(x = seeds_df_cl)

dplyr::count(seeds_df_cl, cluster)
##   cluster  n
## 1       1 63
## 2       2 72
## 3       3 64
 library(ggplot2)
 # Scatter plot of perimeter against area, one colour per cluster.
 ggplot(
   data = seeds_df_cl,
   mapping = aes(x = area, y = perimeter, colour = factor(cluster))
 ) +
   geom_point()