# Pin the CRAN mirror so installation works in non-interactive runs.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install only the packages that are not available yet.
#
# The script previously called install.packages() unconditionally for every
# package on every run. On the original Windows machine this produced
# "cannot remove prior installation of package ..." / "Permission denied"
# warnings for 'cluster' and 'dplyr' (presumably because their DLLs were
# already loaded in the session) and re-downloaded everything each time.
required_packages <- c(
  "ggplot2", "cluster", "factoextra", "dendextend", "dplyr"
)
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(cluster)
## Warning: package 'cluster' was built under R version 4.2.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.2.3
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the rainfall table. The code below relies on the columns YEAR,
# subdivision and the monthly columns JUN..SEP being present.
rainfall_data <- read.csv("C:/Users/DELL/Desktop/rainfal india/rainfaLLIndia.csv")
head(rainfall_data)

# Count NA cells, then drop any incomplete rows before analysis.
# NOTE(review): the recorded summary shows a minimum of -99.9 in JUN.SEP,
# which looks like a missing-value sentinel that is.na() cannot detect --
# confirm with the data provider and recode it to NA if so.
sum(is.na(rainfall_data))
## [1] 0
rainfall_data_clean <- na.omit(rainfall_data)

# Standardise the monthly columns (mean 0, sd 1).
# scale() returns a one-column matrix; wrapping it in as.numeric() stores a
# plain numeric vector instead of a matrix column. The summary() transcript
# kept further down in this file was captured with the matrix columns, which
# is why its headers read "JUN.V1", "JUL.V1", ...
rainfall_data_scaled <- rainfall_data_clean %>%
  mutate(across(JUN:SEP, ~ as.numeric(scale(.x))))
head(rainfall_data_scaled)
summary(rainfall_data_scaled)
##  subdivision             YEAR            JUN.V1              JUL.V1       
##  Length:4332        Min.   :1901   Min.   :-1.411029   Min.   :-1.656272  
##  Class :character   1st Qu.:1931   1st Qu.:-0.677255   1st Qu.:-0.635599  
##  Mode  :character   Median :1961   Median :-0.387493   Median :-0.227833  
##                     Mean   :1961   Mean   : 0.000000   Mean   : 0.000000  
##                     3rd Qu.:1991   3rd Qu.: 0.328077   3rd Qu.: 0.267731  
##                     Max.   :2021   Max.   : 5.956016   Max.   : 7.525030  
##        AUG.V1              SEP.V1           JUN.SEP      
##  Min.   :-2.047104   Min.   :-2.175125   Min.   : -99.9  
##  1st Qu.:-0.705033   1st Qu.:-0.713518   1st Qu.: 565.9  
##  Median :-0.162728   Median :-0.179248   Median : 877.3  
##  Mean   : 0.000000   Mean   : 0.000000   Mean   :1057.1  
##  3rd Qu.: 0.467108   3rd Qu.: 0.505554   3rd Qu.:1287.9  
##  Max.   : 7.243149   Max.   : 7.496245   Max.   :4536.9
# Yearly rainfall trends, one line per subdivision.
# The table holds many rows for the same YEAR (one per subdivision), so
# without an explicit group geom_line() would join all of them into a single
# zig-zag path. Grouping by subdivision draws one series per region.
ggplot(rainfall_data_clean, aes(x = YEAR, y = JUN, group = subdivision)) +
  geom_line() +
  ggtitle("Rainfall in June over Years")

ggplot(rainfall_data_clean, aes(x = YEAR, y = JUL, group = subdivision)) +
  geom_line() +
  ggtitle("Rainfall in July over Years")

# Clustering input: only the standardised monthly columns.
df <- rainfall_data_scaled %>%
  select(JUN:SEP)

# Euclidean distances between all rows. Not used by the sampled clustering
# below; kept so any later code that refers to res.dist still works.
res.dist <- dist(df, method = "euclidean")

# The hierarchical clustering is fitted on a 10% random subset of df rather
# than on every row; set.seed() makes the draw reproducible. A former
# full-data ward.D2 fit was removed here because its result was overwritten
# below before ever being used.
set.seed(123)
sampled_data <- df[sample(nrow(df), floor(0.1 * nrow(df))), ]

# "complete" is hclust()'s default linkage and is what the recorded cluster
# sizes further down were produced with; it is spelled out for clarity.
# NOTE(review): the removed full-data fit used "ward.D2" -- confirm which
# linkage is actually intended for the sample.
res.hc <- hclust(dist(sampled_data), method = "complete")

fviz_dend(res.hc, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Redraw the dendrogram cut into four clusters, one colour per cluster.
fviz_dend(
  res.hc,
  k = 4,
  cex = 0.5,
  k_colors = c("blue", "green3", "red", "orange")
)

# Assign each sampled row to one of the four clusters and tabulate the
# cluster sizes.
nc <- cutree(tree = res.hc, k = 4)
table(nc)
## nc
##   1   2   3   4 
## 193 200  39   1
# Install fastcluster only when it is missing. The unconditional
# install.packages() call that used to be here failed with
# "cannot remove prior installation of package 'fastcluster'" /
# "Permission denied" on the original Windows machine.
if (!requireNamespace("fastcluster", quietly = TRUE)) {
  install.packages("fastcluster")
}

# Attaching fastcluster masks stats::hclust (see the attach message), so
# every hclust() call after this point uses fastcluster's implementation.
library(fastcluster)
## Warning: package 'fastcluster' was built under R version 4.2.3
## 
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
# Draw 100 random rows of df for a side-by-side linkage comparison.
df_subset <- df[sample(nrow(df), size = 100), ]

# One distance matrix, clustered twice: average linkage and ward.D.
res.dist_subset <- dist(x = df_subset)
hc_wardD_subset <- hclust(d = res.dist_subset, method = "ward.D")
hc_average_subset <- hclust(d = res.dist_subset, method = "average")

# Convert both trees to dendrograms and bundle them for comparison.
d2_subset <- as.dendrogram(hc_wardD_subset)
d1_subset <- as.dendrogram(hc_average_subset)
dend_list_subset <- dendlist(d1_subset, d2_subset)

# Visual comparison: the two trees drawn facing each other.
tanglegram(d1_subset, d2_subset)

# Numeric comparison: cophenetic correlation between the two trees.
cor.dendlist(dend_list_subset, method = "cophenetic")
##           [,1]      [,2]
## [1,] 1.0000000 0.7348767
## [2,] 0.7348767 1.0000000
# Compare seven linkage methods on the same reproducible 100-row sample.
set.seed(123)
df_subset <- df[sample(nrow(df), 100), ]

# Compute the distance matrix once; every linkage below reuses it instead of
# recomputing dist(df_subset) for each method.
subset_dist <- dist(df_subset)

# Cluster the shared distance matrix with the given linkage and return the
# resulting tree as a dendrogram.
linkage_dendrogram <- function(method) {
  as.dendrogram(hclust(subset_dist, method))
}

# NOTE(review): the hclust() documentation suggests "centroid" and "median"
# are meant for squared Euclidean distances -- confirm that plain distances
# are intended for those two methods.
dend1 <- linkage_dendrogram("complete")
dend2 <- linkage_dendrogram("average")
dend3 <- linkage_dendrogram("ward.D")
dend4 <- linkage_dendrogram("median")
dend5 <- linkage_dendrogram("single")
dend6 <- linkage_dendrogram("centroid")
dend7 <- linkage_dendrogram("ward.D2")

dend_list <- dendlist(
  "WARD.D2" = dend7, "CENTROID" = dend6,
  "SINGLE" = dend5, "MEDIAN" = dend4,
  "WARD.D" = dend3, "AVERAGE" = dend2,
  "COMPLETE" = dend1
)

# Pairwise cophenetic correlation between the trees (cor.dendlist's default
# method, named explicitly), rounded to two decimals for display.
corre <- round(cor.dendlist(dend_list, method = "cophenetic"), 2)
rownames(corre) <- toupper(rownames(corre))
colnames(corre) <- toupper(colnames(corre))
corre
##          WARD.D2 CENTROID SINGLE MEDIAN WARD.D AVERAGE COMPLETE
## WARD.D2     1.00     0.26   0.25   0.27   0.68    0.40     0.58
## CENTROID    0.26     1.00   0.95   0.67   0.15    0.89     0.55
## SINGLE      0.25     0.95   1.00   0.59   0.13    0.87     0.54
## MEDIAN      0.27     0.67   0.59   1.00   0.24    0.65     0.41
## WARD.D      0.68     0.15   0.13   0.24   1.00    0.29     0.59
## AVERAGE     0.40     0.89   0.87   0.65   0.29    1.00     0.60
## COMPLETE    0.58     0.55   0.54   0.41   0.59    0.60     1.00