# Load the "Infertility after Spontaneous and Induced Abortion" dataset
data <- datasets::infert
# The dataset has 248 observations of 8 variables: 7 numeric and 1 factor.
# Variables:
#   education      - factor with 3 levels: 0-5yrs, 6-11yrs, 12+ yrs
#   age            - age of the case in years
#   parity         - number of times she has given birth to a fetus (count)
#   induced        - number of induced abortions: 0 = 0, 1 = 1, 2 = 2 or more
#   case           - case status: 1 = case, 0 = control
#   spontaneous    - number of prior spontaneous abortions: 0 = 0, 1 = 1, 2 = 2 or more
#   stratum        - matched set number
#   pooled.stratum - stratum number
# Inspect the structure of the dataset
utils::str(object = data)
## 'data.frame':    248 obs. of  8 variables:
##  $ education     : Factor w/ 3 levels "0-5yrs","6-11yrs",..: 1 1 1 1 2 2 2 2 2 2 ...
##  $ age           : num  26 42 39 34 35 36 23 32 21 28 ...
##  $ parity        : num  6 1 6 4 3 4 1 2 1 2 ...
##  $ induced       : num  1 1 2 2 1 2 0 0 0 0 ...
##  $ case          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ spontaneous   : num  2 0 0 0 1 1 0 0 1 0 ...
##  $ stratum       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pooled.stratum: num  3 1 4 2 32 36 6 22 5 19 ...
# Work on a copy without the factor column (education) so that only numeric
# columns remain; the original `data` keeps the education column for later use.
data2 <- data
data2$education <- NULL

# Scatter plots of each count variable against age to look for patterns.
# Every plot reads both variables from `data2` through the formula interface,
# so the axes are labelled with the plain column names.
plot(parity ~ age, data = data2)

plot(induced ~ age, data = data2)

plot(spontaneous ~ age, data = data2)

# Split the observations into the case group and the two control groups.
# Grouping uses the `case` flag and the matched-set id (`stratum`) rather than
# fixed row ranges: three ranges of 82 rows cover only 246 of the 248
# observations and assume that all three groups have exactly the same size.
is_case <- data2$case == 1
controls <- data2[!is_case, ]
# Within the controls, the first row seen for a stratum goes to the first
# control group; any further row of the same stratum goes to the second one.
first_control <- !duplicated(controls$stratum)

replicate1 <- data2[is_case, ]
replicate2 <- controls[first_control, ]
replicate3 <- controls[!first_control, ]

# Total induced and spontaneous abortion counts per group
# (row 1 = induced, row 2 = spontaneous).
Case <- c(sum(replicate1$induced), sum(replicate1$spontaneous))
Control1 <- c(sum(replicate2$induced), sum(replicate2$spontaneous))
Control2 <- c(sum(replicate3$induced), sum(replicate3$spontaneous))
repl <- data.frame(Case, Control1, Control2)

# Stacked bar chart: one bar per group, stacked by abortion type.
bar_cols <- cm.colors(2)
barplot(
  as.matrix(repl),
  main = "Replicated Abortion Cases",
  ylab = "Induced + Spontaneous",
  space = 0.3,
  cex.axis = 0.8,
  col = bar_cols
)
legend("topright", c("Induced", "Spontaneous"), fill = bar_cols)

#1 Age vs parity - women older than 25 and younger than 40 have the highest parity.
#2 Age vs induced abortions - women older than 25 and younger than 40 account for the
# 2-or-more induced cases; 0 induced abortions occurs in every age group.
#3 Age vs spontaneous abortions - women above 35 have fewer spontaneous abortions,
# women below 35 have more spontaneous abortions.
#4 Induced vs spontaneous abortions - women with secondary infertility (Case)
# have many more spontaneous abortions than those who don't have secondary infertility (Control1 and
# Control2), while induced abortions are roughly constant across the groups.

# k-means clustering with 3 centers.
# kmeans() chooses its starting centers at random, so the seed is fixed first
# to make the result reproducible from run to run.
set.seed(123)
# NOTE(review): data2 is clustered unscaled and still contains stratum and
# pooled.stratum; judging by the centers below these two columns appear to
# dominate the distances - confirm whether they should be dropped or scaled.
results <- kmeans(data2, centers = 3, nstart = 10)
# Number of observations in each of the 3 clusters.
# Cluster numbers are arbitrary labels; the output pasted below comes from an
# earlier run, so the numbering may differ from what this seed produces.
results$size
## [1]  75 116  57
# Cluster centers: the mean of every column within each cluster
results$centers
##        age   parity   induced      case spontaneous  stratum
## 1 31.64000 2.480000 0.6533333 0.3333333   0.5600000 13.00000
## 2 29.72414 1.862069 0.6120690 0.3362069   0.6293103 63.91379
## 3 34.94737 2.052632 0.3859649 0.3333333   0.4912281 35.00000
##   pooled.stratum
## 1       17.64000
## 2       49.37069
## 3       22.42105
#cluster1: mean age about 32, mean parity about 2.5, mean induced 0.65 and mean spontaneous 0.56
#cluster2: mean age about 30, mean parity about 1.9, mean induced 0.61 and mean spontaneous 0.63
#cluster3: mean age about 35, mean parity about 2.1, mean induced 0.39 and mean spontaneous 0.49
#(induced and spontaneous are 0/1/2 count codes, so these centers are mean counts, not percentages)

# Cross-tabulate the categorical education column (rows) against the
# k-means cluster assignment (columns)
table(data[["education"]], results[["cluster"]])
##          
##             1   2   3
##   0-5yrs   12   0   0
##   6-11yrs  63   0  57
##   12+ yrs   0 116   0
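# 0-5yrs of education: all 12 observations fall in one cluster, no errors
# 6-11yrs of education got split across two clusters: 63 in one cluster and 57 (counted as errors) in another
# 12+ yrs of education: all 116 observations fall in one cluster, no errors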

# Check which observation falls into which cluster: bind the cluster
# assignment onto the data as an extra column. (cbind() on the cluster vector
# alone only yields a one-column matrix without the observations next to it.)
c1 <- cbind(data2, cluster = results$cluster)

# 2D representation of the clusters, drawn with clusplot() from the cluster
# package.
library(cluster)
clusplot(
  data2,
  results$cluster,
  main = "2D representation of Cluster",
  shade = TRUE,
  labels = 2,
  lines = 0
)