# HW 2

library(readxl)

## Warning: package 'readxl' was built under R version 3.4.4

# Load the cluster-analysis worksheet into a tibble.
# NOTE(review): the path points at one user's Downloads folder; change it (or
# use a project-relative path) before running this on another machine.
ClusterAnalysisData <- read_excel("~/Downloads/ClusterAnalysisData.xlsx")

# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session and can fail on a headless Rscript / knit run, so it is
# skipped there.
if (interactive()) {
  View(ClusterAnalysisData)
}

# Print the first rows as a quick sanity check of the import.
head(ClusterAnalysisData)

## # A tibble: 6 x 3
##    math  lang manual
##   <dbl> <dbl>  <dbl>
## 1   143   123    123
## 2   133   116    104
## 3   124   110    110
## 4   145   119    112
## 5   120   110    123
## 6   172   104    128

# Structure of the imported table: classes, dimensions and column types.
str(object = ClusterAnalysisData)

## Classes 'tbl_df', 'tbl' and 'data.frame':    52 obs. of  3 variables:
##  $ math  : num  143 133 124 145 120 172 147 141 145 124 ...
##  $ lang  : num  123 116 110 119 110 104 107 117 103 95 ...
##  $ manual: num  123 104 110 112 123 128 108 109 106 102 ...

# Per-column quartiles, extremes and mean of the raw (unscaled) scores.
summary(object = ClusterAnalysisData)

##       math            lang           manual     
##  Min.   : 93.0   Min.   : 95.0   Min.   : 87.0  
##  1st Qu.:119.0   1st Qu.:126.0   1st Qu.:104.8  
##  Median :132.0   Median :148.0   Median :111.5  
##  Mean   :135.2   Mean   :143.8   Mean   :112.2  
##  3rd Qu.:145.5   3rd Qu.:159.5   3rd Qu.:120.2  
##  Max.   :200.0   Max.   :193.0   Max.   :142.0

# Standardise every column (centre on its mean, divide by its standard
# deviation) so the three scores are on a comparable scale for clustering.
# scale() returns a numeric matrix, so ClusterAnalysisData is a matrix rather
# than a tibble from this point on.
ClusterAnalysisData <- scale(ClusterAnalysisData, center = TRUE, scale = TRUE)
head(ClusterAnalysisData, n = 6L)

##            math      lang      manual
## [1,]  0.3072918 -0.915562  0.88765139
## [2,] -0.0862839 -1.224426 -0.67007432
## [3,] -0.4405020 -1.489167 -0.17816094
## [4,]  0.3860069 -1.092056 -0.01418981
## [5,] -0.5979323 -1.489167  0.88765139
## [6,]  1.4486614 -1.753908  1.29757921

library(factoextra)

## Loading required package: ggplot2

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

# Drop any rows that still contain missing values, then fix the RNG seed so the
# randomly initialised clustering steps that follow are reproducible.
ClusterAnalysisData <- na.omit(ClusterAnalysisData)
set.seed(1)
head(ClusterAnalysisData, n = 6L)

##            math      lang      manual
## [1,]  0.3072918 -0.915562  0.88765139
## [2,] -0.0862839 -1.224426 -0.67007432
## [3,] -0.4405020 -1.489167 -0.17816094
## [4,]  0.3860069 -1.092056 -0.01418981
## [5,] -0.5979323 -1.489167  0.88765139
## [6,]  1.4486614 -1.753908  1.29757921

# Elbow plot: total within-cluster sum of squares ("wss") against the number of
# clusters, used to pick a sensible k.
fviz_nbclust(ClusterAnalysisData, kmeans, method = "wss")

# Fit k-means with three clusters on the standardised scores.
# NOTE(review): kmeans() uses a single random start by default (nstart = 1);
# a larger nstart usually gives a more stable solution, but it would change
# the recorded output, so it is left as-is here.
kmeans_ClusterAnalysisdata <- kmeans(ClusterAnalysisData, centers = 3)

# Opening the help page is an interactive convenience only; skip it when the
# script runs non-interactively so it does not dump a pager into the output.
if (interactive()) {
  print(help("kmeans"))
}

# Print cluster sizes, centres, assignments and the sum-of-squares breakdown.
kmeans_ClusterAnalysisdata

## K-means clustering with 3 clusters of sizes 15, 7, 30
## 
## Cluster means:
##         math       lang      manual
## 1  0.2679342 -1.3362058 -0.03605263
## 2  1.8759721  1.1582411  1.09847141
## 3 -0.5716939  0.3978466 -0.23828368
## 
## Clustering vector:
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 1 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 14.827480  6.717068 44.848552
##  (between_SS / total_SS =  56.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

# Base-graphics scatter plot coloured by cluster id. Given a matrix, plot()
# draws only its first two columns against each other.
plot(ClusterAnalysisData, col = kmeans_ClusterAnalysisdata[["cluster"]])

# factoextra's cluster plot of the same fit, using the standardised data.
fviz_cluster(object = kmeans_ClusterAnalysisdata, data = ClusterAnalysisData)

# Draw 40 zero-mean normal values with standard deviation 10.34; the mean of
# 17.2 is added in a later step.
# The multiplier used to be 10.39, which disagreed with the sd = 10.34 used by
# every dnorm()/pnorm()/rnorm() call for this distribution elsewhere in the
# script. Any printed values recorded after this call were produced with the
# old multiplier and should be regenerated.
z <- rnorm(40, sd = 10.34)
z

##  [1]  -5.8233420  -2.3915441  16.1949794   0.7325822   1.3432996
##  [6]  17.8195252   4.7889194 -13.1439862  -7.1364011  -4.6304279
## [11]  12.7182099   3.7384657   4.1640154   1.1499934  -5.7751894
## [16]  18.5660275   5.1726665 -20.4331523   7.2870878  -4.9123027
## [21] -11.0946883  -2.2647594 -10.6601862  -7.5731799  -6.4941580
## [26] -17.5247435   8.7046074   1.5935467 -11.8252428  13.0271370
## [31]   4.4309633  -3.0657927   9.3003556   9.1238069   8.5362274
## [36]   7.1549722   5.7552044  -0.6432627  -3.1789521  -3.9530937

# Shift the simulated values so they are centred on 17.2.
z <- z + 17.2

# Evaluate the N(17.2, 10.34) density at each simulated point and plot it.
z1 <- dnorm(x = z, mean = 17.2, sd = 10.34)
plot(x = z, y = z1)

# Upper-tail probability P(X > 30) for X ~ N(17.2, 10.34).
pnorm(q = 30, mean = 17.2, sd = 10.34, lower.tail = FALSE)

## [1] 0.1078745

# Simulate 50 draws from N(17.2, 10.34) by rescaling and shifting standard
# normal draws.
h <- 17.2 + 10.34 * rnorm(50)

# Density-scaled histogram of the sample.
hist(h, xlim = c(-10, 40), probability = TRUE)

# Same sample as raw counts.
hist(h, xlim = c(-10, 40))

# Density histogram again, this time with the theoretical normal density drawn
# on top; the x values are sorted so the line runs left to right.
hist(h, xlim = c(-10, 40), probability = TRUE)
lines(sort(h), dnorm(sort(h), mean = 17.2, sd = 10.34), col = "blue")

# Replace h with 40 draws from a gamma distribution (shape 2.73, rate 0.15).
h <- rgamma(n = 40, shape = 2.73, rate = 0.15)