library(ggplot2)
library(dplyr)

## 
## 载入程辑包：'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(olsrr)

## 
## 载入程辑包：'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

library(pscl)

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

library(InformationValue)
library(hablar)

## 
## 载入程辑包：'hablar'

## The following object is masked from 'package:dplyr':
## 
##     na_if

library(openxlsx)
library(psych)

## 
## 载入程辑包：'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(caret)

## 载入需要的程辑包：lattice

## 
## 载入程辑包：'caret'

## The following objects are masked from 'package:InformationValue':
## 
##     confusionMatrix, precision, sensitivity, specificity

library(forestplot)

## 载入需要的程辑包：grid

## 载入需要的程辑包：magrittr

## 载入需要的程辑包：checkmate

library(meta)

## Loading 'meta' package (version 5.5-0).
## Type 'help(meta)' for a brief overview.
## Readers of 'Meta-Analysis with R (Use R!)' should install
## older version of 'meta' package: https://tinyurl.com/dt4y5drs

问题一：玻璃文物的表面风化与其他参数的相关性及差异性分析

这里的参数均为离散型变量，因此我们用卡方检验来分析相关性
1.玻璃类型与是否风化
H0：玻璃类型不影响文物的风化
H1：玻璃类型影响文物的风化

comp <- c(6,12,28,12)
dim <- c(2,2)
chisq.test(comp,correct = FALSE)

## 
##  Chi-squared test for given probabilities
## 
## data:  comp
## X-squared = 18.414, df = 3, p-value = 0.0003613

得到的p value <- 0.05,因此有理由拒绝H0,接受H1，即不同的玻璃类型对玻璃文物表面风化影响不同。
2.文物纹饰是否影响风化
H0：文物纹饰不影响风化
H1：文物纹饰影响风化

wenshi <- c(11,11,6,0,17,13)
dim1 <- c(2,3)
chisq.test(wenshi,correct = FALSE)

## 
##  Chi-squared test for given probabilities
## 
## data:  wenshi
## X-squared = 18.138, df = 5, p-value = 0.002778

得到的p value <- 0.05,因此有理由拒绝H0,接受H1，即不同的纹饰类型对玻璃文物表面是否风化也有影响。

3.文物颜色是否影响风化
H0：文物颜色不影响风化
H1：文物颜色影响风化

color <- c(2,2,4,3,0,2,1,2,12,8,0,1,9,6,2,0,4,0)
dim2 <- c(2,9)
chisq.test(color,correct = FALSE)

## Warning in chisq.test(color, correct = FALSE): Chi-squared近似算法有可能不准

## 
##  Chi-squared test for given probabilities
## 
## data:  color
## X-squared = 62.414, df = 17, p-value = 4.177e-07

得到的p value <- 0.05,因此有理由拒绝H0,接受H1，即不同的文物颜色对玻璃文物表面是否风化也有影响。
这里提到卡方检验可能不准，我们可以使用 Spearman test（非参检验）来验证是否风化与表面颜色的相关性

group1 <- c(2,4,0,1,12,0,9,2,4)
group2 <- c(2,3,2,2,8,1,6,0,0)
cor.test(group1,group2,method = "spearman",alternative = "two.sided",conf.level = 0.94)

## 
##  Spearman's rank correlation rho
## 
## data:  group1 and group2
## S = 54.825, p-value = 0.1307
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5431236

P value > 0.05,因此我们认为文物颜色与文物是否风化没有关联，拒绝H0.

文物样品表面有无风化化学成分含量的统计规律

component <- read.csv("D://modeling//component.csv",header = F,encoding = "UTF-8")
component$V1[1]=69.33
component<- component %>%
  as_tibble() %>%
  convert(num(V1),num(V16))

for (i in 1:67){
  if(component$V16[i]==0){
    component$label[i] = "未风化"
  }
  if(component$V16[i]==1){
    component$label[i] = "风化"
  }
}

## Warning: Unknown or uninitialised column: `label`.

对各个化学组成是否导致风化进行显著性检验
由于数据量大于50，使用t.test具有鲁棒性，因此使用t.test检验该化学成分含量对于文物是否风化有影响

# 二氧化硅
component %>%
 ggplot(aes(x=label, y=V1, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="二氧化硅对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

# 独立T检验
t.test(component$V1,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V1 and component$V16
## t = 16.338, df = 132, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  42.66698 54.42227
## sample estimates:
##  mean of x  mean of y 
## 49.0222388  0.4776119

# 氧化钠对风化的影响
component %>%
 ggplot(aes(x=label, y=V2, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化钠对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V2,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V2 and component$V16
## t = 1.4603, df = 132, p-value = 0.1466
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1093279  0.7260443
## sample estimates:
## mean of x mean of y 
## 0.7859701 0.4776119

# 氧化钾对风化的影响
component %>%
 ggplot(aes(x=label, y=V3, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化钾对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V3,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V3 and component$V16
## t = 2.8648, df = 132, p-value = 0.004856
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4237717 2.3144372
## sample estimates:
## mean of x mean of y 
## 1.8467164 0.4776119

# 氧化钙对风化的影响
component %>%
 ggplot(aes(x=label, y=V4, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化钙对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V4,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V4 and component$V16
## t = 7.0703, df = 132, p-value = 8.092e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.479794 2.629460
## sample estimates:
## mean of x mean of y 
## 2.5322388 0.4776119

# 氧化镁对风化的影响
component %>%
 ggplot(aes(x=label, y=V5, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化镁对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V5,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V5 and component$V16
## t = 2.0441, df = 132, p-value = 0.04294
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.006626473 0.404119795
## sample estimates:
## mean of x mean of y 
## 0.6829851 0.4776119

# 氧化铝对风化的影响
component %>%
 ggplot(aes(x=label, y=V6, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化铝对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V6,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V6 and component$V16
## t = 9.3833, df = 132, p-value = 2.437e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.811989 4.314280
## sample estimates:
## mean of x mean of y 
## 4.0407463 0.4776119

# 氧化铁对风化的影响
component %>%
 ggplot(aes(x=label, y=V7, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化铁对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V7,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V7 and component$V16
## t = 2.3745, df = 132, p-value = 0.01901
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.06206131 0.68152077
## sample estimates:
## mean of x mean of y 
## 0.8494030 0.4776119

# 氧化铜对风化的影响
component %>%
 ggplot(aes(x=label, y=V8, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化铜对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V8,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V8 and component$V16
## t = 5.258, df = 132, p-value = 5.711e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.9207875 2.0314513
## sample estimates:
## mean of x mean of y 
## 1.9537313 0.4776119

# 氧化铅对风化的影响
component %>%
 ggplot(aes(x=label, y=V9, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化铅对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V9,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V9 and component$V16
## t = 10.058, df = 132, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  19.26872 28.70292
## sample estimates:
##  mean of x  mean of y 
## 24.4634328  0.4776119

# 氧化钡对风化的影响
component %>%
 ggplot(aes(x=label, y=V10, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化钡对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V10,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V10 and component$V16
## t = 7.0816, df = 132, p-value = 7.631e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5.262065 9.341219
## sample estimates:
## mean of x mean of y 
## 7.7792537 0.4776119

# 五氧化二磷对风化的影响
component %>%
 ggplot(aes(x=label, y=V11, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="五氧化二磷对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V11,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V11 and component$V16
## t = 5.0462, df = 132, p-value = 1.462e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.341690 3.071742
## sample estimates:
## mean of x mean of y 
## 2.6843284 0.4776119

# 氧化锶对风化的影响
component %>%
 ggplot(aes(x=label, y=V12, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化锶对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V12,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V12 and component$V16
## t = -3.098, df = 132, p-value = 0.002381
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.35338135 -0.07796193
## sample estimates:
## mean of x mean of y 
## 0.2619403 0.4776119

# 氧化锡对风化的影响
component %>%
 ggplot(aes(x=label, y=V13, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化锡对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V13,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V13 and component$V16
## t = -5.4023, df = 132, p-value = 2.97e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5462588 -0.2534427
## sample estimates:
##  mean of x  mean of y 
## 0.07776119 0.47761194

# 二氧化硫对风化的影响
component %>%
 ggplot(aes(x=label, y=V14, group=label,fill=label)) +
  geom_boxplot() +
  labs(title="氧化镁对风化的影响", 
       x="是否风化", y="化学成分占比") +
  theme_classic()

t.test(component$V14,component$V16,alternative="two.sided", var.equal=T)

## 
##  Two Sample t-test
## 
## data:  component$V14 and component$V16
## t = 0.37395, df = 132, p-value = 0.709
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5378215  0.7885678
## sample estimates:
## mean of x mean of y 
## 0.6029851 0.4776119

# 分析化学组分间的相关性
ggpairs(component,columns=1:14, columnLabels =c("SiO2","Na2O","K2O","CaO","MgO","Al2O3","Fe2O3","CuO",
                                          "PbO","BaO","P2O5","SrO","SnO2","SO2"),
        cardinality_threshold=NULL)

# 另一种表达
comp<-component
comp$label<-NULL
comp$V17 <- NULL
comp$V16 <- NULL
comp$V15 <- NULL
pairs.panels(comp)

# Logistic regression
colnames(component) <- c("SiO2","Na2O","K2O","CaO","MgO","Al2O3","Fe2O3","CuO","PbO","BaO","P2O5","SrO","SnO2","SO2","sum","weathering","type","label") 

logistic <- glm(weathering~SiO2+Na2O+K2O+CaO+MgO+Al2O3+Fe2O3+PbO+BaO+CuO+P2O5+SrO+SnO2+SO2,data=component,family=binomial)
summary(logistic)

## 
## Call:
## glm(formula = weathering ~ SiO2 + Na2O + K2O + CaO + MgO + Al2O3 + 
##     Fe2O3 + PbO + BaO + CuO + P2O5 + SrO + SnO2 + SO2, family = binomial, 
##     data = component)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.45188  -0.20057  -0.00058   0.47824   1.65313  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  7.00602   32.43615   0.216    0.829
## SiO2        -0.06676    0.32866  -0.203    0.839
## Na2O        -0.63691    0.61520  -1.035    0.301
## K2O         -1.15148    0.88057  -1.308    0.191
## CaO          0.13048    0.75992   0.172    0.864
## MgO         -0.32798    1.07553  -0.305    0.760
## Al2O3       -0.30761    0.41601  -0.739    0.460
## Fe2O3       -1.05304    0.76141  -1.383    0.167
## PbO          0.02671    0.32040   0.083    0.934
## BaO         -0.34802    0.48224  -0.722    0.470
## CuO          0.59770    0.38306   1.560    0.119
## P2O5         0.23230    0.54152   0.429    0.668
## SrO         -2.34772    2.68614  -0.874    0.382
## SnO2         3.81792    2.57040   1.485    0.137
## SO2          0.31056    0.27791   1.118    0.264
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92.747  on 66  degrees of freedom
## Residual deviance: 40.447  on 52  degrees of freedom
## AIC: 70.447
## 
## Number of Fisher Scoring iterations: 8

pR2(logistic)["McFadden"]

## fitting null model for pseudo-r2

##  McFadden 
## 0.5638999

# Find optimal cutoff probability to use to maximize accuracy
predicted_value <- predict(logistic, component,type = "response")
optimalCutoff(component$weathering, predicted_value)[1]

## [1] 0.3397779

没有某一个变量非常显著，将二氧化硅剔除试一试

# Logistic regression
logistic2 <- glm(weathering~Na2O+K2O+CaO+MgO+Al2O3+Fe2O3+PbO+BaO+CuO+P2O5+SrO+SnO2+SO2,data=component,family=binomial)
summary(logistic2)

## 
## Call:
## glm(formula = weathering ~ Na2O + K2O + CaO + MgO + Al2O3 + Fe2O3 + 
##     PbO + BaO + CuO + P2O5 + SrO + SnO2 + SO2, family = binomial, 
##     data = component)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.51338  -0.19804  -0.00079   0.48601   1.61306  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.42373    1.29412   0.327   0.7433  
## Na2O        -0.55791    0.47458  -1.176   0.2398  
## K2O         -1.05287    0.71650  -1.469   0.1417  
## CaO          0.23381    0.57351   0.408   0.6835  
## MgO         -0.29967    1.06159  -0.282   0.7777  
## Al2O3       -0.24814    0.29098  -0.853   0.3938  
## Fe2O3       -1.02162    0.75360  -1.356   0.1752  
## PbO          0.09120    0.04819   1.892   0.0584 .
## BaO         -0.25465    0.13690  -1.860   0.0629 .
## CuO          0.60412    0.38237   1.580   0.1141  
## P2O5         0.33176    0.23898   1.388   0.1651  
## SrO         -2.51239    2.57293  -0.976   0.3288  
## SnO2         3.80291    2.53785   1.498   0.1340  
## SO2          0.32011    0.27194   1.177   0.2392  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92.747  on 66  degrees of freedom
## Residual deviance: 40.489  on 53  degrees of freedom
## AIC: 68.489
## 
## Number of Fisher Scoring iterations: 8

氧化铅与氧化钡非常显著

# Logistic regression,剔除氧化硅，氧化铅，氧化钡，氧化钾
logistic1 <- glm(weathering~Na2O+K2O+CaO+MgO+Al2O3+Fe2O3+CuO+P2O5+SrO+SnO2+SO2,data=component,family=binomial)
summary(logistic1)

## 
## Call:
## glm(formula = weathering ~ Na2O + K2O + CaO + MgO + Al2O3 + Fe2O3 + 
##     CuO + P2O5 + SrO + SnO2 + SO2, family = binomial, data = component)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.69577  -0.41787  -0.00704   0.55230   1.92026  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  1.31236    0.90857   1.444   0.1486  
## Na2O        -0.55042    0.38705  -1.422   0.1550  
## K2O         -0.79019    0.46497  -1.699   0.0892 .
## CaO          0.58077    0.41669   1.394   0.1634  
## MgO         -0.57619    0.93939  -0.613   0.5396  
## Al2O3       -0.29290    0.22382  -1.309   0.1907  
## Fe2O3       -1.21317    0.69694  -1.741   0.0817 .
## CuO         -0.07010    0.18187  -0.385   0.6999  
## P2O5         0.34956    0.18154   1.925   0.0542 .
## SrO         -0.67430    1.72682  -0.390   0.6962  
## SnO2         2.41072    1.72872   1.395   0.1632  
## SO2         -0.09002    0.21527  -0.418   0.6758  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92.747  on 66  degrees of freedom
## Residual deviance: 46.832  on 55  degrees of freedom
## AIC: 70.832
## 
## Number of Fisher Scoring iterations: 8

AIC值越小说明预测效果越好，因此logistic2的拟合效果最好

用meta森林图预测风化前的化学成分

# 构建数据集
component$new_col <- 1-component$weathering

metal <- metabin(component$new_col,c(rep(2,67)),component$weathering,c(rep(2,67)),data=component,method = "MH",sm="OR")
forest(metal,family="sans",fontsize = 9.5,lab.e="未风化",lab.c="风化组")

问题二：聚类模型搭建

# 因子分析
# 铅钡数据的因子分析
data <- read.xlsx("D:\\modeling\\铅钡组.xlsx",rows=1:50,cols = 1:15)

# 数据标准化
df <- scale(data)
# 因子分析
data_cor <- cor(df)

# 生成碎石图
fa.parallel(data_cor)

## Parallel analysis suggests that the number of factors =  3  and the number of components =  3

# 由碎石图可知保留3个主成分即可

# 因子分析
fa_model1 <- fa(data_cor,nfactors = 3,rotate = "varimax",fm="ml")
fa_model1

## Factor Analysis using method =  ml
## Call: fa(r = data_cor, nfactors = 3, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##              ML1   ML2   ML3    h2     u2 com
## SiO2       -0.91 -0.36  0.16 0.995 0.0048 1.4
## Na2O       -0.39 -0.02  0.04 0.155 0.8451 1.0
## K2O        -0.07  0.01  0.18 0.037 0.9627 1.3
## CaO         0.62 -0.03  0.46 0.599 0.4011 1.8
## MgO         0.13 -0.31  0.59 0.464 0.5357 1.6
## Al2O3      -0.27 -0.16  0.63 0.493 0.5074 1.5
## Fe2O3       0.06 -0.20  0.39 0.197 0.8025 1.5
## CuO         0.08  0.70 -0.16 0.523 0.4769 1.1
## PbO         0.86 -0.31 -0.41 0.995 0.0049 1.7
## BaO         0.05  0.96 -0.28 0.995 0.0049 1.2
## P2O5        0.66  0.09  0.49 0.689 0.3106 1.9
## SrO         0.48  0.11 -0.13 0.261 0.7386 1.2
## SnO2       -0.04 -0.01  0.24 0.060 0.9401 1.1
## SO2         0.16  0.65  0.00 0.453 0.5468 1.1
## weathering  0.82  0.11 -0.11 0.695 0.3054 1.1
## 
##                        ML1  ML2  ML3
## SS loadings           3.59 2.26 1.77
## Proportion Var        0.24 0.15 0.12
## Cumulative Var        0.24 0.39 0.51
## Proportion Explained  0.47 0.30 0.23
## Cumulative Proportion 0.47 0.77 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 3 factors are sufficient.
## 
## The degrees of freedom for the null model are  105  and the objective function was  11.39
## The degrees of freedom for the model are 63  and the objective function was  3 
## 
## The root mean square of the residuals (RMSR) is  0.08 
## The df corrected root mean square of the residuals is  0.11 
## 
## Fit based upon off diagonal values = 0.92
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML3
## Correlation of (regression) scores with factors   1.00 1.00 0.98
## Multiple R square of scores with factors          0.99 0.99 0.95
## Minimum correlation of possible factor scores     0.99 0.99 0.91

fa函数中，第一个参数是数据，第二个参数说明要保留两个主成分，第三个参数为旋转方法，为none，即不进行主成分旋转，varimax意味着进行正交旋转；第四个参数表示提取公因子的方法为最大似然法。

# 对旋转结果进行可视化,这个在论文里不用写
factor.plot(fa_model1)

fa.diagram(fa_model1,simple = FALSE)

# 这个图跟SPSS分析的因子载荷矩阵热力图类似但更好理解！

# 高钾数据的因子分析，操作同上
data1 <- read.xlsx("D:\\modeling\\高钾组.xlsx",rows=1:50,cols = 1:15)
# 数据标准化
df1 <- scale(data1)
data_cor1 <- cor(data1)

# 生成碎石图
fa.parallel(data_cor1)

## Warning in fa.parallel(data_cor1): It seems as if you are using a correlation
## matrix, but have not specified the number of cases. The number of subjects is
## arbitrarily set to be 100

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  5  and the number of components =  4

# 由碎石图可知保留3个主成分即可

# 因子分析
fa_model2 <- fa(data_cor,nfactors = 3,rotate = "varimax",fm="ml")
fa_model2

## Factor Analysis using method =  ml
## Call: fa(r = data_cor, nfactors = 3, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##              ML1   ML2   ML3    h2     u2 com
## SiO2       -0.91 -0.36  0.16 0.995 0.0048 1.4
## Na2O       -0.39 -0.02  0.04 0.155 0.8451 1.0
## K2O        -0.07  0.01  0.18 0.037 0.9627 1.3
## CaO         0.62 -0.03  0.46 0.599 0.4011 1.8
## MgO         0.13 -0.31  0.59 0.464 0.5357 1.6
## Al2O3      -0.27 -0.16  0.63 0.493 0.5074 1.5
## Fe2O3       0.06 -0.20  0.39 0.197 0.8025 1.5
## CuO         0.08  0.70 -0.16 0.523 0.4769 1.1
## PbO         0.86 -0.31 -0.41 0.995 0.0049 1.7
## BaO         0.05  0.96 -0.28 0.995 0.0049 1.2
## P2O5        0.66  0.09  0.49 0.689 0.3106 1.9
## SrO         0.48  0.11 -0.13 0.261 0.7386 1.2
## SnO2       -0.04 -0.01  0.24 0.060 0.9401 1.1
## SO2         0.16  0.65  0.00 0.453 0.5468 1.1
## weathering  0.82  0.11 -0.11 0.695 0.3054 1.1
## 
##                        ML1  ML2  ML3
## SS loadings           3.59 2.26 1.77
## Proportion Var        0.24 0.15 0.12
## Cumulative Var        0.24 0.39 0.51
## Proportion Explained  0.47 0.30 0.23
## Cumulative Proportion 0.47 0.77 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 3 factors are sufficient.
## 
## The degrees of freedom for the null model are  105  and the objective function was  11.39
## The degrees of freedom for the model are 63  and the objective function was  3 
## 
## The root mean square of the residuals (RMSR) is  0.08 
## The df corrected root mean square of the residuals is  0.11 
## 
## Fit based upon off diagonal values = 0.92
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML3
## Correlation of (regression) scores with factors   1.00 1.00 0.98
## Multiple R square of scores with factors          0.99 0.99 0.95
## Minimum correlation of possible factor scores     0.99 0.99 0.91

# 对旋转结果进行可视化
factor.plot(fa_model2)

fa.diagram(fa_model2,simple = FALSE)

# 这个图跟SPSS分析的因子载荷矩阵热力图类似但更好理解！

K_means 聚类分析

对铅钡数据进行分类，先尝试分为三类

res <- kmeans(data,centers = 2)
pairs(data,col=res$cluster+1)

ggpairs(data,mapping=aes(colour=as.character(res$cluster)))

好像除了二氧化硅有明显的区别，其他区别都看不出来

library(factoextra)

## Warning: 程辑包'factoextra'是用R版本4.2.1 来建造的

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# 确定最佳聚类数目
fviz_nbclust(df,kmeans,method = "wss")+geom_vline(xintercept=6,linetype=2)

# 另一种确定最佳聚类数目的方法
library(mclust)

## Warning: 程辑包'mclust'是用R版本4.2.1 来建造的

## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.

## 
## 载入程辑包：'mclust'

## The following object is masked from 'package:psych':
## 
##     sim

m_clust <- Mclust(df,G=1:10)
summary(m_clust)

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEI (diagonal, equal shape) model with 6 components: 
## 
##  log-likelihood  n  df       BIC       ICL
##       -629.5898 49 115 -1706.739 -1706.783
## 
## Clustering table:
##  1  2  3  4  5  6 
## 18  2  2  4 14  9

14个指标随聚类数目变化的走势图

plot(m_clust,"BIC")

最后确定聚类数目为6

#设置随机数种子，保证实验的可重复进行
set.seed(123)
#利用k-mean是进行聚类
km_result <- kmeans(df, 6, nstart = 24)
#查看聚类的一些结果
print(km_result)

## K-means clustering with 6 clusters of sizes 19, 8, 5, 1, 4, 12
## 
## Cluster means:
##         SiO2        Na2O         K2O         CaO          MgO       Al2O3
## 1  0.9862609  0.62465478  0.06191179 -0.62400301  0.007126134  0.34801828
## 2 -0.5326350 -0.16659070 -0.45269585 -0.52000251 -0.915424825 -0.72326470
## 3 -1.2246563 -0.49895590 -0.33926725 -0.08075333 -1.024543755 -0.82537515
## 4  0.7751866 -0.05764112  0.53186438  0.47106110  1.419720291  3.31753415
## 5  0.2407690 -0.49895590  1.60263035  0.68365036  0.388050401  0.19772338
## 6 -0.8410715 -0.49895590 -0.23340056  1.10118178  0.778233244 -0.06731514
##        Fe2O3         CuO        PbO         BaO       P2O5        SrO
## 1 -0.2759327 -0.24888539 -0.8744205 -0.26891265 -0.5600830 -0.4955971
## 2 -0.4938787 -0.43447496  1.1347068  0.02995811 -0.4563306 -0.0966186
## 3 -0.6915701  2.16820506 -0.2278124  2.48045834  0.2044766  0.8654238
## 4  0.3944144 -0.76083458 -1.1800907 -0.38173937 -0.5608908 -0.3717589
## 5  2.2896156 -0.68999683  0.1261642 -0.53717617 -0.4201982 -0.0681558
## 6  0.2582271  0.07370155  0.7792360 -0.41684768  1.2926268  0.5422130
##          SnO2        SO2 weathering
## 1 -0.21623363 -0.1933795 -1.0523140
## 2 -0.27306555 -0.2547526  0.9308931
## 3 -0.27306555  2.0086049  0.5342517
## 4  5.87713715 -0.2547526  0.9308931
## 5  0.71284481 -0.2547526 -0.5565122
## 6 -0.08918544 -0.2547526  0.9308931
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  5  3  3  6  6  1  1  3  1  3  3  1  1  5  5  5  1  1  2  1  2  1  2  2  2  6 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 
##  1  1  6  6  1  1  1  1  4  6  1  6  1  6  6  2  1  6  6  1  2  2  6 
## 
## Within cluster sum of squares by cluster:
## [1] 129.81834  19.06685  46.65616   0.00000  42.47087  79.03740
##  (between_SS / total_SS =  56.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

#提取类标签并且与原始数据进行合并
dd <- cbind(data, cluster = km_result$cluster)

# 查看每一类的数目
table(dd$cluster)

## 
##  1  2  3  4  5  6 
## 19  8  5  1  4 12

#进行可视化展示
fviz_cluster(km_result, data = df,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07","#fac0b3","#b892ff"),
             ellipse.type = "euclid",
             star.plot = TRUE, 
             repel = TRUE,
             ggtheme = theme_minimal()
)

## Too few points to calculate an ellipse

我们看到最后有一个数据是一类，不合理，因此将聚类数目改为5

#设置随机数种子，保证实验的可重复进行
set.seed(123)
#利用k-mean是进行聚类
km_result <- kmeans(df, 5, nstart = 24)
#查看聚类的一些结果
print(km_result)

## K-means clustering with 5 clusters of sizes 8, 4, 5, 12, 20
## 
## Cluster means:
##          SiO2       Na2O         K2O         CaO         MgO       Al2O3
## 1 -0.53263504 -0.1665907 -0.45269585 -0.52000251 -0.91542482 -0.72326470
## 2  0.07210428 -0.3886272  1.89300756  0.87023949  0.64596787  1.07343058
## 3 -1.22465632 -0.4989559 -0.33926725 -0.08075333 -1.02454376 -0.82537515
## 4 -0.84107153 -0.4989559 -0.23340056  1.10118178  0.77823324 -0.06731514
## 5  1.00944016  0.5684742  0.02733398 -0.60656763  0.02617235  0.32135263
##         Fe2O3         CuO        PbO         BaO       P2O5        SrO
## 1 -0.49387875 -0.43447496  1.1347068  0.02995811 -0.4563306 -0.0966186
## 2  1.35124063 -0.73452341  0.1121149 -0.42044853 -0.4534528 -0.1155938
## 3 -0.69157011  2.16820506 -0.2278124  2.48045834  0.2044766  0.8654238
## 4  0.25822706  0.07370155  0.7792360 -0.41684768  1.2926268  0.5422130
## 5 -0.05474033 -0.26557753 -0.8868942 -0.29789952 -0.5534724 -0.4799176
##          SnO2        SO2  weathering
## 1 -0.27306555 -0.2547526  0.93089312
## 2  2.25039548 -0.2547526 -0.06071042
## 3 -0.27306555  2.0086049  0.53425171
## 4 -0.08918544 -0.2547526  0.93089312
## 5 -0.21907522 -0.1964482 -1.05231397
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  2  3  3  4  4  5  5  3  5  3  3  5  5  2  2  5  5  5  1  5  1  5  1  1  1  4 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 
##  5  5  4  4  5  5  5  5  2  4  5  4  5  4  4  1  5  4  4  5  1  1  4 
## 
## Within cluster sum of squares by cluster:
## [1]  19.06685  57.38463  46.65616  79.03740 151.39265
##  (between_SS / total_SS =  50.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

#提取类标签并且与原始数据进行合并
dd <- cbind(data, cluster = km_result$cluster)

# 查看每一类的数目
table(dd$cluster)

## 
##  1  2  3  4  5 
##  8  4  5 12 20

#进行可视化展示
fviz_cluster(km_result, data = df,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800","#fac0b3","#b892ff"),
             ellipse.type = "euclid",
             star.plot = TRUE, 
             repel = TRUE,
             ggtheme = theme_minimal()
)

# 导出聚类结果
result_output <- data.frame(data,km_result$cluster)
write.csv(result_output,file = "铅钡组分类结果.csv",row.names = T)

层次聚类

#先求样本之间两两相似性
result <- dist(df, method = "euclidean")
#产生层次结构
result_hc <- hclust(d = result, method = "ward.D2")
#进行结果展示
fviz_dend(result_hc, k = 5, 
          cex = 0.5, 
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800","#fac0b3","#b892ff"),
          color_labels_by_k = TRUE, 
          rect = TRUE          
)

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

对高钾数据进行聚类分析

res1 <- kmeans(data1,centers = 2)
pairs(data1,col=res1$cluster+1)

ggpairs(data1,mapping=aes(colour=as.character(res1$cluster)))

library(factoextra)
# 确定最佳聚类数目
fviz_nbclust(df1,kmeans,method = "wss")+geom_vline(xintercept=6,linetype=2)

# 另一种确定最佳聚类数目的方法
library(mclust)
m_clust1 <- Mclust(df1,G=1:10)
summary(m_clust1)

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EII (spherical, equal volume) model with 10 components: 
## 
##  log-likelihood  n  df       BIC       ICL
##       -33.16312 18 160 -528.7857 -528.7857
## 
## Clustering table:
##  1  2  3  4  5  6  7  8  9 10 
##  3  2  1  1  1  6  1  1  1  1

怎么说呢，给了10个聚类数目，这个效果也太差了吧 14个指标随聚类数目变化的走势图

plot(m_clust1,"BIC")

现在看这个图，不做聚类分析挺好的，但还是试一试吧，最后确定聚类数目为2

#设置随机数种子，保证实验的可重复进行
set.seed(123)
#利用k-mean是进行聚类
km_result1 <- kmeans(df1, 2, nstart = 24)
#查看聚类的一些结果
print(km_result1)

## K-means clustering with 2 clusters of sizes 10, 8
## 
## Cluster means:
##         SiO2       Na2O        K2O        CaO        MgO      Al2O3      Fe2O3
## 1 -0.8097125  0.3404651  0.6282006  0.7112789  0.5015441  0.7073684  0.6014488
## 2  1.0121406 -0.4255814 -0.7852507 -0.8890986 -0.6269302 -0.8842105 -0.7518110
##          CuO        PbO        BaO       P2O5        SrO       SnO2        SO2
## 1  0.4754239  0.3784069  0.3791587  0.3534793  0.3470203 -0.2357023  0.3450032
## 2 -0.5942798 -0.4730087 -0.4739484 -0.4418491 -0.4337754  0.2946278 -0.4312540
##   weathering
## 1 -0.6871843
## 2  0.8589803
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 
##  1  2  1  1  1  1  1  2  2  2  2  1  1  1  2  1  2  2 
## 
## Within cluster sum of squares by cluster:
## [1] 123.02647  37.67787
##  (between_SS / total_SS =  37.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

#提取类标签并且与原始数据进行合并
dd1 <- cbind(data1, cluster = km_result1$cluster)

# 查看每一类的数目
table(dd1$cluster)

## 
##  1  2 
## 10  8

#进行可视化展示
fviz_cluster(km_result1, data = df1,
             palette = c("#2E9FDF", "#00AFBB"),
             ellipse.type = "euclid",
             star.plot = TRUE, 
             repel = TRUE,
             ggtheme = theme_minimal()
)

层次聚类

#先求样本之间两两相似性
result1 <- dist(df1, method = "euclidean")
#产生层次结构
result_hc1 <- hclust(d = result1, method = "ward.D2")
#进行结果展示
fviz_dend(result_hc1, k = 2, 
          cex = 0.5, 
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800","#fac0b3","#b892ff"),
          color_labels_by_k = TRUE, 
          rect = TRUE)

# 导出聚类结果
result_output1 <- data.frame(data1,km_result1$cluster)
write.csv(result_output1,file = "高钾组分类结果.csv",row.names = T)

# 另一种相关性的表达
library(corrplot)

## corrplot 0.92 loaded

data_cor = cor(data)
corrplot(corr = data_cor, method = 'color', addCoef.col="grey")

鉴定类别

pre <- read.csv("D://modeling//predict_sample.csv",header = T,encoding = "UTF-8")
# 根据矩阵行向量之间的聚类进行层次聚类
# 先将数据标准化
df2<- scale(pre)

clusters <- function(x, centers) {
  # compute squared euclidean distance from each sample to each cluster center
  tmp <- sapply(seq_len(nrow(x)),
                function(i) apply(centers, 1,
                                  function(v) sum((x[i, ]-v)^2)))
  max.col(-t(tmp))  # find index of min distance
}

# create a simple data set with two clusters
set.seed(1)
x_new <- rbind(matrix(rnorm(10, sd = 0.3), ncol = 2),
               matrix(rnorm(10, mean = 1, sd = 0.3), ncol = 2))

cl <- kmeans(df, centers=4)

all.equal(cl[["cluster"]], clusters(df, cl[["centers"]]))

## [1] "names for target but not for current"

# [1] TRUE
predict <- clusters(df2, cl[["centers"]])


pre$predict <- predict
# 导出聚类结果
write.csv(pre,file = "预测组.csv",row.names = T)

数学建模中的统计分析

赵雅楠

2022-09-18

问题一：玻璃文物的表面风化与其他参数的相关性及差异性分析

文物样品表面有无风化化学成分含量的统计规律

问题二：聚类模型搭建

K_means 聚类分析

鉴定类别