library(ggplot2)
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(olsrr)
##
## 载入程辑包:'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(InformationValue)
library(hablar)
##
## 载入程辑包:'hablar'
## The following object is masked from 'package:dplyr':
##
## na_if
library(openxlsx)
library(psych)
##
## 载入程辑包:'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(caret)
## 载入需要的程辑包:lattice
##
## 载入程辑包:'caret'
## The following objects are masked from 'package:InformationValue':
##
## confusionMatrix, precision, sensitivity, specificity
library(forestplot)
## 载入需要的程辑包:grid
## 载入需要的程辑包:magrittr
## 载入需要的程辑包:checkmate
library(meta)
## Loading 'meta' package (version 5.5-0).
## Type 'help(meta)' for a brief overview.
## Readers of 'Meta-Analysis with R (Use R!)' should install
## older version of 'meta' package: https://tinyurl.com/dt4y5drs
这里的参数均为离散型变量,因此我们用卡方检验来分析相关性
1.玻璃类型与是否风化
H0:玻璃类型不影响文物的风化
H1:玻璃类型影响文物的风化
comp <- c(6,12,28,12)
dim <- c(2,2)
chisq.test(comp,correct = FALSE)
##
## Chi-squared test for given probabilities
##
## data: comp
## X-squared = 18.414, df = 3, p-value = 0.0003613
得到的p value <-
0.05,因此有理由拒绝H0,接受H1,即不同的玻璃类型对玻璃文物表面风化影响不同。
2.文物纹饰是否影响风化
H0:文物纹饰不影响风化
H1:文物纹饰影响风化
wenshi <- c(11,11,6,0,17,13)
dim1 <- c(2,3)
chisq.test(wenshi,correct = FALSE)
##
## Chi-squared test for given probabilities
##
## data: wenshi
## X-squared = 18.138, df = 5, p-value = 0.002778
得到的p value <- 0.05,因此有理由拒绝H0,接受H1,即不同的纹饰类型对玻璃文物表面是否风化也有影响。
3.文物颜色是否影响风化
H0:文物颜色不影响风化
H1:文物颜色影响风化
color <- c(2,2,4,3,0,2,1,2,12,8,0,1,9,6,2,0,4,0)
dim2 <- c(2,9)
chisq.test(color,correct = FALSE)
## Warning in chisq.test(color, correct = FALSE): Chi-squared近似算法有可能不准
##
## Chi-squared test for given probabilities
##
## data: color
## X-squared = 62.414, df = 17, p-value = 4.177e-07
得到的p value <-
0.05,因此有理由拒绝H0,接受H1,即不同的文物颜色对玻璃文物表面是否风化也有影响。
这里提到卡方检验可能不准,我们可以使用 Spearman
test(非参检验)来验证是否风化与表面颜色的相关性
group1 <- c(2,4,0,1,12,0,9,2,4)
group2 <- c(2,3,2,2,8,1,6,0,0)
cor.test(group1,group2,method = "spearman",alternative = "two.sided",conf.level = 0.94)
##
## Spearman's rank correlation rho
##
## data: group1 and group2
## S = 54.825, p-value = 0.1307
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5431236
P value > 0.05,因此我们认为文物颜色与文物是否风化没有关联,拒绝H0.
component <- read.csv("D://modeling//component.csv",header = F,encoding = "UTF-8")
component$V1[1]=69.33
component<- component %>%
as_tibble() %>%
convert(num(V1),num(V16))
for (i in 1:67){
if(component$V16[i]==0){
component$label[i] = "未风化"
}
if(component$V16[i]==1){
component$label[i] = "风化"
}
}
## Warning: Unknown or uninitialised column: `label`.
# 二氧化硅
component %>%
ggplot(aes(x=label, y=V1, group=label,fill=label)) +
geom_boxplot() +
labs(title="二氧化硅对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
# 独立T检验
t.test(component$V1,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V1 and component$V16
## t = 16.338, df = 132, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 42.66698 54.42227
## sample estimates:
## mean of x mean of y
## 49.0222388 0.4776119
# 氧化钠对风化的影响
component %>%
ggplot(aes(x=label, y=V2, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化钠对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V2,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V2 and component$V16
## t = 1.4603, df = 132, p-value = 0.1466
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1093279 0.7260443
## sample estimates:
## mean of x mean of y
## 0.7859701 0.4776119
# 氧化钾对风化的影响
component %>%
ggplot(aes(x=label, y=V3, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化钾对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V3,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V3 and component$V16
## t = 2.8648, df = 132, p-value = 0.004856
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4237717 2.3144372
## sample estimates:
## mean of x mean of y
## 1.8467164 0.4776119
# 氧化钙对风化的影响
component %>%
ggplot(aes(x=label, y=V4, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化钙对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V4,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V4 and component$V16
## t = 7.0703, df = 132, p-value = 8.092e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.479794 2.629460
## sample estimates:
## mean of x mean of y
## 2.5322388 0.4776119
# 氧化镁对风化的影响
component %>%
ggplot(aes(x=label, y=V5, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化镁对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V5,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V5 and component$V16
## t = 2.0441, df = 132, p-value = 0.04294
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.006626473 0.404119795
## sample estimates:
## mean of x mean of y
## 0.6829851 0.4776119
# 氧化铝对风化的影响
component %>%
ggplot(aes(x=label, y=V6, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化铝对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V6,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V6 and component$V16
## t = 9.3833, df = 132, p-value = 2.437e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.811989 4.314280
## sample estimates:
## mean of x mean of y
## 4.0407463 0.4776119
# 氧化铁对风化的影响
component %>%
ggplot(aes(x=label, y=V7, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化铁对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V7,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V7 and component$V16
## t = 2.3745, df = 132, p-value = 0.01901
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.06206131 0.68152077
## sample estimates:
## mean of x mean of y
## 0.8494030 0.4776119
# 氧化铜对风化的影响
component %>%
ggplot(aes(x=label, y=V8, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化铜对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V8,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V8 and component$V16
## t = 5.258, df = 132, p-value = 5.711e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.9207875 2.0314513
## sample estimates:
## mean of x mean of y
## 1.9537313 0.4776119
# 氧化铅对风化的影响
component %>%
ggplot(aes(x=label, y=V9, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化铅对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V9,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V9 and component$V16
## t = 10.058, df = 132, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 19.26872 28.70292
## sample estimates:
## mean of x mean of y
## 24.4634328 0.4776119
# 氧化钡对风化的影响
component %>%
ggplot(aes(x=label, y=V10, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化钡对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V10,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V10 and component$V16
## t = 7.0816, df = 132, p-value = 7.631e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5.262065 9.341219
## sample estimates:
## mean of x mean of y
## 7.7792537 0.4776119
# 五氧化二磷对风化的影响
component %>%
ggplot(aes(x=label, y=V11, group=label,fill=label)) +
geom_boxplot() +
labs(title="五氧化二磷对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V11,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V11 and component$V16
## t = 5.0462, df = 132, p-value = 1.462e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.341690 3.071742
## sample estimates:
## mean of x mean of y
## 2.6843284 0.4776119
# 氧化锶对风化的影响
component %>%
ggplot(aes(x=label, y=V12, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化锶对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V12,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V12 and component$V16
## t = -3.098, df = 132, p-value = 0.002381
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.35338135 -0.07796193
## sample estimates:
## mean of x mean of y
## 0.2619403 0.4776119
# 氧化锡对风化的影响
component %>%
ggplot(aes(x=label, y=V13, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化锡对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V13,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V13 and component$V16
## t = -5.4023, df = 132, p-value = 2.97e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5462588 -0.2534427
## sample estimates:
## mean of x mean of y
## 0.07776119 0.47761194
# 二氧化硫对风化的影响
component %>%
ggplot(aes(x=label, y=V14, group=label,fill=label)) +
geom_boxplot() +
labs(title="氧化镁对风化的影响",
x="是否风化", y="化学成分占比") +
theme_classic()
t.test(component$V14,component$V16,alternative="two.sided", var.equal=T)
##
## Two Sample t-test
##
## data: component$V14 and component$V16
## t = 0.37395, df = 132, p-value = 0.709
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5378215 0.7885678
## sample estimates:
## mean of x mean of y
## 0.6029851 0.4776119
# 分析化学组分间的相关性
ggpairs(component,columns=1:14, columnLabels =c("SiO2","Na2O","K2O","CaO","MgO","Al2O3","Fe2O3","CuO",
"PbO","BaO","P2O5","SrO","SnO2","SO2"),
cardinality_threshold=NULL)
# 另一种表达
comp<-component
comp$label<-NULL
comp$V17 <- NULL
comp$V16 <- NULL
comp$V15 <- NULL
pairs.panels(comp)
# Logistic regression
colnames(component) <- c("SiO2","Na2O","K2O","CaO","MgO","Al2O3","Fe2O3","CuO","PbO","BaO","P2O5","SrO","SnO2","SO2","sum","weathering","type","label")
logistic <- glm(weathering~SiO2+Na2O+K2O+CaO+MgO+Al2O3+Fe2O3+PbO+BaO+CuO+P2O5+SrO+SnO2+SO2,data=component,family=binomial)
summary(logistic)
##
## Call:
## glm(formula = weathering ~ SiO2 + Na2O + K2O + CaO + MgO + Al2O3 +
## Fe2O3 + PbO + BaO + CuO + P2O5 + SrO + SnO2 + SO2, family = binomial,
## data = component)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.45188 -0.20057 -0.00058 0.47824 1.65313
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.00602 32.43615 0.216 0.829
## SiO2 -0.06676 0.32866 -0.203 0.839
## Na2O -0.63691 0.61520 -1.035 0.301
## K2O -1.15148 0.88057 -1.308 0.191
## CaO 0.13048 0.75992 0.172 0.864
## MgO -0.32798 1.07553 -0.305 0.760
## Al2O3 -0.30761 0.41601 -0.739 0.460
## Fe2O3 -1.05304 0.76141 -1.383 0.167
## PbO 0.02671 0.32040 0.083 0.934
## BaO -0.34802 0.48224 -0.722 0.470
## CuO 0.59770 0.38306 1.560 0.119
## P2O5 0.23230 0.54152 0.429 0.668
## SrO -2.34772 2.68614 -0.874 0.382
## SnO2 3.81792 2.57040 1.485 0.137
## SO2 0.31056 0.27791 1.118 0.264
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 92.747 on 66 degrees of freedom
## Residual deviance: 40.447 on 52 degrees of freedom
## AIC: 70.447
##
## Number of Fisher Scoring iterations: 8
pR2(logistic)["McFadden"]
## fitting null model for pseudo-r2
## McFadden
## 0.5638999
# Find optimal cutoff probability to use to maximize accuracy
predicted_value <- predict(logistic, component,type = "response")
optimalCutoff(component$weathering, predicted_value)[1]
## [1] 0.3397779
没有某一个变量非常显著,将二氧化硅剔除试一试
# Logistic regression
logistic2 <- glm(weathering~Na2O+K2O+CaO+MgO+Al2O3+Fe2O3+PbO+BaO+CuO+P2O5+SrO+SnO2+SO2,data=component,family=binomial)
summary(logistic2)
##
## Call:
## glm(formula = weathering ~ Na2O + K2O + CaO + MgO + Al2O3 + Fe2O3 +
## PbO + BaO + CuO + P2O5 + SrO + SnO2 + SO2, family = binomial,
## data = component)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.51338 -0.19804 -0.00079 0.48601 1.61306
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.42373 1.29412 0.327 0.7433
## Na2O -0.55791 0.47458 -1.176 0.2398
## K2O -1.05287 0.71650 -1.469 0.1417
## CaO 0.23381 0.57351 0.408 0.6835
## MgO -0.29967 1.06159 -0.282 0.7777
## Al2O3 -0.24814 0.29098 -0.853 0.3938
## Fe2O3 -1.02162 0.75360 -1.356 0.1752
## PbO 0.09120 0.04819 1.892 0.0584 .
## BaO -0.25465 0.13690 -1.860 0.0629 .
## CuO 0.60412 0.38237 1.580 0.1141
## P2O5 0.33176 0.23898 1.388 0.1651
## SrO -2.51239 2.57293 -0.976 0.3288
## SnO2 3.80291 2.53785 1.498 0.1340
## SO2 0.32011 0.27194 1.177 0.2392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 92.747 on 66 degrees of freedom
## Residual deviance: 40.489 on 53 degrees of freedom
## AIC: 68.489
##
## Number of Fisher Scoring iterations: 8
氧化铅与氧化钡非常显著
# Logistic regression,剔除氧化硅,氧化铅,氧化钡,氧化钾
logistic1 <- glm(weathering~Na2O+K2O+CaO+MgO+Al2O3+Fe2O3+CuO+P2O5+SrO+SnO2+SO2,data=component,family=binomial)
summary(logistic1)
##
## Call:
## glm(formula = weathering ~ Na2O + K2O + CaO + MgO + Al2O3 + Fe2O3 +
## CuO + P2O5 + SrO + SnO2 + SO2, family = binomial, data = component)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.69577 -0.41787 -0.00704 0.55230 1.92026
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.31236 0.90857 1.444 0.1486
## Na2O -0.55042 0.38705 -1.422 0.1550
## K2O -0.79019 0.46497 -1.699 0.0892 .
## CaO 0.58077 0.41669 1.394 0.1634
## MgO -0.57619 0.93939 -0.613 0.5396
## Al2O3 -0.29290 0.22382 -1.309 0.1907
## Fe2O3 -1.21317 0.69694 -1.741 0.0817 .
## CuO -0.07010 0.18187 -0.385 0.6999
## P2O5 0.34956 0.18154 1.925 0.0542 .
## SrO -0.67430 1.72682 -0.390 0.6962
## SnO2 2.41072 1.72872 1.395 0.1632
## SO2 -0.09002 0.21527 -0.418 0.6758
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 92.747 on 66 degrees of freedom
## Residual deviance: 46.832 on 55 degrees of freedom
## AIC: 70.832
##
## Number of Fisher Scoring iterations: 8
AIC值越小说明预测效果越好,因此logistic2的拟合效果最好
用meta森林图预测风化前的化学成分
# 构建数据集
component$new_col <- 1-component$weathering
metal <- metabin(component$new_col,c(rep(2,67)),component$weathering,c(rep(2,67)),data=component,method = "MH",sm="OR")
forest(metal,family="sans",fontsize = 9.5,lab.e="未风化",lab.c="风化组")
# 因子分析
# 铅钡数据的因子分析
data <- read.xlsx("D:\\modeling\\铅钡组.xlsx",rows=1:50,cols = 1:15)
# 数据标准化
df <- scale(data)
# 因子分析
data_cor <- cor(df)
# 生成碎石图
fa.parallel(data_cor)
## Parallel analysis suggests that the number of factors = 3 and the number of components = 3
# 由碎石图可知保留3个主成分即可
# 因子分析
fa_model1 <- fa(data_cor,nfactors = 3,rotate = "varimax",fm="ml")
fa_model1
## Factor Analysis using method = ml
## Call: fa(r = data_cor, nfactors = 3, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3 h2 u2 com
## SiO2 -0.91 -0.36 0.16 0.995 0.0048 1.4
## Na2O -0.39 -0.02 0.04 0.155 0.8451 1.0
## K2O -0.07 0.01 0.18 0.037 0.9627 1.3
## CaO 0.62 -0.03 0.46 0.599 0.4011 1.8
## MgO 0.13 -0.31 0.59 0.464 0.5357 1.6
## Al2O3 -0.27 -0.16 0.63 0.493 0.5074 1.5
## Fe2O3 0.06 -0.20 0.39 0.197 0.8025 1.5
## CuO 0.08 0.70 -0.16 0.523 0.4769 1.1
## PbO 0.86 -0.31 -0.41 0.995 0.0049 1.7
## BaO 0.05 0.96 -0.28 0.995 0.0049 1.2
## P2O5 0.66 0.09 0.49 0.689 0.3106 1.9
## SrO 0.48 0.11 -0.13 0.261 0.7386 1.2
## SnO2 -0.04 -0.01 0.24 0.060 0.9401 1.1
## SO2 0.16 0.65 0.00 0.453 0.5468 1.1
## weathering 0.82 0.11 -0.11 0.695 0.3054 1.1
##
## ML1 ML2 ML3
## SS loadings 3.59 2.26 1.77
## Proportion Var 0.24 0.15 0.12
## Cumulative Var 0.24 0.39 0.51
## Proportion Explained 0.47 0.30 0.23
## Cumulative Proportion 0.47 0.77 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 105 and the objective function was 11.39
## The degrees of freedom for the model are 63 and the objective function was 3
##
## The root mean square of the residuals (RMSR) is 0.08
## The df corrected root mean square of the residuals is 0.11
##
## Fit based upon off diagonal values = 0.92
## Measures of factor score adequacy
## ML1 ML2 ML3
## Correlation of (regression) scores with factors 1.00 1.00 0.98
## Multiple R square of scores with factors 0.99 0.99 0.95
## Minimum correlation of possible factor scores 0.99 0.99 0.91
fa函数中,第一个参数是数据,第二个参数说明要保留两个主成分,第三个参数为旋转方法,为none,即不进行主成分旋转,varimax意味着进行正交旋转;第四个参数表示提取公因子的方法为最大似然法。
# 对旋转结果进行可视化,这个在论文里不用写
factor.plot(fa_model1)
fa.diagram(fa_model1,simple = FALSE)
# 这个图跟SPSS分析的因子载荷矩阵热力图类似但更好理解!
# 高钾数据的因子分析,操作同上
data1 <- read.xlsx("D:\\modeling\\高钾组.xlsx",rows=1:50,cols = 1:15)
# 数据标准化
df1 <- scale(data1)
data_cor1 <- cor(data1)
# 生成碎石图
fa.parallel(data_cor1)
## Warning in fa.parallel(data_cor1): It seems as if you are using a correlation
## matrix, but have not specified the number of cases. The number of subjects is
## arbitrarily set to be 100
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 5 and the number of components = 4
# 由碎石图可知保留3个主成分即可
# 因子分析
fa_model2 <- fa(data_cor,nfactors = 3,rotate = "varimax",fm="ml")
fa_model2
## Factor Analysis using method = ml
## Call: fa(r = data_cor, nfactors = 3, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3 h2 u2 com
## SiO2 -0.91 -0.36 0.16 0.995 0.0048 1.4
## Na2O -0.39 -0.02 0.04 0.155 0.8451 1.0
## K2O -0.07 0.01 0.18 0.037 0.9627 1.3
## CaO 0.62 -0.03 0.46 0.599 0.4011 1.8
## MgO 0.13 -0.31 0.59 0.464 0.5357 1.6
## Al2O3 -0.27 -0.16 0.63 0.493 0.5074 1.5
## Fe2O3 0.06 -0.20 0.39 0.197 0.8025 1.5
## CuO 0.08 0.70 -0.16 0.523 0.4769 1.1
## PbO 0.86 -0.31 -0.41 0.995 0.0049 1.7
## BaO 0.05 0.96 -0.28 0.995 0.0049 1.2
## P2O5 0.66 0.09 0.49 0.689 0.3106 1.9
## SrO 0.48 0.11 -0.13 0.261 0.7386 1.2
## SnO2 -0.04 -0.01 0.24 0.060 0.9401 1.1
## SO2 0.16 0.65 0.00 0.453 0.5468 1.1
## weathering 0.82 0.11 -0.11 0.695 0.3054 1.1
##
## ML1 ML2 ML3
## SS loadings 3.59 2.26 1.77
## Proportion Var 0.24 0.15 0.12
## Cumulative Var 0.24 0.39 0.51
## Proportion Explained 0.47 0.30 0.23
## Cumulative Proportion 0.47 0.77 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 105 and the objective function was 11.39
## The degrees of freedom for the model are 63 and the objective function was 3
##
## The root mean square of the residuals (RMSR) is 0.08
## The df corrected root mean square of the residuals is 0.11
##
## Fit based upon off diagonal values = 0.92
## Measures of factor score adequacy
## ML1 ML2 ML3
## Correlation of (regression) scores with factors 1.00 1.00 0.98
## Multiple R square of scores with factors 0.99 0.99 0.95
## Minimum correlation of possible factor scores 0.99 0.99 0.91
# 对旋转结果进行可视化
factor.plot(fa_model2)
fa.diagram(fa_model2,simple = FALSE)
# 这个图跟SPSS分析的因子载荷矩阵热力图类似但更好理解!
对铅钡数据进行分类,先尝试分为三类
res <- kmeans(data,centers = 2)
pairs(data,col=res$cluster+1)
ggpairs(data,mapping=aes(colour=as.character(res$cluster)))
好像除了二氧化硅有明显的区别,其他区别都看不出来
library(factoextra)
## Warning: 程辑包'factoextra'是用R版本4.2.1 来建造的
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# 确定最佳聚类数目
fviz_nbclust(df,kmeans,method = "wss")+geom_vline(xintercept=6,linetype=2)
# 另一种确定最佳聚类数目的方法
library(mclust)
## Warning: 程辑包'mclust'是用R版本4.2.1 来建造的
## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.
##
## 载入程辑包:'mclust'
## The following object is masked from 'package:psych':
##
## sim
m_clust <- Mclust(df,G=1:10)
summary(m_clust)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEI (diagonal, equal shape) model with 6 components:
##
## log-likelihood n df BIC ICL
## -629.5898 49 115 -1706.739 -1706.783
##
## Clustering table:
## 1 2 3 4 5 6
## 18 2 2 4 14 9
14个指标随聚类数目变化的走势图
plot(m_clust,"BIC")
最后确定聚类数目为6
#设置随机数种子,保证实验的可重复进行
set.seed(123)
#利用k-mean是进行聚类
km_result <- kmeans(df, 6, nstart = 24)
#查看聚类的一些结果
print(km_result)
## K-means clustering with 6 clusters of sizes 19, 8, 5, 1, 4, 12
##
## Cluster means:
## SiO2 Na2O K2O CaO MgO Al2O3
## 1 0.9862609 0.62465478 0.06191179 -0.62400301 0.007126134 0.34801828
## 2 -0.5326350 -0.16659070 -0.45269585 -0.52000251 -0.915424825 -0.72326470
## 3 -1.2246563 -0.49895590 -0.33926725 -0.08075333 -1.024543755 -0.82537515
## 4 0.7751866 -0.05764112 0.53186438 0.47106110 1.419720291 3.31753415
## 5 0.2407690 -0.49895590 1.60263035 0.68365036 0.388050401 0.19772338
## 6 -0.8410715 -0.49895590 -0.23340056 1.10118178 0.778233244 -0.06731514
## Fe2O3 CuO PbO BaO P2O5 SrO
## 1 -0.2759327 -0.24888539 -0.8744205 -0.26891265 -0.5600830 -0.4955971
## 2 -0.4938787 -0.43447496 1.1347068 0.02995811 -0.4563306 -0.0966186
## 3 -0.6915701 2.16820506 -0.2278124 2.48045834 0.2044766 0.8654238
## 4 0.3944144 -0.76083458 -1.1800907 -0.38173937 -0.5608908 -0.3717589
## 5 2.2896156 -0.68999683 0.1261642 -0.53717617 -0.4201982 -0.0681558
## 6 0.2582271 0.07370155 0.7792360 -0.41684768 1.2926268 0.5422130
## SnO2 SO2 weathering
## 1 -0.21623363 -0.1933795 -1.0523140
## 2 -0.27306555 -0.2547526 0.9308931
## 3 -0.27306555 2.0086049 0.5342517
## 4 5.87713715 -0.2547526 0.9308931
## 5 0.71284481 -0.2547526 -0.5565122
## 6 -0.08918544 -0.2547526 0.9308931
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 5 3 3 6 6 1 1 3 1 3 3 1 1 5 5 5 1 1 2 1 2 1 2 2 2 6
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 1 1 6 6 1 1 1 1 4 6 1 6 1 6 6 2 1 6 6 1 2 2 6
##
## Within cluster sum of squares by cluster:
## [1] 129.81834 19.06685 46.65616 0.00000 42.47087 79.03740
## (between_SS / total_SS = 56.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#提取类标签并且与原始数据进行合并
dd <- cbind(data, cluster = km_result$cluster)
# 查看每一类的数目
table(dd$cluster)
##
## 1 2 3 4 5 6
## 19 8 5 1 4 12
#进行可视化展示
fviz_cluster(km_result, data = df,
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07","#fac0b3","#b892ff"),
ellipse.type = "euclid",
star.plot = TRUE,
repel = TRUE,
ggtheme = theme_minimal()
)
## Too few points to calculate an ellipse
我们看到最后有一个数据是一类,不合理,因此将聚类数目改为5
#设置随机数种子,保证实验的可重复进行
set.seed(123)
#利用k-mean是进行聚类
km_result <- kmeans(df, 5, nstart = 24)
#查看聚类的一些结果
print(km_result)
## K-means clustering with 5 clusters of sizes 8, 4, 5, 12, 20
##
## Cluster means:
## SiO2 Na2O K2O CaO MgO Al2O3
## 1 -0.53263504 -0.1665907 -0.45269585 -0.52000251 -0.91542482 -0.72326470
## 2 0.07210428 -0.3886272 1.89300756 0.87023949 0.64596787 1.07343058
## 3 -1.22465632 -0.4989559 -0.33926725 -0.08075333 -1.02454376 -0.82537515
## 4 -0.84107153 -0.4989559 -0.23340056 1.10118178 0.77823324 -0.06731514
## 5 1.00944016 0.5684742 0.02733398 -0.60656763 0.02617235 0.32135263
## Fe2O3 CuO PbO BaO P2O5 SrO
## 1 -0.49387875 -0.43447496 1.1347068 0.02995811 -0.4563306 -0.0966186
## 2 1.35124063 -0.73452341 0.1121149 -0.42044853 -0.4534528 -0.1155938
## 3 -0.69157011 2.16820506 -0.2278124 2.48045834 0.2044766 0.8654238
## 4 0.25822706 0.07370155 0.7792360 -0.41684768 1.2926268 0.5422130
## 5 -0.05474033 -0.26557753 -0.8868942 -0.29789952 -0.5534724 -0.4799176
## SnO2 SO2 weathering
## 1 -0.27306555 -0.2547526 0.93089312
## 2 2.25039548 -0.2547526 -0.06071042
## 3 -0.27306555 2.0086049 0.53425171
## 4 -0.08918544 -0.2547526 0.93089312
## 5 -0.21907522 -0.1964482 -1.05231397
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 2 3 3 4 4 5 5 3 5 3 3 5 5 2 2 5 5 5 1 5 1 5 1 1 1 4
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 5 5 4 4 5 5 5 5 2 4 5 4 5 4 4 1 5 4 4 5 1 1 4
##
## Within cluster sum of squares by cluster:
## [1] 19.06685 57.38463 46.65616 79.03740 151.39265
## (between_SS / total_SS = 50.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#提取类标签并且与原始数据进行合并
dd <- cbind(data, cluster = km_result$cluster)
# 查看每一类的数目
table(dd$cluster)
##
## 1 2 3 4 5
## 8 4 5 12 20
#进行可视化展示
fviz_cluster(km_result, data = df,
palette = c("#2E9FDF", "#00AFBB", "#E7B800","#fac0b3","#b892ff"),
ellipse.type = "euclid",
star.plot = TRUE,
repel = TRUE,
ggtheme = theme_minimal()
)
# 导出聚类结果
result_output <- data.frame(data,km_result$cluster)
write.csv(result_output,file = "铅钡组分类结果.csv",row.names = T)
层次聚类
#先求样本之间两两相似性
result <- dist(df, method = "euclidean")
#产生层次结构
result_hc <- hclust(d = result, method = "ward.D2")
#进行结果展示
fviz_dend(result_hc, k = 5,
cex = 0.5,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800","#fac0b3","#b892ff"),
color_labels_by_k = TRUE,
rect = TRUE
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
对高钾数据进行聚类分析
res1 <- kmeans(data1,centers = 2)
pairs(data1,col=res1$cluster+1)
ggpairs(data1,mapping=aes(colour=as.character(res1$cluster)))
library(factoextra)
# 确定最佳聚类数目
fviz_nbclust(df1,kmeans,method = "wss")+geom_vline(xintercept=6,linetype=2)
# 另一种确定最佳聚类数目的方法
library(mclust)
m_clust1 <- Mclust(df1,G=1:10)
summary(m_clust1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EII (spherical, equal volume) model with 10 components:
##
## log-likelihood n df BIC ICL
## -33.16312 18 160 -528.7857 -528.7857
##
## Clustering table:
## 1 2 3 4 5 6 7 8 9 10
## 3 2 1 1 1 6 1 1 1 1
怎么说呢,给了10个聚类数目,这个效果也太差了吧 14个指标随聚类数目变化的走势图
plot(m_clust1,"BIC")
现在看这个图,不做聚类分析挺好的,但还是试一试吧,最后确定聚类数目为2
#设置随机数种子,保证实验的可重复进行
set.seed(123)
#利用k-mean是进行聚类
km_result1 <- kmeans(df1, 2, nstart = 24)
#查看聚类的一些结果
print(km_result1)
## K-means clustering with 2 clusters of sizes 10, 8
##
## Cluster means:
## SiO2 Na2O K2O CaO MgO Al2O3 Fe2O3
## 1 -0.8097125 0.3404651 0.6282006 0.7112789 0.5015441 0.7073684 0.6014488
## 2 1.0121406 -0.4255814 -0.7852507 -0.8890986 -0.6269302 -0.8842105 -0.7518110
## CuO PbO BaO P2O5 SrO SnO2 SO2
## 1 0.4754239 0.3784069 0.3791587 0.3534793 0.3470203 -0.2357023 0.3450032
## 2 -0.5942798 -0.4730087 -0.4739484 -0.4418491 -0.4337754 0.2946278 -0.4312540
## weathering
## 1 -0.6871843
## 2 0.8589803
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 2 1 1 1 1 1 2 2 2 2 1 1 1 2 1 2 2
##
## Within cluster sum of squares by cluster:
## [1] 123.02647 37.67787
## (between_SS / total_SS = 37.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#提取类标签并且与原始数据进行合并
dd1 <- cbind(data1, cluster = km_result1$cluster)
# 查看每一类的数目
table(dd1$cluster)
##
## 1 2
## 10 8
#进行可视化展示
fviz_cluster(km_result1, data = df1,
palette = c("#2E9FDF", "#00AFBB"),
ellipse.type = "euclid",
star.plot = TRUE,
repel = TRUE,
ggtheme = theme_minimal()
)
层次聚类
#先求样本之间两两相似性
result1 <- dist(df1, method = "euclidean")
#产生层次结构
result_hc1 <- hclust(d = result1, method = "ward.D2")
#进行结果展示
fviz_dend(result_hc1, k = 2,
cex = 0.5,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800","#fac0b3","#b892ff"),
color_labels_by_k = TRUE,
rect = TRUE)
# 导出聚类结果
result_output1 <- data.frame(data1,km_result1$cluster)
write.csv(result_output1,file = "高钾组分类结果.csv",row.names = T)
# 另一种相关性的表达
library(corrplot)
## corrplot 0.92 loaded
data_cor = cor(data)
corrplot(corr = data_cor, method = 'color', addCoef.col="grey")
pre <- read.csv("D://modeling//predict_sample.csv",header = T,encoding = "UTF-8")
# 根据矩阵行向量之间的聚类进行层次聚类
# 先将数据标准化
df2<- scale(pre)
clusters <- function(x, centers) {
# compute squared euclidean distance from each sample to each cluster center
tmp <- sapply(seq_len(nrow(x)),
function(i) apply(centers, 1,
function(v) sum((x[i, ]-v)^2)))
max.col(-t(tmp)) # find index of min distance
}
# create a simple data set with two clusters
set.seed(1)
x_new <- rbind(matrix(rnorm(10, sd = 0.3), ncol = 2),
matrix(rnorm(10, mean = 1, sd = 0.3), ncol = 2))
cl <- kmeans(df, centers=4)
all.equal(cl[["cluster"]], clusters(df, cl[["centers"]]))
## [1] "names for target but not for current"
# [1] TRUE
predict <- clusters(df2, cl[["centers"]])
pre$predict <- predict
# 导出聚类结果
write.csv(pre,file = "预测组.csv",row.names = T)