## 目的就是分析这个表中的数据 最好是所有能分析出来的全部分析出来
## 例如相关性 疾病结果和年龄,严重,生病天数的分析和各个指标的相关性分析
## 对基因做T-test 求P-value 和q-value 再做p.adjust做多重假设检验矫正
## 其他所有能分析出来的最好都要
## array data 就是芯片
## factor就是基因
## actor group是基因的分组
## outcome就是最好的结果 这人是活了还是死了
## 然后那个傻傻ph 就是疾病的阶段
## sampling 那个就是录取样本的分类
## 加载包
library(readxl)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(ca)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
theme_set(theme_bw(base_family = "STKaiti"))
## 读取数据
ansdata <- read_excel("Array_data_nobackground_normalised.xlsx")
summary(ansdata)
## Factor Factor_group Array data
## Length:6400 Length:6400 Min. : 0.76
## Class :character Class :character 1st Qu.: 80.23
## Mode :character Mode :character Median : 143.06
## Mean : 1487.96
## 3rd Qu.: 316.36
## Max. :59801.26
##
## Patient Id Collection_Date Illness_Day Outcome
## Length:6400 Length:6400 Min. : 3.00 Length:6400
## Class :character Class :character 1st Qu.:11.00 Class :character
## Mode :character Mode :character Median :15.00 Mode :character
## Mean :17.89
## 3rd Qu.:23.00
## Max. :68.00
## NA's :400
## Age Sex Disease Phase Sampling Categry
## Min. : 6.00 Length:6400 Min. :1.0 Length:6400
## 1st Qu.:41.00 Class :character 1st Qu.:2.0 Class :character
## Median :55.00 Mode :character Median :2.0 Mode :character
## Mean :53.67 Mean :2.2
## 3rd Qu.:67.00 3rd Qu.:3.0
## Max. :82.00 Max. :4.0
## NA's :400 NA's :400
## 处理缺失值
sum(is.na(ansdata))
## [1] 2480
par(cex = 0.8)
VIM::aggr(ansdata)

colnames(ansdata) <- c("factor","factorgroup","arraydata","paintID",
"collectiondate","illnessday","outcom","age","sex",
"diseaPH","SampleCategry")
summary(ansdata)
## factor factorgroup arraydata
## Length:6400 Length:6400 Min. : 0.76
## Class :character Class :character 1st Qu.: 80.23
## Mode :character Mode :character Median : 143.06
## Mean : 1487.96
## 3rd Qu.: 316.36
## Max. :59801.26
##
## paintID collectiondate illnessday outcom
## Length:6400 Length:6400 Min. : 3.00 Length:6400
## Class :character Class :character 1st Qu.:11.00 Class :character
## Mode :character Mode :character Median :15.00 Mode :character
## Mean :17.89
## 3rd Qu.:23.00
## Max. :68.00
## NA's :400
## age sex diseaPH SampleCategry
## Min. : 6.00 Length:6400 Min. :1.0 Length:6400
## 1st Qu.:41.00 Class :character 1st Qu.:2.0 Class :character
## Median :55.00 Mode :character Median :2.0 Mode :character
## Mean :53.67 Mean :2.2
## 3rd Qu.:67.00 3rd Qu.:3.0
## Max. :82.00 Max. :4.0
## NA's :400 NA's :400
usedata <- ansdata[c("factor","factorgroup","illnessday","outcom","age","sex",
"diseaPH","SampleCategry")]
summary(usedata)
## factor factorgroup illnessday outcom
## Length:6400 Length:6400 Min. : 3.00 Length:6400
## Class :character Class :character 1st Qu.:11.00 Class :character
## Mode :character Mode :character Median :15.00 Mode :character
## Mean :17.89
## 3rd Qu.:23.00
## Max. :68.00
## NA's :400
## age sex diseaPH SampleCategry
## Min. : 6.00 Length:6400 Min. :1.0 Length:6400
## 1st Qu.:41.00 Class :character 1st Qu.:2.0 Class :character
## Median :55.00 Mode :character Median :2.0 Mode :character
## Mean :53.67 Mean :2.2
## 3rd Qu.:67.00 3rd Qu.:3.0
## Max. :82.00 Max. :4.0
## NA's :400 NA's :400
VIM::aggr(usedata)

## 对数据进行分析
## 剔除带有缺失值的行
usedata <- na.omit(usedata)
# ## 查看数据的分组
# for (col in colnames(dansdata)){
# if(is.character(dansdata[,col])){
# par(family = "STKaiti")
# barplot(table(dansdata[,col]),main = col)
# }
# }
#
#
# for (col in colnames(dansdata)){
# if(is.numeric(dansdata[,col])){
# par(family = "STKaiti")
# hist(dansdata[,col],main = col)
# }
# }
## 选择有用的数据分析
usedata <- usedata%>%
group_by(factor,factorgroup,sex,outcom)
## 分析factor变量
table(usedata$factor)
##
## ANG BDNF BLC CCL11 CCL24 CCL26 Ck-b8-1 CSF1
## 75 75 75 75 75 75 75 75
## CSF2 CSF3 CX3CL1 EGF ENA-78 FGF-4 FGF-6 FGF-7
## 75 75 75 75 75 75 75 75
## FGF-9 Flt-3-LG GCP-2 GDNF GRO GRO-a HGF I-309
## 75 75 75 75 75 75 75 75
## IFN-g IGF-I IGFBP-1 IGFBP-2 IGFBP-3 IGFBP-4 IL-10 IL-12p70
## 75 75 75 75 75 75 75 75
## IL-13 IL-15 IL-16 IL-1a IL-1b IL-2 IL-3 IL-4
## 75 75 75 75 75 75 75 75
## IL-5 IL-6 IL-7 IL-8 IP-10 LIGHT MCP-1 MCP-2
## 75 75 75 75 75 75 75 75
## MCP-3 MCP-4 MDC MIG MIP-1b MIP-1d MIP-3a NAP-2
## 75 75 75 75 75 75 75 75
## NT-3 NT-4 OPG OSM PARC PDGF-BB PIGF RANTES
## 75 75 75 75 75 75 75 75
## SCF SDF-1 TARC TGF-b1 TGF-b2 TGF-b3 THPO TNF-a
## 75 75 75 75 75 75 75 75
## TNF-b VEGF
## 75 75
ggplot(as.data.frame(table(usedata$factor)),aes(x = Var1,y = Freq)) +
geom_bar(stat = "identity",fill = "blue") +
theme_bw(base_size = 9) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)) +
labs(x = "Factor",y = "Frequency",title = "Factor bar plot") +
theme(plot.title = element_text(hjust = 0.5))

## Factor group是基因的分组
table(usedata$factorgroup)
##
## 白介素 干扰素 集落刺激因子 趋化因子 生长因子
## 1050 75 225 1950 1950
## 肿瘤坏死因子
## 300
ggplot(as.data.frame(table(usedata$factorgroup)),aes(x = Var1,y = Freq)) +
geom_bar(stat = "identity",fill = "blue") +
theme(axis.text.x = element_text(vjust = 0.5)) +
labs(x = "Factor Group",y = "Frequency",title = "Factor Group bar plot") +
theme(plot.title = element_text(hjust = 0.5))

## "illnessday" 探索
summary(usedata$illnessday)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 11.00 15.00 17.89 23.00 68.00
ggplot(usedata) +
geom_histogram(aes(illnessday),binwidth = 5,fill = "blue",colour = "red") +
labs(title = "Illness day histogram") +
theme(plot.title = element_text(hjust = 0.5))

## "outcom"探索
table(usedata$outcom)
##
## Fatal Mild Severe
## 1480 962 3108
ggplot(as.data.frame(table(usedata$outcom)),aes(x = Var1,y = Freq)) +
geom_bar(stat = "identity",fill = "blue") +
theme(axis.text.x = element_text(vjust = 0.5)) +
labs(x = "Outcom",y = "Frequency",title = "Outcom bar plot") +
theme(plot.title = element_text(hjust = 0.5))

## "age"
summary(usedata$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 41.00 55.00 53.67 67.00 82.00
ggplot(usedata) +
geom_histogram(aes(age),binwidth = 5,fill = "blue",colour = "red") +
labs(title = "Age histogram") +
theme(plot.title = element_text(hjust = 0.5))

## "sex"
table(usedata$sex)
##
## F M
## 1480 4070
## "diseaPH"
table(usedata$diseaPH)
##
## 1 2 3 4
## 1332 2590 814 814
ggplot(as.data.frame(table(usedata$diseaPH)),aes(x = Var1,y = Freq)) +
geom_bar(stat = "identity",fill = "blue") +
theme(axis.text.x = element_text(vjust = 0.5)) +
labs(x = "diseaPH",y = "Frequency",title = "diseaPH bar plot") +
theme(plot.title = element_text(hjust = 0.5))

## "SampleCategry"
table(usedata$SampleCategry)
##
## All other First Last
## 2516 1776 1258
ggplot(as.data.frame(table(usedata$SampleCategry)),aes(x = Var1,y = Freq)) +
geom_bar(stat = "identity",fill = "blue") +
theme(axis.text.x = element_text(vjust = 0.5)) +
labs(x = "Sample Categry",y = "Frequency",title = "Sample Categry bar plot") +
theme(plot.title = element_text(hjust = 0.5))

## 例如相关性 疾病结果和年龄,严重,生病天数的分析和各个指标的相关性分析
## 分类变量和数值变量间的相关性不好讨论
## 1:outcom 与年龄的关系
ggplot(usedata) +
geom_boxplot(aes(x = outcom,y = age),position="dodge") +
labs(title = "outcom boxplot") +
theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(age~outcom,usedata))
## Df Sum Sq Mean Sq F value Pr(>F)
## outcom 2 223227 111613 444.9 <2e-16 ***
## Residuals 5547 1391651 251
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 结果显示p值《0.05,拒绝原假设,认为三种outcom的均值之间有显著想差异
## 均值多重比较
pairwise.t.test(usedata$age,usedata$outcom)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: usedata$age and usedata$outcom
##
## Fatal Mild
## Mild < 2e-16 -
## Severe 4.2e-15 < 2e-16
##
## P value adjustment method: holm
## 结果说明各两两组之间的均值是有显著差异的
## 2:outcom 与 生病天数 的关系
ggplot(usedata) +
geom_boxplot(aes(x = outcom,y = illnessday,fill = outcom),position="dodge") +
labs(title = "outcom boxplot") +
theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(illnessday~outcom,usedata))
## Df Sum Sq Mean Sq F value Pr(>F)
## outcom 2 42129 21064 167.7 <2e-16 ***
## Residuals 5547 696624 126
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 结果显示p值《0.05,拒绝原假设,认为三种outcom的均值之间有显著想差异
## 均值多重比较
pairwise.t.test(usedata$illnessday,usedata$outcom)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: usedata$illnessday and usedata$outcom
##
## Fatal Mild
## Mild <2e-16 -
## Severe 0.023 <2e-16
##
## P value adjustment method: holm
## 结果说明各两两组之间的均值是有显著差异的
table(usedata$outcom)
##
## Fatal Mild Severe
## 1480 962 3108
## 因为每个分组的样本数量不一样,所以无法做cor.test()相关系数分析
## 3:outcom 与 factor group 的关系
table(usedata$outcom,usedata$factorgroup)
##
## 白介素 干扰素 集落刺激因子 趋化因子 生长因子 肿瘤坏死因子
## Fatal 280 20 60 520 520 80
## Mild 182 13 39 338 338 52
## Severe 588 42 126 1092 1092 168
## 两变量是否独立的卡方检验
chisq.test(usedata$outcom,usedata$factorgroup)
##
## Pearson's Chi-squared test
##
## data: usedata$outcom and usedata$factorgroup
## X-squared = 0, df = 10, p-value = 1
## p-value = 1,接受原假设,说明两变量是独立的
## 马赛克图也说明了这一点两变量是独立的
mosaicplot(~ factorgroup+outcom,data = usedata)

## 4:outcom 与 diseaPH 的关系
table(usedata$outcom,usedata$diseaPH)
##
## 1 2 3 4
## Fatal 370 592 370 148
## Mild 370 592 0 0
## Severe 592 1406 444 666
## 两变量是否独立的卡方检验
chisq.test(usedata$outcom,usedata$diseaPH)
##
## Pearson's Chi-squared test
##
## data: usedata$outcom and usedata$diseaPH
## X-squared = 686.36, df = 6, p-value < 2.2e-16
## p-value <0.05,拒绝原假设,说明两变量是有关系的
mosaicplot(~ diseaPH+outcom,data = usedata)

## 对应分析
ca1 <- ca(~ diseaPH+outcom,data = usedata)
summary(ca1)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.090181 72.9 72.9 ******************
## 2 0.033489 27.1 100.0 *******
## -------- -----
## Total: 0.123669 100.0
##
##
## Rows:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | 1 | 240 1000 169 | -284 927 215 | 80 73 46 |
## 2 | 2 | 467 1000 89 | -141 843 103 | -61 157 51 |
## 3 | 3 | 147 1000 363 | 428 598 298 | 351 402 539 |
## 4 | 4 | 147 1000 379 | 486 740 384 | -288 260 364 |
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Fatl | 267 1000 210 | 94 90 26 | 298 910 707 |
## 2 | Mild | 173 1000 600 | -653 997 820 | -36 3 7 |
## 3 | Sevr | 560 1000 190 | 158 592 154 | -131 408 286 |
## 对应分析图
par(family = "STKaiti",mfrow = c(1,1))
plot.ca(ca1,main = "~ diseaPH+outcom")

## 点离的越近,说明因素之间关系越强
## 5:outcom 与 SampleCategry 的关系
table(usedata$outcom,usedata$SampleCategry)
##
## All other First Last
## Fatal 592 444 444
## Mild 444 518 0
## Severe 1480 814 814
## 两变量是否独立的卡方检验
chisq.test(usedata$outcom,usedata$SampleCategry)
##
## Pearson's Chi-squared test
##
## data: usedata$outcom and usedata$SampleCategry
## X-squared = 461.31, df = 4, p-value < 2.2e-16
## p-value <0.05,拒绝原假设,说明两变量是有关系的
mosaicplot(~ SampleCategry+outcom,data = usedata)

## 对应分析
ca1 <- ca(~ SampleCategry+outcom,data = usedata)
summary(ca1)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.078922 95.0 95.0 ************************
## 2 0.004197 5.0 100.0 *
## -------- -----
## Total: 0.083119 100.0
##
##
## Rows:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Allt | 453 1000 28 | -11 23 1 | 71 977 546 |
## 2 | Frst | 320 1000 386 | -311 962 391 | -62 38 289 |
## 3 | Last | 227 1000 586 | 460 986 608 | -55 14 165 |
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Fatl | 267 1000 100 | 144 666 70 | -102 334 663 |
## 2 | Mild | 173 1000 784 | -613 1000 826 | -5 0 1 |
## 3 | Sevr | 560 1000 116 | 121 853 104 | 50 147 336 |
## 对应分析图
par(family = "STKaiti",mfrow = c(1,1))
plot.ca(ca1,main = "~ SampleCategry+outcom")

## 点离的越近,说明因素之间关系越强
## 对基因做T-test 求P-value 和q-value 再做p.adjust做多重假设检验矫正
## 1:factor 与年龄的关系
ggplot(usedata) +
geom_boxplot(aes(x = factor,y = age),position="dodge") +
labs(title = "factor boxplot") +
theme(plot.title = element_text(hjust = 0.5)) +
theme_bw(base_size = 9) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5))

## 1:factor 与年龄的关系 没啥关系
table(usedata$factor)
##
## ANG BDNF BLC CCL11 CCL24 CCL26 Ck-b8-1 CSF1
## 75 75 75 75 75 75 75 75
## CSF2 CSF3 CX3CL1 EGF ENA-78 FGF-4 FGF-6 FGF-7
## 75 75 75 75 75 75 75 75
## FGF-9 Flt-3-LG GCP-2 GDNF GRO GRO-a HGF I-309
## 75 75 75 75 75 75 75 75
## IFN-g IGF-I IGFBP-1 IGFBP-2 IGFBP-3 IGFBP-4 IL-10 IL-12p70
## 75 75 75 75 75 75 75 75
## IL-13 IL-15 IL-16 IL-1a IL-1b IL-2 IL-3 IL-4
## 75 75 75 75 75 75 75 75
## IL-5 IL-6 IL-7 IL-8 IP-10 LIGHT MCP-1 MCP-2
## 75 75 75 75 75 75 75 75
## MCP-3 MCP-4 MDC MIG MIP-1b MIP-1d MIP-3a NAP-2
## 75 75 75 75 75 75 75 75
## NT-3 NT-4 OPG OSM PARC PDGF-BB PIGF RANTES
## 75 75 75 75 75 75 75 75
## SCF SDF-1 TARC TGF-b1 TGF-b2 TGF-b3 THPO TNF-a
## 75 75 75 75 75 75 75 75
## TNF-b VEGF
## 75 75
## 在factor 与年龄的数据中,所有的数据都是一样的,没必要分析这个关系,因为相关系数都为1
factorname <- names(table(usedata$factor))
cor.test(usedata$age[usedata$factor == factorname[1]],
usedata$age[usedata$factor == factorname[2]])
##
## Pearson's product-moment correlation
##
## data: usedata$age[usedata$factor == factorname[1]] and usedata$age[usedata$factor == factorname[2]]
## t = Inf, df = 73, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 1 1
## sample estimates:
## cor
## 1
## ## 2:factor 与 生病天数 的关系
ggplot(usedata) +
geom_boxplot(aes(x = factorgroup,y = illnessday),position="dodge") +
labs(title = "factor boxplot") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5))

ggplot(usedata) +
geom_boxplot(aes(x = factor,y = illnessday),position="dodge") +
labs(title = "factor boxplot") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5))

## 2:factor 与 生病天数 的关系 没啥关系
table(usedata$factorgroup)
##
## 白介素 干扰素 集落刺激因子 趋化因子 生长因子
## 1050 75 225 1950 1950
## 肿瘤坏死因子
## 300
## 因为每个分组的样本数量不一样,所以无法做cor.test()相关系数分析
## 3: factor 与 outcom 的关系
table(usedata$factor,usedata$outcom)
##
## Fatal Mild Severe
## ANG 20 13 42
## BDNF 20 13 42
## BLC 20 13 42
## CCL11 20 13 42
## CCL24 20 13 42
## CCL26 20 13 42
## Ck-b8-1 20 13 42
## CSF1 20 13 42
## CSF2 20 13 42
## CSF3 20 13 42
## CX3CL1 20 13 42
## EGF 20 13 42
## ENA-78 20 13 42
## FGF-4 20 13 42
## FGF-6 20 13 42
## FGF-7 20 13 42
## FGF-9 20 13 42
## Flt-3-LG 20 13 42
## GCP-2 20 13 42
## GDNF 20 13 42
## GRO 20 13 42
## GRO-a 20 13 42
## HGF 20 13 42
## I-309 20 13 42
## IFN-g 20 13 42
## IGF-I 20 13 42
## IGFBP-1 20 13 42
## IGFBP-2 20 13 42
## IGFBP-3 20 13 42
## IGFBP-4 20 13 42
## IL-10 20 13 42
## IL-12p70 20 13 42
## IL-13 20 13 42
## IL-15 20 13 42
## IL-16 20 13 42
## IL-1a 20 13 42
## IL-1b 20 13 42
## IL-2 20 13 42
## IL-3 20 13 42
## IL-4 20 13 42
## IL-5 20 13 42
## IL-6 20 13 42
## IL-7 20 13 42
## IL-8 20 13 42
## IP-10 20 13 42
## LIGHT 20 13 42
## MCP-1 20 13 42
## MCP-2 20 13 42
## MCP-3 20 13 42
## MCP-4 20 13 42
## MDC 20 13 42
## MIG 20 13 42
## MIP-1b 20 13 42
## MIP-1d 20 13 42
## MIP-3a 20 13 42
## NAP-2 20 13 42
## NT-3 20 13 42
## NT-4 20 13 42
## OPG 20 13 42
## OSM 20 13 42
## PARC 20 13 42
## PDGF-BB 20 13 42
## PIGF 20 13 42
## RANTES 20 13 42
## SCF 20 13 42
## SDF-1 20 13 42
## TARC 20 13 42
## TGF-b1 20 13 42
## TGF-b2 20 13 42
## TGF-b3 20 13 42
## THPO 20 13 42
## TNF-a 20 13 42
## TNF-b 20 13 42
## VEGF 20 13 42
## 两变量是否独立的卡方检验
chisq.test(usedata$factor,usedata$outcom)
##
## Pearson's Chi-squared test
##
## data: usedata$factor and usedata$outcom
## X-squared = 0, df = 146, p-value = 1
## p-value = 1,接受原假设,说明两变量是独立的
## 马赛克图也说明了这一点两变量是独立的
mosaicplot(~ factor+outcom,data = usedata)

## 4:factor 与 diseaPH 的关系
table(usedata$factor,usedata$diseaPH)
##
## 1 2 3 4
## ANG 18 35 11 11
## BDNF 18 35 11 11
## BLC 18 35 11 11
## CCL11 18 35 11 11
## CCL24 18 35 11 11
## CCL26 18 35 11 11
## Ck-b8-1 18 35 11 11
## CSF1 18 35 11 11
## CSF2 18 35 11 11
## CSF3 18 35 11 11
## CX3CL1 18 35 11 11
## EGF 18 35 11 11
## ENA-78 18 35 11 11
## FGF-4 18 35 11 11
## FGF-6 18 35 11 11
## FGF-7 18 35 11 11
## FGF-9 18 35 11 11
## Flt-3-LG 18 35 11 11
## GCP-2 18 35 11 11
## GDNF 18 35 11 11
## GRO 18 35 11 11
## GRO-a 18 35 11 11
## HGF 18 35 11 11
## I-309 18 35 11 11
## IFN-g 18 35 11 11
## IGF-I 18 35 11 11
## IGFBP-1 18 35 11 11
## IGFBP-2 18 35 11 11
## IGFBP-3 18 35 11 11
## IGFBP-4 18 35 11 11
## IL-10 18 35 11 11
## IL-12p70 18 35 11 11
## IL-13 18 35 11 11
## IL-15 18 35 11 11
## IL-16 18 35 11 11
## IL-1a 18 35 11 11
## IL-1b 18 35 11 11
## IL-2 18 35 11 11
## IL-3 18 35 11 11
## IL-4 18 35 11 11
## IL-5 18 35 11 11
## IL-6 18 35 11 11
## IL-7 18 35 11 11
## IL-8 18 35 11 11
## IP-10 18 35 11 11
## LIGHT 18 35 11 11
## MCP-1 18 35 11 11
## MCP-2 18 35 11 11
## MCP-3 18 35 11 11
## MCP-4 18 35 11 11
## MDC 18 35 11 11
## MIG 18 35 11 11
## MIP-1b 18 35 11 11
## MIP-1d 18 35 11 11
## MIP-3a 18 35 11 11
## NAP-2 18 35 11 11
## NT-3 18 35 11 11
## NT-4 18 35 11 11
## OPG 18 35 11 11
## OSM 18 35 11 11
## PARC 18 35 11 11
## PDGF-BB 18 35 11 11
## PIGF 18 35 11 11
## RANTES 18 35 11 11
## SCF 18 35 11 11
## SDF-1 18 35 11 11
## TARC 18 35 11 11
## TGF-b1 18 35 11 11
## TGF-b2 18 35 11 11
## TGF-b3 18 35 11 11
## THPO 18 35 11 11
## TNF-a 18 35 11 11
## TNF-b 18 35 11 11
## VEGF 18 35 11 11
## 两变量是否独立的卡方检验
chisq.test(usedata$factor,usedata$diseaPH)
##
## Pearson's Chi-squared test
##
## data: usedata$factor and usedata$diseaPH
## X-squared = 0, df = 219, p-value = 1
## p-value =1,接受原假设,说明两变量是没有关系的
mosaicplot(~ diseaPH+factor,data = usedata)

## 4:factor group 与 diseaPH 的关系
table(usedata$factorgroup,usedata$diseaPH)
##
## 1 2 3 4
## 白介素 252 490 154 154
## 干扰素 18 35 11 11
## 集落刺激因子 54 105 33 33
## 趋化因子 468 910 286 286
## 生长因子 468 910 286 286
## 肿瘤坏死因子 72 140 44 44
## 两变量是否独立的卡方检验
chisq.test(usedata$factorgroup,usedata$diseaPH)
##
## Pearson's Chi-squared test
##
## data: usedata$factorgroup and usedata$diseaPH
## X-squared = 0, df = 15, p-value = 1
## p-value =1,接受原假设,说明两变量是没有关系的
mosaicplot(~ factorgroup + diseaPH,data = usedata)

## 4:factor 与 SampleCategry 的关系
table(usedata$factor,usedata$SampleCategry)
##
## All other First Last
## ANG 34 24 17
## BDNF 34 24 17
## BLC 34 24 17
## CCL11 34 24 17
## CCL24 34 24 17
## CCL26 34 24 17
## Ck-b8-1 34 24 17
## CSF1 34 24 17
## CSF2 34 24 17
## CSF3 34 24 17
## CX3CL1 34 24 17
## EGF 34 24 17
## ENA-78 34 24 17
## FGF-4 34 24 17
## FGF-6 34 24 17
## FGF-7 34 24 17
## FGF-9 34 24 17
## Flt-3-LG 34 24 17
## GCP-2 34 24 17
## GDNF 34 24 17
## GRO 34 24 17
## GRO-a 34 24 17
## HGF 34 24 17
## I-309 34 24 17
## IFN-g 34 24 17
## IGF-I 34 24 17
## IGFBP-1 34 24 17
## IGFBP-2 34 24 17
## IGFBP-3 34 24 17
## IGFBP-4 34 24 17
## IL-10 34 24 17
## IL-12p70 34 24 17
## IL-13 34 24 17
## IL-15 34 24 17
## IL-16 34 24 17
## IL-1a 34 24 17
## IL-1b 34 24 17
## IL-2 34 24 17
## IL-3 34 24 17
## IL-4 34 24 17
## IL-5 34 24 17
## IL-6 34 24 17
## IL-7 34 24 17
## IL-8 34 24 17
## IP-10 34 24 17
## LIGHT 34 24 17
## MCP-1 34 24 17
## MCP-2 34 24 17
## MCP-3 34 24 17
## MCP-4 34 24 17
## MDC 34 24 17
## MIG 34 24 17
## MIP-1b 34 24 17
## MIP-1d 34 24 17
## MIP-3a 34 24 17
## NAP-2 34 24 17
## NT-3 34 24 17
## NT-4 34 24 17
## OPG 34 24 17
## OSM 34 24 17
## PARC 34 24 17
## PDGF-BB 34 24 17
## PIGF 34 24 17
## RANTES 34 24 17
## SCF 34 24 17
## SDF-1 34 24 17
## TARC 34 24 17
## TGF-b1 34 24 17
## TGF-b2 34 24 17
## TGF-b3 34 24 17
## THPO 34 24 17
## TNF-a 34 24 17
## TNF-b 34 24 17
## VEGF 34 24 17
## 两变量是否独立的卡方检验
chisq.test(usedata$factor,usedata$SampleCategry)
##
## Pearson's Chi-squared test
##
## data: usedata$factor and usedata$SampleCategry
## X-squared = 0, df = 146, p-value = 1
## p-value =1,接受原假设,说明两变量是没有关系的
mosaicplot(~ factor+SampleCategry,data = usedata)

## 4:factor group 与 diseaPH 的关系
table(usedata$factorgroup,usedata$SampleCategry)
##
## All other First Last
## 白介素 476 336 238
## 干扰素 34 24 17
## 集落刺激因子 102 72 51
## 趋化因子 884 624 442
## 生长因子 884 624 442
## 肿瘤坏死因子 136 96 68
## 两变量是否独立的卡方检验
chisq.test(usedata$factorgroup,usedata$SampleCategry)
##
## Pearson's Chi-squared test
##
## data: usedata$factorgroup and usedata$SampleCategry
## X-squared = 0, df = 10, p-value = 1
## p-value =1,接受原假设,说明两变量是没有关系的
mosaicplot(~ factorgroup + SampleCategry,data = usedata)

## "diseaPH"和"SampleCategry"之间的关系
table(usedata$diseaPH,usedata$SampleCategry)
##
## All other First Last
## 1 74 1258 0
## 2 1776 518 296
## 3 518 0 296
## 4 148 0 666
## 两变量是否独立的卡方检验
chisq.test(usedata$diseaPH,usedata$SampleCategry)
##
## Pearson's Chi-squared test
##
## data: usedata$diseaPH and usedata$SampleCategry
## X-squared = 4996.7, df = 6, p-value < 2.2e-16
## p-value < 0.05,拒绝原假设,说明两变量是有关系,不独立
mosaicplot(~ diseaPH + SampleCategry,data = usedata)

## 对应分析
ca1 <- ca(~ SampleCategry+diseaPH,data = usedata)
summary(ca1)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.630351 70.0 70.0 ******************
## 2 0.269950 30.0 100.0 *******
## -------- -----
## Total: 0.900301 100.0
##
##
## Rows:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Allt | 453 1000 193 | 320 266 73 | -531 734 473 |
## 2 | Frst | 320 1000 452 | -1106 961 621 | 223 39 59 |
## 3 | Last | 227 1000 354 | 922 604 306 | 746 396 468 |
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | 1 | 240 1000 478 | -1293 932 637 | 349 68 109 |
## 2 | 2 | 467 1000 114 | 130 77 13 | -450 923 351 |
## 3 | 3 | 147 1000 78 | 678 966 107 | -128 34 9 |
## 4 | 4 | 147 1000 330 | 1023 517 244 | 989 483 532 |
## 对应分析图
par(family = "STKaiti",mfrow = c(1,1))
plot.ca(ca1,main = "~ SampleCategry+outcom")

## 点离的越近,说明因素之间关系越强
## "diseaPH" 和 age的关系
ggplot(usedata) +
geom_boxplot(aes(x = factor(diseaPH),y = age),position="dodge") +
labs(x = "diseaPH",title = "diseaPH boxplot") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(age~diseaPH,usedata))
## Df Sum Sq Mean Sq F value Pr(>F)
## diseaPH 1 43564 43564 153.8 <2e-16 ***
## Residuals 5548 1571314 283
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 结果显示p值《0.05,拒绝原假设,认为三种outcom的均值之间有显著性差异
## 均值多重比较
pairwise.t.test(usedata$age,usedata$diseaPH)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: usedata$age and usedata$diseaPH
##
## 1 2 3
## 2 2.9e-10 - -
## 3 3.0e-14 0.00128 -
## 4 < 2e-16 3.1e-14 0.00043
##
## P value adjustment method: holm
## 结果说明各两两组之间的均值是有显著差异的
table(usedata$diseaPH)
##
## 1 2 3 4
## 1332 2590 814 814
## 因子样本数量不一样,无法使用cor.test()分析
## "diseaPH" 和 illnessday的关系
ggplot(usedata) +
geom_boxplot(aes(x = factor(diseaPH),y = illnessday),position="dodge") +
labs(x = "diseaPH",title = "diseaPH boxplot") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(illnessday~diseaPH,usedata))
## Df Sum Sq Mean Sq F value Pr(>F)
## diseaPH 1 556581 556581 16951 <2e-16 ***
## Residuals 5548 182172 33
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 结果显示p值《0.05,拒绝原假设,认为三种outcom的均值之间有显著性差异
## 均值多重比较
pairwise.t.test(usedata$illnessday,usedata$diseaPH)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: usedata$illnessday and usedata$diseaPH
##
## 1 2 3
## 2 <2e-16 - -
## 3 <2e-16 <2e-16 -
## 4 <2e-16 <2e-16 <2e-16
##
## P value adjustment method: holm
## 结果说明各两两组之间的均值是有显著差异的
## SampleCategry 和 age的关系
ggplot(usedata) +
geom_boxplot(aes(x = SampleCategry,y = age),position="dodge") +
labs(x = "SampleCategry",title = "diseaPH boxplot") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(age~SampleCategry,usedata))
## Df Sum Sq Mean Sq F value Pr(>F)
## SampleCategry 2 22508 11254 39.2 <2e-16 ***
## Residuals 5547 1592369 287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 结果显示p值《0.05,拒绝原假设,认为三种outcom的均值之间有显著性差异
## 均值多重比较
pairwise.t.test(usedata$age,usedata$SampleCategry)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: usedata$age and usedata$SampleCategry
##
## All other First
## First 1e-06 -
## Last 1e-06 <2e-16
##
## P value adjustment method: holm
## 结果说明各两两组之间的均值是有显著差异的
table(usedata$SampleCategry)
##
## All other First Last
## 2516 1776 1258
## 因子样本数量不一样,无法使用cor.test()分析
## "diseaPH" 和 illnessday的关系
ggplot(usedata) +
geom_boxplot(aes(x = SampleCategry,y = illnessday),position="dodge") +
labs(x = "SampleCategry",title = "diseaPH boxplot") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(illnessday~SampleCategry,usedata))
## Df Sum Sq Mean Sq F value Pr(>F)
## SampleCategry 2 390813 195407 3115 <2e-16 ***
## Residuals 5547 347940 63
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 结果显示p值《0.05,拒绝原假设,认为三种outcom的均值之间有显著性差异
## 均值多重比较
pairwise.t.test(usedata$illnessday,usedata$SampleCategry)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: usedata$illnessday and usedata$SampleCategry
##
## All other First
## First <2e-16 -
## Last <2e-16 <2e-16
##
## P value adjustment method: holm
## 结果说明各两两组之间的均值是有显著差异的
## 关联规则,发现频繁项集
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
##数据准备
table(usedata$illnessday)
##
## 3 4 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## 74 148 74 296 296 222 222 592 370 296 74 444 222 222 74 74 222 74
## 22 23 24 25 26 27 28 29 31 32 34 39 42 44 48 68
## 148 148 74 74 74 148 74 74 74 148 74 74 148 74 74 74
table(usedata$age)
##
## 6 21 31 34 38 39 41 42 43 45 48 51 52 55 57 58 67 76
## 74 148 296 296 370 148 148 222 148 148 222 296 148 296 740 296 296 444
## 77 82
## 518 296
table(usedata[,4])
##
## Fatal Mild Severe
## 1480 962 3108
usedata2 <- apply(usedata,2,factor)
rulerdata <- as(data.frame(usedata2),"transactions")
inspect(rulerdata[1:10])
## items transactionID
## [1] {factor=ANG,
## factorgroup=生长因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 1
## [2] {factor=BDNF,
## factorgroup=生长因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 2
## [3] {factor=BLC,
## factorgroup=趋化因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 3
## [4] {factor=CCL11,
## factorgroup=趋化因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 4
## [5] {factor=CCL24,
## factorgroup=趋化因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 5
## [6] {factor=CCL26,
## factorgroup=趋化因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 6
## [7] {factor=Ck-b8-1,
## factorgroup=趋化因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 7
## [8] {factor=CSF1,
## factorgroup=集落刺激因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 8
## [9] {factor=CSF2,
## factorgroup=集落刺激因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 9
## [10] {factor=CSF3,
## factorgroup=集落刺激因子,
## illnessday=10,
## outcom=Fatal,
## age=67,
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 10
par(cex = 0.8)
itemFrequencyPlot(rulerdata,topN = 35,main = "频繁的项")

## 挖掘关联规则
guize <- apriori(rulerdata,parameter = list(supp = 0.25, ##支持度
conf = 0.3, ## 置信度
minlen = 3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.25 3
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 1387
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[146 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [3 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(guize)
## set of 3 rules
##
## rule length distribution (lhs + rhs):sizes
## 3
## 3
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3 3 3 3 3
##
## summary of quality measures:
## support confidence lift
## Min. :0.2667 Min. :0.7143 Min. :1.136
## 1st Qu.:0.2667 1st Qu.:0.7418 1st Qu.:1.356
## Median :0.2667 Median :0.7692 Median :1.576
## Mean :0.2667 Mean :0.7723 Mean :1.453
## 3rd Qu.:0.2667 3rd Qu.:0.8013 3rd Qu.:1.612
## Max. :0.2667 Max. :0.8333 Max. :1.648
##
## mining info:
## data ntransactions support confidence
## rulerdata 5550 0.25 0.3
inspect(guize)
## lhs rhs
## [1] {diseaPH=2,SampleCategry=All other} => {sex=M}
## [2] {sex=M,SampleCategry=All other} => {diseaPH=2}
## [3] {sex=M,diseaPH=2} => {SampleCategry=All other}
## support confidence lift
## [1] 0.2666667 0.8333333 1.136364
## [2] 0.2666667 0.7692308 1.648352
## [3] 0.2666667 0.7142857 1.575630
guize <- apriori(rulerdata,parameter = list(supp = 0.1, ##支持度
conf = 0.1, ## 置信度
minlen = 2),
appearance = list(rhs = c("outcom=Fatal","outcom=Mild","outcom=Severe"),
default = "lhs"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.1 0.1 1 none FALSE TRUE 5 0.1 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 555
##
## set item appearances ...[3 item(s)] done [0.00s].
## set transactions ...[146 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [17 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [26 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(guize)
## set of 26 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 16 9 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.423 3.000 4.000
##
## summary of quality measures:
## support confidence lift
## Min. :0.1059 Min. :0.1818 Min. :0.6818
## 1st Qu.:0.1100 1st Qu.:0.4614 1st Qu.:0.9514
## Median :0.1400 Median :0.5600 Median :1.0453
## Mean :0.1652 Mean :0.5309 Mean :1.0974
## 3rd Qu.:0.1942 3rd Qu.:0.6364 3rd Qu.:1.1507
## Max. :0.4667 Max. :1.0000 Max. :1.8750
##
## mining info:
## data ntransactions support confidence
## rulerdata 5550 0.1 0.1
inspect(sort(guize,by = "lift"))
## lhs rhs support confidence lift
## [1] {sex=F} => {outcom=Fatal} 0.1333333 0.5000000 1.8750000
## [2] {sex=M,
## diseaPH=4} => {outcom=Severe} 0.1066667 1.0000000 1.7857143
## [3] {diseaPH=4} => {outcom=Severe} 0.1200000 0.8181818 1.4610390
## [4] {sex=M,
## SampleCategry=Last} => {outcom=Severe} 0.1200000 0.7500000 1.3392857
## [5] {diseaPH=2} => {outcom=Mild} 0.1066667 0.2285714 1.3186813
## [6] {sex=M,
## SampleCategry=All other} => {outcom=Severe} 0.2266667 0.6538462 1.1675824
## [7] {SampleCategry=Last} => {outcom=Severe} 0.1466667 0.6470588 1.1554622
## [8] {sex=M} => {outcom=Severe} 0.4666667 0.6363636 1.1363636
## [9] {factorgroup=生长因子,
## sex=M} => {outcom=Severe} 0.1639640 0.6363636 1.1363636
## [10] {factorgroup=趋化因子,
## sex=M} => {outcom=Severe} 0.1639640 0.6363636 1.1363636
## [11] {sex=M,
## diseaPH=2,
## SampleCategry=All other} => {outcom=Severe} 0.1600000 0.6000000 1.0714286
## [12] {SampleCategry=All other} => {outcom=Severe} 0.2666667 0.5882353 1.0504202
## [13] {sex=M} => {outcom=Mild} 0.1333333 0.1818182 1.0489510
## [14] {diseaPH=2,
## SampleCategry=All other} => {outcom=Severe} 0.1866667 0.5833333 1.0416667
## [15] {sex=M,
## diseaPH=2} => {outcom=Severe} 0.2133333 0.5714286 1.0204082
## [16] {factorgroup=白介素} => {outcom=Severe} 0.1059459 0.5600000 1.0000000
## [17] {factorgroup=生长因子} => {outcom=Severe} 0.1967568 0.5600000 1.0000000
## [18] {factorgroup=趋化因子} => {outcom=Severe} 0.1967568 0.5600000 1.0000000
## [19] {diseaPH=2} => {outcom=Severe} 0.2533333 0.5428571 0.9693878
## [20] {sex=M,
## SampleCategry=First} => {outcom=Severe} 0.1200000 0.5294118 0.9453782
## [21] {SampleCategry=All other} => {outcom=Fatal} 0.1066667 0.2352941 0.8823529
## [22] {diseaPH=2} => {outcom=Fatal} 0.1066667 0.2285714 0.8571429
## [23] {diseaPH=1,
## SampleCategry=First} => {outcom=Severe} 0.1066667 0.4705882 0.8403361
## [24] {SampleCategry=First} => {outcom=Severe} 0.1466667 0.4583333 0.8184524
## [25] {diseaPH=1} => {outcom=Severe} 0.1066667 0.4444444 0.7936508
## [26] {sex=M} => {outcom=Fatal} 0.1333333 0.1818182 0.6818182
## 这些是发现的规则,提升度lift大于1的规则是有意义的规则
## jiang数值分组
usedata3 <- usedata
usedata3$illnessday <- cut_number(usedata3$illnessday,4)
usedata3$age <- cut_number(usedata3$age,4)
usedata3 <- apply(usedata3,2,factor)
rulerdata2 <- as(data.frame(usedata3),"transactions")
inspect(rulerdata2[1:10])
## items transactionID
## [1] {factor=ANG,
## factorgroup=生长因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 1
## [2] {factor=BDNF,
## factorgroup=生长因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 2
## [3] {factor=BLC,
## factorgroup=趋化因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 3
## [4] {factor=CCL11,
## factorgroup=趋化因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 4
## [5] {factor=CCL24,
## factorgroup=趋化因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 5
## [6] {factor=CCL26,
## factorgroup=趋化因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 6
## [7] {factor=Ck-b8-1,
## factorgroup=趋化因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 7
## [8] {factor=CSF1,
## factorgroup=集落刺激因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 8
## [9] {factor=CSF2,
## factorgroup=集落刺激因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 9
## [10] {factor=CSF3,
## factorgroup=集落刺激因子,
## illnessday=[3,11],
## outcom=Fatal,
## age=(55,67],
## sex=M,
## diseaPH=1,
## SampleCategry=All other} 10
par(cex = 0.8)
itemFrequencyPlot(rulerdata2,topN = 35,main = "频繁的项")

## 挖掘关联规则
guize <- apriori(rulerdata2,parameter = list(supp = 0.2, ##支持度
conf = 0.3, ## 置信度
minlen = 3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.2 3
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 1110
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[100 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [19 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(guize)
## set of 15 rules
##
## rule length distribution (lhs + rhs):sizes
## 3
## 15
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3 3 3 3 3
##
## summary of quality measures:
## support confidence lift
## Min. :0.2000 Min. :0.4571 Min. :0.9796
## 1st Qu.:0.2133 1st Qu.:0.6126 1st Qu.:1.1423
## Median :0.2267 Median :0.8095 Median :1.2784
## Mean :0.2267 Mean :0.7603 Mean :1.7365
## 3rd Qu.:0.2267 3rd Qu.:0.8938 3rd Qu.:2.3270
## Max. :0.2667 Max. :1.0000 Max. :3.3730
##
## mining info:
## data ntransactions support confidence
## rulerdata2 5550 0.2 0.3
inspect(guize)
## lhs rhs support confidence lift
## [1] {illnessday=(11,15],
## diseaPH=2} => {sex=M} 0.2000000 0.9375000 1.2784091
## [2] {illnessday=(11,15],
## sex=M} => {diseaPH=2} 0.2000000 1.0000000 2.1428571
## [3] {sex=M,
## diseaPH=2} => {illnessday=(11,15]} 0.2000000 0.5357143 2.5111607
## [4] {diseaPH=1,
## SampleCategry=First} => {illnessday=[3,11]} 0.2266667 1.0000000 2.8846154
## [5] {illnessday=[3,11],
## diseaPH=1} => {SampleCategry=First} 0.2266667 0.9444444 2.9513889
## [6] {illnessday=[3,11],
## SampleCategry=First} => {diseaPH=1} 0.2266667 0.8095238 3.3730159
## [7] {diseaPH=2,
## SampleCategry=All other} => {sex=M} 0.2666667 0.8333333 1.1363636
## [8] {sex=M,
## SampleCategry=All other} => {diseaPH=2} 0.2666667 0.7692308 1.6483516
## [9] {sex=M,
## diseaPH=2} => {SampleCategry=All other} 0.2666667 0.7142857 1.5756303
## [10] {outcom=Severe,
## SampleCategry=All other} => {sex=M} 0.2266667 0.8500000 1.1590909
## [11] {sex=M,
## SampleCategry=All other} => {outcom=Severe} 0.2266667 0.6538462 1.1675824
## [12] {outcom=Severe,
## sex=M} => {SampleCategry=All other} 0.2266667 0.4857143 1.0714286
## [13] {outcom=Severe,
## diseaPH=2} => {sex=M} 0.2133333 0.8421053 1.1483254
## [14] {sex=M,
## diseaPH=2} => {outcom=Severe} 0.2133333 0.5714286 1.0204082
## [15] {outcom=Severe,
## sex=M} => {diseaPH=2} 0.2133333 0.4571429 0.9795918
guize <- apriori(rulerdata2,parameter = list(supp = 0.1, ##支持度
conf = 0.1, ## 置信度
minlen = 3),
appearance = list(rhs = c("outcom=Fatal","outcom=Mild","outcom=Severe"),
default = "lhs"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.1 0.1 1 none FALSE TRUE 5 0.1 3
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 555
##
## set item appearances ...[3 item(s)] done [0.00s].
## set transactions ...[100 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [23 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [25 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(guize)
## set of 25 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4
## 20 5
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 3.0 3.0 3.2 3.0 4.0
##
## summary of quality measures:
## support confidence lift
## Min. :0.1067 Min. :0.4286 Min. :0.7653
## 1st Qu.:0.1067 1st Qu.:0.5556 1st Qu.:0.9921
## Median :0.1200 Median :0.6250 Median :1.1161
## Mean :0.1374 Mean :0.6744 Mean :1.2043
## 3rd Qu.:0.1600 3rd Qu.:0.7500 3rd Qu.:1.3393
## Max. :0.2267 Max. :1.0000 Max. :1.7857
##
## mining info:
## data ntransactions support confidence
## rulerdata2 5550 0.1 0.1
inspect(sort(guize,by = "lift"))
## lhs rhs support confidence lift
## [1] {sex=M,
## diseaPH=4} => {outcom=Severe} 0.1066667 1.0000000 1.7857143
## [2] {illnessday=(23,68],
## sex=M} => {outcom=Severe} 0.1600000 1.0000000 1.7857143
## [3] {age=(67,82],
## sex=M} => {outcom=Severe} 0.1866667 1.0000000 1.7857143
## [4] {illnessday=(23,68],
## sex=M,
## diseaPH=4} => {outcom=Severe} 0.1066667 1.0000000 1.7857143
## [5] {illnessday=(23,68],
## sex=M,
## SampleCategry=Last} => {outcom=Severe} 0.1066667 1.0000000 1.7857143
## [6] {illnessday=(23,68],
## diseaPH=4} => {outcom=Severe} 0.1200000 0.8181818 1.4610390
## [7] {illnessday=(23,68],
## SampleCategry=Last} => {outcom=Severe} 0.1200000 0.7500000 1.3392857
## [8] {sex=M,
## SampleCategry=Last} => {outcom=Severe} 0.1200000 0.7500000 1.3392857
## [9] {age=[6,41],
## sex=M} => {outcom=Severe} 0.1200000 0.6923077 1.2362637
## [10] {sex=M,
## SampleCategry=All other} => {outcom=Severe} 0.2266667 0.6538462 1.1675824
## [11] {factorgroup=生长因子,
## sex=M} => {outcom=Severe} 0.1639640 0.6363636 1.1363636
## [12] {factorgroup=趋化因子,
## sex=M} => {outcom=Severe} 0.1639640 0.6363636 1.1363636
## [13] {illnessday=(11,15],
## diseaPH=2} => {outcom=Severe} 0.1333333 0.6250000 1.1160714
## [14] {illnessday=(11,15],
## sex=M} => {outcom=Severe} 0.1200000 0.6000000 1.0714286
## [15] {illnessday=(11,15],
## sex=M,
## diseaPH=2} => {outcom=Severe} 0.1200000 0.6000000 1.0714286
## [16] {sex=M,
## diseaPH=2,
## SampleCategry=All other} => {outcom=Severe} 0.1600000 0.6000000 1.0714286
## [17] {diseaPH=2,
## SampleCategry=All other} => {outcom=Severe} 0.1866667 0.5833333 1.0416667
## [18] {sex=M,
## diseaPH=2} => {outcom=Severe} 0.2133333 0.5714286 1.0204082
## [19] {illnessday=[3,11],
## sex=M} => {outcom=Severe} 0.1333333 0.5555556 0.9920635
## [20] {sex=M,
## SampleCategry=First} => {outcom=Severe} 0.1200000 0.5294118 0.9453782
## [21] {diseaPH=1,
## SampleCategry=First} => {outcom=Severe} 0.1066667 0.4705882 0.8403361
## [22] {illnessday=[3,11],
## diseaPH=1,
## SampleCategry=First} => {outcom=Severe} 0.1066667 0.4705882 0.8403361
## [23] {illnessday=[3,11],
## diseaPH=1} => {outcom=Severe} 0.1066667 0.4444444 0.7936508
## [24] {age=(41,55],
## sex=M} => {outcom=Severe} 0.1066667 0.4444444 0.7936508
## [25] {illnessday=[3,11],
## SampleCategry=First} => {outcom=Severe} 0.1200000 0.4285714 0.7653061
## 这些是发现的规则,提升度lift大于1的规则是有意义的规则