fenxi_1.R

## 目的就是分析这个表中的数据 最好是所有能分析出来的全部分析出来
## 例如相关性 疾病结果和年龄，严重，生病天数的分析和各个指标的相关性分析
## 对基因做T－test 求P－value  和q－value 再做p.adjust做多重假设检验矫正
## 其他所有能分析出来的最好都要


## array data 就是芯片
## factor就是基因
## actor group是基因的分组

## outcome就是最好的结果 这人是活了还是死了

## 然后那个傻傻ph 就是疾病的阶段
## sampling 那个就是录取样本的分类



## 加载包
library(readxl)
library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## Loading required package: data.table

## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.

## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.2

library(ca)
library(dplyr)

## -------------------------------------------------------------------------

## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!

## -------------------------------------------------------------------------

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

theme_set(theme_bw(base_family = "STKaiti"))

## 读取数据
ansdata <- read_excel("Array_data_nobackground_normalised.xlsx")
summary(ansdata)

##     Factor          Factor_group         Array data      
##  Length:6400        Length:6400        Min.   :    0.76  
##  Class :character   Class :character   1st Qu.:   80.23  
##  Mode  :character   Mode  :character   Median :  143.06  
##                                        Mean   : 1487.96  
##                                        3rd Qu.:  316.36  
##                                        Max.   :59801.26  
##                                                          
##   Patient Id        Collection_Date     Illness_Day      Outcome         
##  Length:6400        Length:6400        Min.   : 3.00   Length:6400       
##  Class :character   Class :character   1st Qu.:11.00   Class :character  
##  Mode  :character   Mode  :character   Median :15.00   Mode  :character  
##                                        Mean   :17.89                     
##                                        3rd Qu.:23.00                     
##                                        Max.   :68.00                     
##                                        NA's   :400                       
##       Age            Sex            Disease Phase Sampling Categry  
##  Min.   : 6.00   Length:6400        Min.   :1.0   Length:6400       
##  1st Qu.:41.00   Class :character   1st Qu.:2.0   Class :character  
##  Median :55.00   Mode  :character   Median :2.0   Mode  :character  
##  Mean   :53.67                      Mean   :2.2                     
##  3rd Qu.:67.00                      3rd Qu.:3.0                     
##  Max.   :82.00                      Max.   :4.0                     
##  NA's   :400                        NA's   :400

## 处理缺失值
sum(is.na(ansdata))

## [1] 2480

par(cex = 0.8)
VIM::aggr(ansdata)

colnames(ansdata) <- c("factor","factorgroup","arraydata","paintID",
                        "collectiondate","illnessday","outcom","age","sex",
                        "diseaPH","SampleCategry")
summary(ansdata)

##     factor          factorgroup          arraydata       
##  Length:6400        Length:6400        Min.   :    0.76  
##  Class :character   Class :character   1st Qu.:   80.23  
##  Mode  :character   Mode  :character   Median :  143.06  
##                                        Mean   : 1487.96  
##                                        3rd Qu.:  316.36  
##                                        Max.   :59801.26  
##                                                          
##    paintID          collectiondate       illnessday       outcom         
##  Length:6400        Length:6400        Min.   : 3.00   Length:6400       
##  Class :character   Class :character   1st Qu.:11.00   Class :character  
##  Mode  :character   Mode  :character   Median :15.00   Mode  :character  
##                                        Mean   :17.89                     
##                                        3rd Qu.:23.00                     
##                                        Max.   :68.00                     
##                                        NA's   :400                       
##       age            sex               diseaPH    SampleCategry     
##  Min.   : 6.00   Length:6400        Min.   :1.0   Length:6400       
##  1st Qu.:41.00   Class :character   1st Qu.:2.0   Class :character  
##  Median :55.00   Mode  :character   Median :2.0   Mode  :character  
##  Mean   :53.67                      Mean   :2.2                     
##  3rd Qu.:67.00                      3rd Qu.:3.0                     
##  Max.   :82.00                      Max.   :4.0                     
##  NA's   :400                        NA's   :400

usedata <- ansdata[c("factor","factorgroup","illnessday","outcom","age","sex",
                      "diseaPH","SampleCategry")]

summary(usedata)

##     factor          factorgroup          illnessday       outcom         
##  Length:6400        Length:6400        Min.   : 3.00   Length:6400       
##  Class :character   Class :character   1st Qu.:11.00   Class :character  
##  Mode  :character   Mode  :character   Median :15.00   Mode  :character  
##                                        Mean   :17.89                     
##                                        3rd Qu.:23.00                     
##                                        Max.   :68.00                     
##                                        NA's   :400                       
##       age            sex               diseaPH    SampleCategry     
##  Min.   : 6.00   Length:6400        Min.   :1.0   Length:6400       
##  1st Qu.:41.00   Class :character   1st Qu.:2.0   Class :character  
##  Median :55.00   Mode  :character   Median :2.0   Mode  :character  
##  Mean   :53.67                      Mean   :2.2                     
##  3rd Qu.:67.00                      3rd Qu.:3.0                     
##  Max.   :82.00                      Max.   :4.0                     
##  NA's   :400                        NA's   :400

VIM::aggr(usedata)

## 对数据进行分析

## 剔除带有缺失值的行
usedata <- na.omit(usedata)

# ## 查看数据的分组
# for (col in colnames(dansdata)){
#   if(is.character(dansdata[,col])){
#     par(family = "STKaiti")
#     barplot(table(dansdata[,col]),main = col)
#   }
# }
# 
# 
# for (col in colnames(dansdata)){
#   if(is.numeric(dansdata[,col])){
#     par(family = "STKaiti")
#     hist(dansdata[,col],main = col)
#   }
# }

## 选择有用的数据分析


usedata <- usedata%>%
  group_by(factor,factorgroup,sex,outcom)




## 分析factor变量
table(usedata$factor)

## 
##      ANG     BDNF      BLC    CCL11    CCL24    CCL26  Ck-b8-1     CSF1 
##       75       75       75       75       75       75       75       75 
##     CSF2     CSF3   CX3CL1      EGF   ENA-78    FGF-4    FGF-6    FGF-7 
##       75       75       75       75       75       75       75       75 
##    FGF-9 Flt-3-LG    GCP-2     GDNF      GRO    GRO-a      HGF    I-309 
##       75       75       75       75       75       75       75       75 
##    IFN-g    IGF-I  IGFBP-1  IGFBP-2  IGFBP-3  IGFBP-4    IL-10 IL-12p70 
##       75       75       75       75       75       75       75       75 
##    IL-13    IL-15    IL-16    IL-1a    IL-1b     IL-2     IL-3     IL-4 
##       75       75       75       75       75       75       75       75 
##     IL-5     IL-6     IL-7     IL-8    IP-10    LIGHT    MCP-1    MCP-2 
##       75       75       75       75       75       75       75       75 
##    MCP-3    MCP-4      MDC      MIG   MIP-1b   MIP-1d   MIP-3a    NAP-2 
##       75       75       75       75       75       75       75       75 
##     NT-3     NT-4      OPG      OSM     PARC  PDGF-BB     PIGF   RANTES 
##       75       75       75       75       75       75       75       75 
##      SCF    SDF-1     TARC   TGF-b1   TGF-b2   TGF-b3     THPO    TNF-a 
##       75       75       75       75       75       75       75       75 
##    TNF-b     VEGF 
##       75       75

ggplot(as.data.frame(table(usedata$factor)),aes(x = Var1,y = Freq)) +
  geom_bar(stat = "identity",fill = "blue") +
  theme_bw(base_size = 9) +
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5)) +
  labs(x = "Factor",y = "Frequency",title = "Factor bar plot") +
  theme(plot.title = element_text(hjust = 0.5))

## Factor group是基因的分组
table(usedata$factorgroup)

## 
##       白介素       干扰素 集落刺激因子     趋化因子     生长因子 
##         1050           75          225         1950         1950 
## 肿瘤坏死因子 
##          300

ggplot(as.data.frame(table(usedata$factorgroup)),aes(x = Var1,y = Freq)) +
  geom_bar(stat = "identity",fill = "blue") +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  labs(x = "Factor Group",y = "Frequency",title = "Factor Group bar plot") +
  theme(plot.title = element_text(hjust = 0.5))

## "illnessday" 探索
summary(usedata$illnessday)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   11.00   15.00   17.89   23.00   68.00

ggplot(usedata) +
  geom_histogram(aes(illnessday),binwidth = 5,fill = "blue",colour = "red") +
  labs(title = "Illness day histogram") +
  theme(plot.title = element_text(hjust = 0.5))

## "outcom"探索
table(usedata$outcom)

## 
##  Fatal   Mild Severe 
##   1480    962   3108

ggplot(as.data.frame(table(usedata$outcom)),aes(x = Var1,y = Freq)) +
  geom_bar(stat = "identity",fill = "blue") +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  labs(x = "Outcom",y = "Frequency",title = "Outcom bar plot") +
  theme(plot.title = element_text(hjust = 0.5))

## "age"
summary(usedata$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   41.00   55.00   53.67   67.00   82.00

ggplot(usedata) +
  geom_histogram(aes(age),binwidth = 5,fill = "blue",colour = "red") +
  labs(title = "Age histogram") +
  theme(plot.title = element_text(hjust = 0.5))

## "sex"
table(usedata$sex)

## 
##    F    M 
## 1480 4070

## "diseaPH"
table(usedata$diseaPH)

## 
##    1    2    3    4 
## 1332 2590  814  814

ggplot(as.data.frame(table(usedata$diseaPH)),aes(x = Var1,y = Freq)) +
  geom_bar(stat = "identity",fill = "blue") +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  labs(x = "diseaPH",y = "Frequency",title = "diseaPH bar plot") +
  theme(plot.title = element_text(hjust = 0.5))

## "SampleCategry"
table(usedata$SampleCategry)

## 
## All other     First      Last 
##      2516      1776      1258

ggplot(as.data.frame(table(usedata$SampleCategry)),aes(x = Var1,y = Freq)) +
  geom_bar(stat = "identity",fill = "blue") +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  labs(x = "Sample Categry",y = "Frequency",title = "Sample Categry bar plot") +
  theme(plot.title = element_text(hjust = 0.5))

## 例如相关性 疾病结果和年龄，严重，生病天数的分析和各个指标的相关性分析
## 分类变量和数值变量间的相关性不好讨论
## 1:outcom 与年龄的关系
ggplot(usedata) +
  geom_boxplot(aes(x = outcom,y = age),position="dodge") +
  labs(title = "outcom boxplot") +
  theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(age~outcom,usedata))

##               Df  Sum Sq Mean Sq F value Pr(>F)    
## outcom         2  223227  111613   444.9 <2e-16 ***
## Residuals   5547 1391651     251                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 结果显示p值《0.05，拒绝原假设，认为三种outcom的均值之间有显著想差异

## 均值多重比较
pairwise.t.test(usedata$age,usedata$outcom)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  usedata$age and usedata$outcom 
## 
##        Fatal   Mild   
## Mild   < 2e-16 -      
## Severe 4.2e-15 < 2e-16
## 
## P value adjustment method: holm

## 结果说明各两两组之间的均值是有显著差异的

## 2:outcom 与 生病天数  的关系

ggplot(usedata) +
  geom_boxplot(aes(x = outcom,y = illnessday,fill = outcom),position="dodge") +
  labs(title = "outcom boxplot") +
  theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(illnessday~outcom,usedata))

##               Df Sum Sq Mean Sq F value Pr(>F)    
## outcom         2  42129   21064   167.7 <2e-16 ***
## Residuals   5547 696624     126                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 结果显示p值《0.05，拒绝原假设，认为三种outcom的均值之间有显著想差异

## 均值多重比较
pairwise.t.test(usedata$illnessday,usedata$outcom)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  usedata$illnessday and usedata$outcom 
## 
##        Fatal  Mild  
## Mild   <2e-16 -     
## Severe 0.023  <2e-16
## 
## P value adjustment method: holm

## 结果说明各两两组之间的均值是有显著差异的


table(usedata$outcom)

## 
##  Fatal   Mild Severe 
##   1480    962   3108

##  因为每个分组的样本数量不一样，所以无法做cor.test()相关系数分析



## 3:outcom 与 factor group  的关系
table(usedata$outcom,usedata$factorgroup)

##         
##          白介素 干扰素 集落刺激因子 趋化因子 生长因子 肿瘤坏死因子
##   Fatal     280     20           60      520      520           80
##   Mild      182     13           39      338      338           52
##   Severe    588     42          126     1092     1092          168

## 两变量是否独立的卡方检验
chisq.test(usedata$outcom,usedata$factorgroup)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$outcom and usedata$factorgroup
## X-squared = 0, df = 10, p-value = 1

## p-value = 1，接受原假设，说明两变量是独立的

## 马赛克图也说明了这一点两变量是独立的
mosaicplot(~ factorgroup+outcom,data = usedata)

## 4:outcom 与 diseaPH 的关系
table(usedata$outcom,usedata$diseaPH)

##         
##             1    2    3    4
##   Fatal   370  592  370  148
##   Mild    370  592    0    0
##   Severe  592 1406  444  666

## 两变量是否独立的卡方检验
chisq.test(usedata$outcom,usedata$diseaPH)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$outcom and usedata$diseaPH
## X-squared = 686.36, df = 6, p-value < 2.2e-16

## p-value <0.05，拒绝原假设，说明两变量是有关系的
mosaicplot(~ diseaPH+outcom,data = usedata)

## 对应分析

ca1 <- ca(~ diseaPH+outcom,data = usedata)
summary(ca1)

## 
## Principal inertias (eigenvalues):
## 
##  dim    value      %   cum%   scree plot               
##  1      0.090181  72.9  72.9  ******************       
##  2      0.033489  27.1 100.0  *******                  
##         -------- -----                                 
##  Total: 0.123669 100.0                                 
## 
## 
## Rows:
##     name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
## 1 |    1 |  240 1000  169 | -284 927 215 |   80  73  46 |
## 2 |    2 |  467 1000   89 | -141 843 103 |  -61 157  51 |
## 3 |    3 |  147 1000  363 |  428 598 298 |  351 402 539 |
## 4 |    4 |  147 1000  379 |  486 740 384 | -288 260 364 |
## 
## Columns:
##     name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
## 1 | Fatl |  267 1000  210 |   94  90  26 |  298 910 707 |
## 2 | Mild |  173 1000  600 | -653 997 820 |  -36   3   7 |
## 3 | Sevr |  560 1000  190 |  158 592 154 | -131 408 286 |

## 对应分析图
par(family = "STKaiti",mfrow = c(1,1))
plot.ca(ca1,main = "~ diseaPH+outcom")

## 点离的越近，说明因素之间关系越强



## 5:outcom 与 SampleCategry 的关系
table(usedata$outcom,usedata$SampleCategry)

##         
##          All other First Last
##   Fatal        592   444  444
##   Mild         444   518    0
##   Severe      1480   814  814

## 两变量是否独立的卡方检验
chisq.test(usedata$outcom,usedata$SampleCategry)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$outcom and usedata$SampleCategry
## X-squared = 461.31, df = 4, p-value < 2.2e-16

## p-value <0.05，拒绝原假设，说明两变量是有关系的
mosaicplot(~ SampleCategry+outcom,data = usedata)

## 对应分析
ca1 <- ca(~ SampleCategry+outcom,data = usedata)
summary(ca1)

## 
## Principal inertias (eigenvalues):
## 
##  dim    value      %   cum%   scree plot               
##  1      0.078922  95.0  95.0  ************************ 
##  2      0.004197   5.0 100.0  *                        
##         -------- -----                                 
##  Total: 0.083119 100.0                                 
## 
## 
## Rows:
##     name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
## 1 | Allt |  453 1000   28 |  -11  23   1 |   71 977 546 |
## 2 | Frst |  320 1000  386 | -311 962 391 |  -62  38 289 |
## 3 | Last |  227 1000  586 |  460 986 608 |  -55  14 165 |
## 
## Columns:
##     name   mass  qlt  inr    k=1  cor ctr    k=2 cor ctr  
## 1 | Fatl |  267 1000  100 |  144  666  70 | -102 334 663 |
## 2 | Mild |  173 1000  784 | -613 1000 826 |   -5   0   1 |
## 3 | Sevr |  560 1000  116 |  121  853 104 |   50 147 336 |

## 对应分析图
par(family = "STKaiti",mfrow = c(1,1))
plot.ca(ca1,main = "~ SampleCategry+outcom")

## 点离的越近，说明因素之间关系越强




## 对基因做T－test 求P－value  和q－value 再做p.adjust做多重假设检验矫正
## 1:factor 与年龄的关系
ggplot(usedata) +
  geom_boxplot(aes(x = factor,y = age),position="dodge") +
  labs(title = "factor boxplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_bw(base_size = 9) +
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5))

## 1:factor 与年龄的关系 没啥关系

table(usedata$factor)

## 
##      ANG     BDNF      BLC    CCL11    CCL24    CCL26  Ck-b8-1     CSF1 
##       75       75       75       75       75       75       75       75 
##     CSF2     CSF3   CX3CL1      EGF   ENA-78    FGF-4    FGF-6    FGF-7 
##       75       75       75       75       75       75       75       75 
##    FGF-9 Flt-3-LG    GCP-2     GDNF      GRO    GRO-a      HGF    I-309 
##       75       75       75       75       75       75       75       75 
##    IFN-g    IGF-I  IGFBP-1  IGFBP-2  IGFBP-3  IGFBP-4    IL-10 IL-12p70 
##       75       75       75       75       75       75       75       75 
##    IL-13    IL-15    IL-16    IL-1a    IL-1b     IL-2     IL-3     IL-4 
##       75       75       75       75       75       75       75       75 
##     IL-5     IL-6     IL-7     IL-8    IP-10    LIGHT    MCP-1    MCP-2 
##       75       75       75       75       75       75       75       75 
##    MCP-3    MCP-4      MDC      MIG   MIP-1b   MIP-1d   MIP-3a    NAP-2 
##       75       75       75       75       75       75       75       75 
##     NT-3     NT-4      OPG      OSM     PARC  PDGF-BB     PIGF   RANTES 
##       75       75       75       75       75       75       75       75 
##      SCF    SDF-1     TARC   TGF-b1   TGF-b2   TGF-b3     THPO    TNF-a 
##       75       75       75       75       75       75       75       75 
##    TNF-b     VEGF 
##       75       75

## 在factor 与年龄的数据中，所有的数据都是一样的，没必要分析这个关系,因为相关系数都为1
factorname <- names(table(usedata$factor))
cor.test(usedata$age[usedata$factor == factorname[1]],
         usedata$age[usedata$factor == factorname[2]])

## 
##  Pearson's product-moment correlation
## 
## data:  usedata$age[usedata$factor == factorname[1]] and usedata$age[usedata$factor == factorname[2]]
## t = Inf, df = 73, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  1 1
## sample estimates:
## cor 
##   1

## ## 2:factor 与 生病天数  的关系

ggplot(usedata) +
  geom_boxplot(aes(x = factorgroup,y = illnessday),position="dodge") +
  labs(title = "factor boxplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(usedata) +
  geom_boxplot(aes(x = factor,y = illnessday),position="dodge") +
  labs(title = "factor boxplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5))

## 2:factor 与 生病天数  的关系    没啥关系

table(usedata$factorgroup)

## 
##       白介素       干扰素 集落刺激因子     趋化因子     生长因子 
##         1050           75          225         1950         1950 
## 肿瘤坏死因子 
##          300

##  因为每个分组的样本数量不一样，所以无法做cor.test()相关系数分析


## 3: factor  与 outcom  的关系
table(usedata$factor,usedata$outcom)

##           
##            Fatal Mild Severe
##   ANG         20   13     42
##   BDNF        20   13     42
##   BLC         20   13     42
##   CCL11       20   13     42
##   CCL24       20   13     42
##   CCL26       20   13     42
##   Ck-b8-1     20   13     42
##   CSF1        20   13     42
##   CSF2        20   13     42
##   CSF3        20   13     42
##   CX3CL1      20   13     42
##   EGF         20   13     42
##   ENA-78      20   13     42
##   FGF-4       20   13     42
##   FGF-6       20   13     42
##   FGF-7       20   13     42
##   FGF-9       20   13     42
##   Flt-3-LG    20   13     42
##   GCP-2       20   13     42
##   GDNF        20   13     42
##   GRO         20   13     42
##   GRO-a       20   13     42
##   HGF         20   13     42
##   I-309       20   13     42
##   IFN-g       20   13     42
##   IGF-I       20   13     42
##   IGFBP-1     20   13     42
##   IGFBP-2     20   13     42
##   IGFBP-3     20   13     42
##   IGFBP-4     20   13     42
##   IL-10       20   13     42
##   IL-12p70    20   13     42
##   IL-13       20   13     42
##   IL-15       20   13     42
##   IL-16       20   13     42
##   IL-1a       20   13     42
##   IL-1b       20   13     42
##   IL-2        20   13     42
##   IL-3        20   13     42
##   IL-4        20   13     42
##   IL-5        20   13     42
##   IL-6        20   13     42
##   IL-7        20   13     42
##   IL-8        20   13     42
##   IP-10       20   13     42
##   LIGHT       20   13     42
##   MCP-1       20   13     42
##   MCP-2       20   13     42
##   MCP-3       20   13     42
##   MCP-4       20   13     42
##   MDC         20   13     42
##   MIG         20   13     42
##   MIP-1b      20   13     42
##   MIP-1d      20   13     42
##   MIP-3a      20   13     42
##   NAP-2       20   13     42
##   NT-3        20   13     42
##   NT-4        20   13     42
##   OPG         20   13     42
##   OSM         20   13     42
##   PARC        20   13     42
##   PDGF-BB     20   13     42
##   PIGF        20   13     42
##   RANTES      20   13     42
##   SCF         20   13     42
##   SDF-1       20   13     42
##   TARC        20   13     42
##   TGF-b1      20   13     42
##   TGF-b2      20   13     42
##   TGF-b3      20   13     42
##   THPO        20   13     42
##   TNF-a       20   13     42
##   TNF-b       20   13     42
##   VEGF        20   13     42

## 两变量是否独立的卡方检验
chisq.test(usedata$factor,usedata$outcom)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$factor and usedata$outcom
## X-squared = 0, df = 146, p-value = 1

## p-value = 1，接受原假设，说明两变量是独立的

## 马赛克图也说明了这一点两变量是独立的
mosaicplot(~ factor+outcom,data = usedata)

## 4:factor 与 diseaPH 的关系
table(usedata$factor,usedata$diseaPH)

##           
##             1  2  3  4
##   ANG      18 35 11 11
##   BDNF     18 35 11 11
##   BLC      18 35 11 11
##   CCL11    18 35 11 11
##   CCL24    18 35 11 11
##   CCL26    18 35 11 11
##   Ck-b8-1  18 35 11 11
##   CSF1     18 35 11 11
##   CSF2     18 35 11 11
##   CSF3     18 35 11 11
##   CX3CL1   18 35 11 11
##   EGF      18 35 11 11
##   ENA-78   18 35 11 11
##   FGF-4    18 35 11 11
##   FGF-6    18 35 11 11
##   FGF-7    18 35 11 11
##   FGF-9    18 35 11 11
##   Flt-3-LG 18 35 11 11
##   GCP-2    18 35 11 11
##   GDNF     18 35 11 11
##   GRO      18 35 11 11
##   GRO-a    18 35 11 11
##   HGF      18 35 11 11
##   I-309    18 35 11 11
##   IFN-g    18 35 11 11
##   IGF-I    18 35 11 11
##   IGFBP-1  18 35 11 11
##   IGFBP-2  18 35 11 11
##   IGFBP-3  18 35 11 11
##   IGFBP-4  18 35 11 11
##   IL-10    18 35 11 11
##   IL-12p70 18 35 11 11
##   IL-13    18 35 11 11
##   IL-15    18 35 11 11
##   IL-16    18 35 11 11
##   IL-1a    18 35 11 11
##   IL-1b    18 35 11 11
##   IL-2     18 35 11 11
##   IL-3     18 35 11 11
##   IL-4     18 35 11 11
##   IL-5     18 35 11 11
##   IL-6     18 35 11 11
##   IL-7     18 35 11 11
##   IL-8     18 35 11 11
##   IP-10    18 35 11 11
##   LIGHT    18 35 11 11
##   MCP-1    18 35 11 11
##   MCP-2    18 35 11 11
##   MCP-3    18 35 11 11
##   MCP-4    18 35 11 11
##   MDC      18 35 11 11
##   MIG      18 35 11 11
##   MIP-1b   18 35 11 11
##   MIP-1d   18 35 11 11
##   MIP-3a   18 35 11 11
##   NAP-2    18 35 11 11
##   NT-3     18 35 11 11
##   NT-4     18 35 11 11
##   OPG      18 35 11 11
##   OSM      18 35 11 11
##   PARC     18 35 11 11
##   PDGF-BB  18 35 11 11
##   PIGF     18 35 11 11
##   RANTES   18 35 11 11
##   SCF      18 35 11 11
##   SDF-1    18 35 11 11
##   TARC     18 35 11 11
##   TGF-b1   18 35 11 11
##   TGF-b2   18 35 11 11
##   TGF-b3   18 35 11 11
##   THPO     18 35 11 11
##   TNF-a    18 35 11 11
##   TNF-b    18 35 11 11
##   VEGF     18 35 11 11

## 两变量是否独立的卡方检验
chisq.test(usedata$factor,usedata$diseaPH)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$factor and usedata$diseaPH
## X-squared = 0, df = 219, p-value = 1

## p-value ＝1，接受原假设，说明两变量是没有关系的
mosaicplot(~ diseaPH+factor,data = usedata)

## 4:factor group 与 diseaPH 的关系
table(usedata$factorgroup,usedata$diseaPH)

##               
##                  1   2   3   4
##   白介素       252 490 154 154
##   干扰素        18  35  11  11
##   集落刺激因子  54 105  33  33
##   趋化因子     468 910 286 286
##   生长因子     468 910 286 286
##   肿瘤坏死因子  72 140  44  44

## 两变量是否独立的卡方检验
chisq.test(usedata$factorgroup,usedata$diseaPH)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$factorgroup and usedata$diseaPH
## X-squared = 0, df = 15, p-value = 1

## p-value ＝1，接受原假设，说明两变量是没有关系的
mosaicplot(~ factorgroup + diseaPH,data = usedata)

## 4:factor 与 SampleCategry 的关系
table(usedata$factor,usedata$SampleCategry)

##           
##            All other First Last
##   ANG             34    24   17
##   BDNF            34    24   17
##   BLC             34    24   17
##   CCL11           34    24   17
##   CCL24           34    24   17
##   CCL26           34    24   17
##   Ck-b8-1         34    24   17
##   CSF1            34    24   17
##   CSF2            34    24   17
##   CSF3            34    24   17
##   CX3CL1          34    24   17
##   EGF             34    24   17
##   ENA-78          34    24   17
##   FGF-4           34    24   17
##   FGF-6           34    24   17
##   FGF-7           34    24   17
##   FGF-9           34    24   17
##   Flt-3-LG        34    24   17
##   GCP-2           34    24   17
##   GDNF            34    24   17
##   GRO             34    24   17
##   GRO-a           34    24   17
##   HGF             34    24   17
##   I-309           34    24   17
##   IFN-g           34    24   17
##   IGF-I           34    24   17
##   IGFBP-1         34    24   17
##   IGFBP-2         34    24   17
##   IGFBP-3         34    24   17
##   IGFBP-4         34    24   17
##   IL-10           34    24   17
##   IL-12p70        34    24   17
##   IL-13           34    24   17
##   IL-15           34    24   17
##   IL-16           34    24   17
##   IL-1a           34    24   17
##   IL-1b           34    24   17
##   IL-2            34    24   17
##   IL-3            34    24   17
##   IL-4            34    24   17
##   IL-5            34    24   17
##   IL-6            34    24   17
##   IL-7            34    24   17
##   IL-8            34    24   17
##   IP-10           34    24   17
##   LIGHT           34    24   17
##   MCP-1           34    24   17
##   MCP-2           34    24   17
##   MCP-3           34    24   17
##   MCP-4           34    24   17
##   MDC             34    24   17
##   MIG             34    24   17
##   MIP-1b          34    24   17
##   MIP-1d          34    24   17
##   MIP-3a          34    24   17
##   NAP-2           34    24   17
##   NT-3            34    24   17
##   NT-4            34    24   17
##   OPG             34    24   17
##   OSM             34    24   17
##   PARC            34    24   17
##   PDGF-BB         34    24   17
##   PIGF            34    24   17
##   RANTES          34    24   17
##   SCF             34    24   17
##   SDF-1           34    24   17
##   TARC            34    24   17
##   TGF-b1          34    24   17
##   TGF-b2          34    24   17
##   TGF-b3          34    24   17
##   THPO            34    24   17
##   TNF-a           34    24   17
##   TNF-b           34    24   17
##   VEGF            34    24   17

## 两变量是否独立的卡方检验
chisq.test(usedata$factor,usedata$SampleCategry)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$factor and usedata$SampleCategry
## X-squared = 0, df = 146, p-value = 1

## p-value ＝1，接受原假设，说明两变量是没有关系的
mosaicplot(~ factor+SampleCategry,data = usedata)

## 4:factor group 与 diseaPH 的关系
table(usedata$factorgroup,usedata$SampleCategry)

##               
##                All other First Last
##   白介素             476   336  238
##   干扰素              34    24   17
##   集落刺激因子       102    72   51
##   趋化因子           884   624  442
##   生长因子           884   624  442
##   肿瘤坏死因子       136    96   68

## 两变量是否独立的卡方检验
chisq.test(usedata$factorgroup,usedata$SampleCategry)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$factorgroup and usedata$SampleCategry
## X-squared = 0, df = 10, p-value = 1

## p-value ＝1，接受原假设，说明两变量是没有关系的
mosaicplot(~ factorgroup + SampleCategry,data = usedata)

## "diseaPH"和"SampleCategry"之间的关系
table(usedata$diseaPH,usedata$SampleCategry)

##    
##     All other First Last
##   1        74  1258    0
##   2      1776   518  296
##   3       518     0  296
##   4       148     0  666

## 两变量是否独立的卡方检验
chisq.test(usedata$diseaPH,usedata$SampleCategry)

## 
##  Pearson's Chi-squared test
## 
## data:  usedata$diseaPH and usedata$SampleCategry
## X-squared = 4996.7, df = 6, p-value < 2.2e-16

## p-value < 0.05，拒绝原假设，说明两变量是有关系，不独立
mosaicplot(~ diseaPH + SampleCategry,data = usedata)

## 对应分析
ca1 <- ca(~ SampleCategry+diseaPH,data = usedata)
summary(ca1)

## 
## Principal inertias (eigenvalues):
## 
##  dim    value      %   cum%   scree plot               
##  1      0.630351  70.0  70.0  ******************       
##  2      0.269950  30.0 100.0  *******                  
##         -------- -----                                 
##  Total: 0.900301 100.0                                 
## 
## 
## Rows:
##     name   mass  qlt  inr     k=1 cor ctr    k=2 cor ctr  
## 1 | Allt |  453 1000  193 |   320 266  73 | -531 734 473 |
## 2 | Frst |  320 1000  452 | -1106 961 621 |  223  39  59 |
## 3 | Last |  227 1000  354 |   922 604 306 |  746 396 468 |
## 
## Columns:
##     name   mass  qlt  inr     k=1 cor ctr    k=2 cor ctr  
## 1 |    1 |  240 1000  478 | -1293 932 637 |  349  68 109 |
## 2 |    2 |  467 1000  114 |   130  77  13 | -450 923 351 |
## 3 |    3 |  147 1000   78 |   678 966 107 | -128  34   9 |
## 4 |    4 |  147 1000  330 |  1023 517 244 |  989 483 532 |

## 对应分析图
par(family = "STKaiti",mfrow = c(1,1))
plot.ca(ca1,main = "~ SampleCategry+outcom")

## 点离的越近，说明因素之间关系越强

## "diseaPH" 和 age的关系
ggplot(usedata) +
  geom_boxplot(aes(x = factor(diseaPH),y = age),position="dodge") +
  labs(x = "diseaPH",title = "diseaPH boxplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(age~diseaPH,usedata))

##               Df  Sum Sq Mean Sq F value Pr(>F)    
## diseaPH        1   43564   43564   153.8 <2e-16 ***
## Residuals   5548 1571314     283                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 结果显示p值《0.05，拒绝原假设，认为三种outcom的均值之间有显著性差异

## 均值多重比较
pairwise.t.test(usedata$age,usedata$diseaPH)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  usedata$age and usedata$diseaPH 
## 
##   1       2       3      
## 2 2.9e-10 -       -      
## 3 3.0e-14 0.00128 -      
## 4 < 2e-16 3.1e-14 0.00043
## 
## P value adjustment method: holm

## 结果说明各两两组之间的均值是有显著差异的

table(usedata$diseaPH)

## 
##    1    2    3    4 
## 1332 2590  814  814

## 因子样本数量不一样，无法使用cor.test()分析

## "diseaPH" 和 illnessday的关系
ggplot(usedata) +
  geom_boxplot(aes(x = factor(diseaPH),y = illnessday),position="dodge") +
  labs(x = "diseaPH",title = "diseaPH boxplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(illnessday~diseaPH,usedata))

##               Df Sum Sq Mean Sq F value Pr(>F)    
## diseaPH        1 556581  556581   16951 <2e-16 ***
## Residuals   5548 182172      33                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 结果显示p值《0.05，拒绝原假设，认为三种outcom的均值之间有显著性差异

## 均值多重比较
pairwise.t.test(usedata$illnessday,usedata$diseaPH)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  usedata$illnessday and usedata$diseaPH 
## 
##   1      2      3     
## 2 <2e-16 -      -     
## 3 <2e-16 <2e-16 -     
## 4 <2e-16 <2e-16 <2e-16
## 
## P value adjustment method: holm

## 结果说明各两两组之间的均值是有显著差异的


## SampleCategry 和 age的关系
ggplot(usedata) +
  geom_boxplot(aes(x = SampleCategry,y = age),position="dodge") +
  labs(x = "SampleCategry",title = "diseaPH boxplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(age~SampleCategry,usedata))

##                 Df  Sum Sq Mean Sq F value Pr(>F)    
## SampleCategry    2   22508   11254    39.2 <2e-16 ***
## Residuals     5547 1592369     287                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 结果显示p值《0.05，拒绝原假设，认为三种outcom的均值之间有显著性差异

## 均值多重比较
pairwise.t.test(usedata$age,usedata$SampleCategry)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  usedata$age and usedata$SampleCategry 
## 
##       All other First 
## First 1e-06     -     
## Last  1e-06     <2e-16
## 
## P value adjustment method: holm

## 结果说明各两两组之间的均值是有显著差异的


table(usedata$SampleCategry)

## 
## All other     First      Last 
##      2516      1776      1258

## 因子样本数量不一样，无法使用cor.test()分析

## "diseaPH" 和 illnessday的关系
ggplot(usedata) +
  geom_boxplot(aes(x = SampleCategry,y = illnessday),position="dodge") +
  labs(x = "SampleCategry",title = "diseaPH boxplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5))

## 单因子方差分析
summary(aov(illnessday~SampleCategry,usedata))

##                 Df Sum Sq Mean Sq F value Pr(>F)    
## SampleCategry    2 390813  195407    3115 <2e-16 ***
## Residuals     5547 347940      63                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 结果显示p值《0.05，拒绝原假设，认为三种outcom的均值之间有显著性差异

## 均值多重比较
pairwise.t.test(usedata$illnessday,usedata$SampleCategry)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  usedata$illnessday and usedata$SampleCategry 
## 
##       All other First 
## First <2e-16    -     
## Last  <2e-16    <2e-16
## 
## P value adjustment method: holm

## 结果说明各两两组之间的均值是有显著差异的



## 关联规则，发现频繁项集
library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

##数据准备
table(usedata$illnessday)

## 
##   3   4   5   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21 
##  74 148  74 296 296 222 222 592 370 296  74 444 222 222  74  74 222  74 
##  22  23  24  25  26  27  28  29  31  32  34  39  42  44  48  68 
## 148 148  74  74  74 148  74  74  74 148  74  74 148  74  74  74

table(usedata$age)

## 
##   6  21  31  34  38  39  41  42  43  45  48  51  52  55  57  58  67  76 
##  74 148 296 296 370 148 148 222 148 148 222 296 148 296 740 296 296 444 
##  77  82 
## 518 296

table(usedata[,4])

## 
##  Fatal   Mild Severe 
##   1480    962   3108

usedata2 <- apply(usedata,2,factor)
rulerdata <- as(data.frame(usedata2),"transactions")
inspect(rulerdata[1:10])

##      items                      transactionID
## [1]  {factor=ANG,                            
##       factorgroup=生长因子,                  
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             1 
## [2]  {factor=BDNF,                           
##       factorgroup=生长因子,                  
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             2 
## [3]  {factor=BLC,                            
##       factorgroup=趋化因子,                  
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             3 
## [4]  {factor=CCL11,                          
##       factorgroup=趋化因子,                  
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             4 
## [5]  {factor=CCL24,                          
##       factorgroup=趋化因子,                  
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             5 
## [6]  {factor=CCL26,                          
##       factorgroup=趋化因子,                  
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             6 
## [7]  {factor=Ck-b8-1,                        
##       factorgroup=趋化因子,                  
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             7 
## [8]  {factor=CSF1,                           
##       factorgroup=集落刺激因子,              
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             8 
## [9]  {factor=CSF2,                           
##       factorgroup=集落刺激因子,              
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             9 
## [10] {factor=CSF3,                           
##       factorgroup=集落刺激因子,              
##       illnessday=10,                         
##       outcom=Fatal,                          
##       age=67,                                
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             10

par(cex = 0.8)
itemFrequencyPlot(rulerdata,topN = 35,main = "频繁的项")

## 挖掘关联规则

guize <- apriori(rulerdata,parameter = list(supp = 0.25, ##支持度
                                            conf = 0.3, ## 置信度
                                            minlen = 3))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.3    0.1    1 none FALSE            TRUE       5    0.25      3
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1387 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[146 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [3 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

summary(guize)

## set of 3 rules
## 
## rule length distribution (lhs + rhs):sizes
## 3 
## 3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3       3       3       3       3       3 
## 
## summary of quality measures:
##     support         confidence          lift      
##  Min.   :0.2667   Min.   :0.7143   Min.   :1.136  
##  1st Qu.:0.2667   1st Qu.:0.7418   1st Qu.:1.356  
##  Median :0.2667   Median :0.7692   Median :1.576  
##  Mean   :0.2667   Mean   :0.7723   Mean   :1.453  
##  3rd Qu.:0.2667   3rd Qu.:0.8013   3rd Qu.:1.612  
##  Max.   :0.2667   Max.   :0.8333   Max.   :1.648  
## 
## mining info:
##       data ntransactions support confidence
##  rulerdata          5550    0.25        0.3

inspect(guize)

##     lhs                                    rhs                      
## [1] {diseaPH=2,SampleCategry=All other} => {sex=M}                  
## [2] {sex=M,SampleCategry=All other}     => {diseaPH=2}              
## [3] {sex=M,diseaPH=2}                   => {SampleCategry=All other}
##     support   confidence lift    
## [1] 0.2666667 0.8333333  1.136364
## [2] 0.2666667 0.7692308  1.648352
## [3] 0.2666667 0.7142857  1.575630

guize <- apriori(rulerdata,parameter = list(supp = 0.1, ##支持度
                                            conf = 0.1, ## 置信度
                                            minlen = 2),
                 appearance = list(rhs = c("outcom=Fatal","outcom=Mild","outcom=Severe"),
                                   default = "lhs"))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.1    0.1    1 none FALSE            TRUE       5     0.1      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 555 
## 
## set item appearances ...[3 item(s)] done [0.00s].
## set transactions ...[146 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [17 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [26 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

summary(guize)

## set of 26 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
## 16  9  1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.000   2.423   3.000   4.000 
## 
## summary of quality measures:
##     support         confidence          lift       
##  Min.   :0.1059   Min.   :0.1818   Min.   :0.6818  
##  1st Qu.:0.1100   1st Qu.:0.4614   1st Qu.:0.9514  
##  Median :0.1400   Median :0.5600   Median :1.0453  
##  Mean   :0.1652   Mean   :0.5309   Mean   :1.0974  
##  3rd Qu.:0.1942   3rd Qu.:0.6364   3rd Qu.:1.1507  
##  Max.   :0.4667   Max.   :1.0000   Max.   :1.8750  
## 
## mining info:
##       data ntransactions support confidence
##  rulerdata          5550     0.1        0.1

inspect(sort(guize,by = "lift"))

##      lhs                          rhs               support confidence      lift
## [1]  {sex=F}                   => {outcom=Fatal}  0.1333333  0.5000000 1.8750000
## [2]  {sex=M,                                                                    
##       diseaPH=4}               => {outcom=Severe} 0.1066667  1.0000000 1.7857143
## [3]  {diseaPH=4}               => {outcom=Severe} 0.1200000  0.8181818 1.4610390
## [4]  {sex=M,                                                                    
##       SampleCategry=Last}      => {outcom=Severe} 0.1200000  0.7500000 1.3392857
## [5]  {diseaPH=2}               => {outcom=Mild}   0.1066667  0.2285714 1.3186813
## [6]  {sex=M,                                                                    
##       SampleCategry=All other} => {outcom=Severe} 0.2266667  0.6538462 1.1675824
## [7]  {SampleCategry=Last}      => {outcom=Severe} 0.1466667  0.6470588 1.1554622
## [8]  {sex=M}                   => {outcom=Severe} 0.4666667  0.6363636 1.1363636
## [9]  {factorgroup=生长因子,                                                     
##       sex=M}                   => {outcom=Severe} 0.1639640  0.6363636 1.1363636
## [10] {factorgroup=趋化因子,                                                     
##       sex=M}                   => {outcom=Severe} 0.1639640  0.6363636 1.1363636
## [11] {sex=M,                                                                    
##       diseaPH=2,                                                                
##       SampleCategry=All other} => {outcom=Severe} 0.1600000  0.6000000 1.0714286
## [12] {SampleCategry=All other} => {outcom=Severe} 0.2666667  0.5882353 1.0504202
## [13] {sex=M}                   => {outcom=Mild}   0.1333333  0.1818182 1.0489510
## [14] {diseaPH=2,                                                                
##       SampleCategry=All other} => {outcom=Severe} 0.1866667  0.5833333 1.0416667
## [15] {sex=M,                                                                    
##       diseaPH=2}               => {outcom=Severe} 0.2133333  0.5714286 1.0204082
## [16] {factorgroup=白介素}      => {outcom=Severe} 0.1059459  0.5600000 1.0000000
## [17] {factorgroup=生长因子}    => {outcom=Severe} 0.1967568  0.5600000 1.0000000
## [18] {factorgroup=趋化因子}    => {outcom=Severe} 0.1967568  0.5600000 1.0000000
## [19] {diseaPH=2}               => {outcom=Severe} 0.2533333  0.5428571 0.9693878
## [20] {sex=M,                                                                    
##       SampleCategry=First}     => {outcom=Severe} 0.1200000  0.5294118 0.9453782
## [21] {SampleCategry=All other} => {outcom=Fatal}  0.1066667  0.2352941 0.8823529
## [22] {diseaPH=2}               => {outcom=Fatal}  0.1066667  0.2285714 0.8571429
## [23] {diseaPH=1,                                                                
##       SampleCategry=First}     => {outcom=Severe} 0.1066667  0.4705882 0.8403361
## [24] {SampleCategry=First}     => {outcom=Severe} 0.1466667  0.4583333 0.8184524
## [25] {diseaPH=1}               => {outcom=Severe} 0.1066667  0.4444444 0.7936508
## [26] {sex=M}                   => {outcom=Fatal}  0.1333333  0.1818182 0.6818182

## 这些是发现的规则，提升度lift大于1的规则是有意义的规则



## jiang数值分组
usedata3 <- usedata
usedata3$illnessday <- cut_number(usedata3$illnessday,4)
usedata3$age <- cut_number(usedata3$age,4)
usedata3 <- apply(usedata3,2,factor)
rulerdata2 <- as(data.frame(usedata3),"transactions")
inspect(rulerdata2[1:10])

##      items                      transactionID
## [1]  {factor=ANG,                            
##       factorgroup=生长因子,                  
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             1 
## [2]  {factor=BDNF,                           
##       factorgroup=生长因子,                  
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             2 
## [3]  {factor=BLC,                            
##       factorgroup=趋化因子,                  
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             3 
## [4]  {factor=CCL11,                          
##       factorgroup=趋化因子,                  
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             4 
## [5]  {factor=CCL24,                          
##       factorgroup=趋化因子,                  
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             5 
## [6]  {factor=CCL26,                          
##       factorgroup=趋化因子,                  
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             6 
## [7]  {factor=Ck-b8-1,                        
##       factorgroup=趋化因子,                  
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             7 
## [8]  {factor=CSF1,                           
##       factorgroup=集落刺激因子,              
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             8 
## [9]  {factor=CSF2,                           
##       factorgroup=集落刺激因子,              
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             9 
## [10] {factor=CSF3,                           
##       factorgroup=集落刺激因子,              
##       illnessday=[3,11],                     
##       outcom=Fatal,                          
##       age=(55,67],                           
##       sex=M,                                 
##       diseaPH=1,                             
##       SampleCategry=All other}             10

par(cex = 0.8)
itemFrequencyPlot(rulerdata2,topN = 35,main = "频繁的项")

## 挖掘关联规则

guize <- apriori(rulerdata2,parameter = list(supp = 0.2, ##支持度
                                            conf = 0.3, ## 置信度
                                            minlen = 3))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.3    0.1    1 none FALSE            TRUE       5     0.2      3
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1110 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[100 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [19 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

summary(guize)

## set of 15 rules
## 
## rule length distribution (lhs + rhs):sizes
##  3 
## 15 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3       3       3       3       3       3 
## 
## summary of quality measures:
##     support         confidence          lift       
##  Min.   :0.2000   Min.   :0.4571   Min.   :0.9796  
##  1st Qu.:0.2133   1st Qu.:0.6126   1st Qu.:1.1423  
##  Median :0.2267   Median :0.8095   Median :1.2784  
##  Mean   :0.2267   Mean   :0.7603   Mean   :1.7365  
##  3rd Qu.:0.2267   3rd Qu.:0.8938   3rd Qu.:2.3270  
##  Max.   :0.2667   Max.   :1.0000   Max.   :3.3730  
## 
## mining info:
##        data ntransactions support confidence
##  rulerdata2          5550     0.2        0.3

inspect(guize)

##      lhs                          rhs                         support confidence      lift
## [1]  {illnessday=(11,15],                                                                 
##       diseaPH=2}               => {sex=M}                   0.2000000  0.9375000 1.2784091
## [2]  {illnessday=(11,15],                                                                 
##       sex=M}                   => {diseaPH=2}               0.2000000  1.0000000 2.1428571
## [3]  {sex=M,                                                                              
##       diseaPH=2}               => {illnessday=(11,15]}      0.2000000  0.5357143 2.5111607
## [4]  {diseaPH=1,                                                                          
##       SampleCategry=First}     => {illnessday=[3,11]}       0.2266667  1.0000000 2.8846154
## [5]  {illnessday=[3,11],                                                                  
##       diseaPH=1}               => {SampleCategry=First}     0.2266667  0.9444444 2.9513889
## [6]  {illnessday=[3,11],                                                                  
##       SampleCategry=First}     => {diseaPH=1}               0.2266667  0.8095238 3.3730159
## [7]  {diseaPH=2,                                                                          
##       SampleCategry=All other} => {sex=M}                   0.2666667  0.8333333 1.1363636
## [8]  {sex=M,                                                                              
##       SampleCategry=All other} => {diseaPH=2}               0.2666667  0.7692308 1.6483516
## [9]  {sex=M,                                                                              
##       diseaPH=2}               => {SampleCategry=All other} 0.2666667  0.7142857 1.5756303
## [10] {outcom=Severe,                                                                      
##       SampleCategry=All other} => {sex=M}                   0.2266667  0.8500000 1.1590909
## [11] {sex=M,                                                                              
##       SampleCategry=All other} => {outcom=Severe}           0.2266667  0.6538462 1.1675824
## [12] {outcom=Severe,                                                                      
##       sex=M}                   => {SampleCategry=All other} 0.2266667  0.4857143 1.0714286
## [13] {outcom=Severe,                                                                      
##       diseaPH=2}               => {sex=M}                   0.2133333  0.8421053 1.1483254
## [14] {sex=M,                                                                              
##       diseaPH=2}               => {outcom=Severe}           0.2133333  0.5714286 1.0204082
## [15] {outcom=Severe,                                                                      
##       sex=M}                   => {diseaPH=2}               0.2133333  0.4571429 0.9795918

guize <- apriori(rulerdata2,parameter = list(supp = 0.1, ##支持度
                                            conf = 0.1, ## 置信度
                                            minlen = 3),
                 appearance = list(rhs = c("outcom=Fatal","outcom=Mild","outcom=Severe"),
                                   default = "lhs"))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.1    0.1    1 none FALSE            TRUE       5     0.1      3
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 555 
## 
## set item appearances ...[3 item(s)] done [0.00s].
## set transactions ...[100 item(s), 5550 transaction(s)] done [0.00s].
## sorting and recoding items ... [23 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [25 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

summary(guize)

## set of 25 rules
## 
## rule length distribution (lhs + rhs):sizes
##  3  4 
## 20  5 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0     3.0     3.0     3.2     3.0     4.0 
## 
## summary of quality measures:
##     support         confidence          lift       
##  Min.   :0.1067   Min.   :0.4286   Min.   :0.7653  
##  1st Qu.:0.1067   1st Qu.:0.5556   1st Qu.:0.9921  
##  Median :0.1200   Median :0.6250   Median :1.1161  
##  Mean   :0.1374   Mean   :0.6744   Mean   :1.2043  
##  3rd Qu.:0.1600   3rd Qu.:0.7500   3rd Qu.:1.3393  
##  Max.   :0.2267   Max.   :1.0000   Max.   :1.7857  
## 
## mining info:
##        data ntransactions support confidence
##  rulerdata2          5550     0.1        0.1

inspect(sort(guize,by = "lift"))

##      lhs                          rhs               support confidence      lift
## [1]  {sex=M,                                                                    
##       diseaPH=4}               => {outcom=Severe} 0.1066667  1.0000000 1.7857143
## [2]  {illnessday=(23,68],                                                       
##       sex=M}                   => {outcom=Severe} 0.1600000  1.0000000 1.7857143
## [3]  {age=(67,82],                                                              
##       sex=M}                   => {outcom=Severe} 0.1866667  1.0000000 1.7857143
## [4]  {illnessday=(23,68],                                                       
##       sex=M,                                                                    
##       diseaPH=4}               => {outcom=Severe} 0.1066667  1.0000000 1.7857143
## [5]  {illnessday=(23,68],                                                       
##       sex=M,                                                                    
##       SampleCategry=Last}      => {outcom=Severe} 0.1066667  1.0000000 1.7857143
## [6]  {illnessday=(23,68],                                                       
##       diseaPH=4}               => {outcom=Severe} 0.1200000  0.8181818 1.4610390
## [7]  {illnessday=(23,68],                                                       
##       SampleCategry=Last}      => {outcom=Severe} 0.1200000  0.7500000 1.3392857
## [8]  {sex=M,                                                                    
##       SampleCategry=Last}      => {outcom=Severe} 0.1200000  0.7500000 1.3392857
## [9]  {age=[6,41],                                                               
##       sex=M}                   => {outcom=Severe} 0.1200000  0.6923077 1.2362637
## [10] {sex=M,                                                                    
##       SampleCategry=All other} => {outcom=Severe} 0.2266667  0.6538462 1.1675824
## [11] {factorgroup=生长因子,                                                     
##       sex=M}                   => {outcom=Severe} 0.1639640  0.6363636 1.1363636
## [12] {factorgroup=趋化因子,                                                     
##       sex=M}                   => {outcom=Severe} 0.1639640  0.6363636 1.1363636
## [13] {illnessday=(11,15],                                                       
##       diseaPH=2}               => {outcom=Severe} 0.1333333  0.6250000 1.1160714
## [14] {illnessday=(11,15],                                                       
##       sex=M}                   => {outcom=Severe} 0.1200000  0.6000000 1.0714286
## [15] {illnessday=(11,15],                                                       
##       sex=M,                                                                    
##       diseaPH=2}               => {outcom=Severe} 0.1200000  0.6000000 1.0714286
## [16] {sex=M,                                                                    
##       diseaPH=2,                                                                
##       SampleCategry=All other} => {outcom=Severe} 0.1600000  0.6000000 1.0714286
## [17] {diseaPH=2,                                                                
##       SampleCategry=All other} => {outcom=Severe} 0.1866667  0.5833333 1.0416667
## [18] {sex=M,                                                                    
##       diseaPH=2}               => {outcom=Severe} 0.2133333  0.5714286 1.0204082
## [19] {illnessday=[3,11],                                                        
##       sex=M}                   => {outcom=Severe} 0.1333333  0.5555556 0.9920635
## [20] {sex=M,                                                                    
##       SampleCategry=First}     => {outcom=Severe} 0.1200000  0.5294118 0.9453782
## [21] {diseaPH=1,                                                                
##       SampleCategry=First}     => {outcom=Severe} 0.1066667  0.4705882 0.8403361
## [22] {illnessday=[3,11],                                                        
##       diseaPH=1,                                                                
##       SampleCategry=First}     => {outcom=Severe} 0.1066667  0.4705882 0.8403361
## [23] {illnessday=[3,11],                                                        
##       diseaPH=1}               => {outcom=Severe} 0.1066667  0.4444444 0.7936508
## [24] {age=(41,55],                                                              
##       sex=M}                   => {outcom=Severe} 0.1066667  0.4444444 0.7936508
## [25] {illnessday=[3,11],                                                        
##       SampleCategry=First}     => {outcom=Severe} 0.1200000  0.4285714 0.7653061

## 这些是发现的规则，提升度lift大于1的规则是有意义的规则

fenxi_1.R

daitu

Sat Apr 8 23:27:34 2017