Predicao de interacoes geneticas em E. coli.

Índice

1-Introducao

2-Conjunto completo de GI

3-Sub-conjunto estatisticamente confiavel de GI

1 - Introducao

2 - Conjunto completo de dados de GI

# Load the full genetic-interaction (GI) table from the working directory.
gi <- read.csv("GI.csv")
# Logical mask over the column names: TRUE for "geneA", "geneB" and "tipo".
excluiVars <- names(gi) %in% c("geneA", "geneB", "tipo")
# Keep every column that is not flagged above.
newGI <- gi[!excluiVars]

Grafico betInt_min x S-score para primeira analise visual

# Scatter plot of the S-score (vertical axis) against betInt_min (horizontal axis).
plot(s.score ~ betInt_min, data = newGI, xlab = "betInt_min", ylab = "S-score")

plot of chunk unnamed-chunk-2

# Bin betInt_min into 10 intervals, then draw one S-score box per interval.
betCut <- cut(newGI[["betInt_min"]], breaks = 10)
boxplot(newGI[["s.score"]] ~ betCut, xlab = "betInt_min", ylab = "S-score")

plot of chunk unnamed-chunk-2


# Flag the "geneA", "geneB", "tipo" and both score columns; every remaining
# column of gi is used as a predictor.
excluiVars <- names(gi) %in% c("geneA", "geneB", "tipo", "z.score", "s.score")
centralidades <- gi[!excluiVars]
# Response vectors for the two regressions.
zscore <- gi$z.score
sscore <- gi$s.score
# A matrix term in an lm() formula yields one coefficient per column, labelled
# "cent<column name>" in the summaries.
cent <- as.matrix(centralidades)
# Regress each score on all predictor columns at once.
lmS <- lm(sscore ~ cent)
lmZ <- lm(zscore ~ cent)

Regressao linear para o S - score

# Print the S-score model summary: residual quantiles, coefficient table and
# overall fit statistics.
summary(lmS)
## 
## Call:
## lm(formula = sscore ~ cent)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.865  -0.062   0.084   0.208  16.419 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -9.74e-02   5.45e-03  -17.86  < 2e-16 ***
## centdegInt_min  2.00e-03   4.05e-04    4.94  7.8e-07 ***
## centdegInt_max  8.94e-04   3.28e-04    2.72  0.00643 ** 
## centbetInt_min -7.78e+01   1.26e+01   -6.16  7.3e-10 ***
## centbetInt_max  3.10e-02   4.77e-01    0.07  0.94816    
## centdegppi_min -7.05e-03   1.21e-03   -5.82  5.8e-09 ***
## centdegppi_max  3.52e-04   4.44e-04    0.79  0.42708    
## centdegreg_min  1.21e-02   1.53e-02    0.79  0.42906    
## centdegreg_max -1.02e-03   3.36e-04   -3.02  0.00256 ** 
## centdegmet_min -1.72e-04   1.34e-03   -0.13  0.89787    
## centdegmet_max  3.56e-04   2.54e-04    1.40  0.16105    
## centregin_min  -3.39e-02   1.56e-02   -2.17  0.02999 *  
## centregin_max   1.98e-02   2.73e-03    7.26  3.8e-13 ***
## centregout_min -4.97e-02   1.56e-02   -3.19  0.00141 ** 
## centregout_max        NA         NA      NA       NA    
## centmetin_min  -5.61e-03   1.50e-03   -3.75  0.00018 ***
## centmetin_max  -1.61e-03   3.42e-04   -4.70  2.7e-06 ***
## centmetout_min  6.43e-04   1.50e-03    0.43  0.66915    
## centmetout_max        NA         NA      NA       NA    
## centbetppi_min  6.93e+00   5.61e+00    1.24  0.21664    
## centbetppi_max -4.03e+00   1.16e+00   -3.49  0.00049 ***
## centbetreg_min  1.61e+05   1.38e+04   11.61  < 2e-16 ***
## centbetreg_max  5.64e+02   2.20e+02    2.56  0.01056 *  
## centbetmet_min  8.07e+00   3.28e+00    2.46  0.01375 *  
## centbetmet_max -1.91e-01   3.86e-01   -0.50  0.62058    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.747 on 224377 degrees of freedom
## Multiple R-squared:  0.00335,    Adjusted R-squared:  0.00326 
## F-statistic: 34.3 on 22 and 224377 DF,  p-value: <2e-16

Na tabela acima, a centralidade com menor p-value é o betreg_min (< 2e-16), seguida de regin_max (3.8e-13) e de betInt_min (7.3e-10). A correlação entre o betInt_min e o S-score:

# Select betInt_min by column name instead of position, so the result does not
# depend on the column order of the input table.
cor(cent[, "betInt_min"], sscore)
## [1] -0.03791

Regressao linear para o Z - score

# Print the Z-score model summary: residual quantiles, coefficient table and
# overall fit statistics.
summary(lmZ)
## 
## Call:
## lm(formula = zscore ~ cent)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36.92  -0.11   0.12   0.31  22.96 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.76e-02   7.62e-03    2.31  0.02079 *  
## centdegInt_min  1.23e-03   5.66e-04    2.18  0.02930 *  
## centdegInt_max -5.52e-04   4.58e-04   -1.20  0.22831    
## centbetInt_min -6.83e+01   1.77e+01   -3.87  0.00011 ***
## centbetInt_max  7.23e-01   6.66e-01    1.09  0.27735    
## centdegppi_min -7.49e-03   1.69e-03   -4.43  9.6e-06 ***
## centdegppi_max  5.50e-04   6.20e-04    0.89  0.37493    
## centdegreg_min  2.50e-02   2.13e-02    1.17  0.24165    
## centdegreg_max  2.73e-04   4.70e-04    0.58  0.56114    
## centdegmet_min  1.76e-03   1.87e-03    0.94  0.34750    
## centdegmet_max  9.78e-04   3.55e-04    2.76  0.00586 ** 
## centregin_min  -5.04e-02   2.18e-02   -2.31  0.02089 *  
## centregin_max   7.81e-03   3.81e-03    2.05  0.04036 *  
## centregout_min -7.86e-02   2.17e-02   -3.62  0.00030 ***
## centregout_max        NA         NA      NA       NA    
## centmetin_min  -7.75e-03   2.09e-03   -3.71  0.00021 ***
## centmetin_max  -1.31e-03   4.78e-04   -2.75  0.00600 ** 
## centmetout_min -7.43e-04   2.10e-03   -0.35  0.72335    
## centmetout_max        NA         NA      NA       NA    
## centbetppi_min  1.41e+01   7.83e+00    1.79  0.07275 .  
## centbetppi_max -3.59e+00   1.62e+00   -2.22  0.02641 *  
## centbetreg_min  2.06e+05   1.93e+04   10.65  < 2e-16 ***
## centbetreg_max  5.30e+02   3.08e+02    1.72  0.08551 .  
## centbetmet_min  9.71e+00   4.58e+00    2.12  0.03383 *  
## centbetmet_max -1.39e-01   5.39e-01   -0.26  0.79685    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.04 on 224377 degrees of freedom
## Multiple R-squared:  0.0027, Adjusted R-squared:  0.0026 
## F-statistic: 27.6 on 22 and 224377 DF,  p-value: <2e-16

Na tabela acima, a centralidade com menor p-value é o betreg_min (< 2e-16), seguida de degppi_min (9.6e-06); o betInt_min também é significativo (0.00011). A correlação entre o betInt_min e o Z-score:

# Select betInt_min by column name instead of position, so the result does not
# depend on the column order of the input table.
cor(cent[, "betInt_min"], zscore)
## [1] -0.03744

Regressao linear somente para o betInt_min

# Simple linear regression of the S-score on betInt_min alone. The predictor is
# written as centralidades$betInt_min, which is the term label shown in the
# printed summary.
lmbetInt <- lm(sscore ~ centralidades$betInt_min)
summary(lmbetInt)
## 
## Call:
## lm(formula = sscore ~ centralidades$betInt_min)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.958  -0.061   0.082   0.207  16.424 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -0.0804     0.0016   -50.4   <2e-16 ***
## centralidades$betInt_min -62.5586     3.4814   -18.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.748 on 224398 degrees of freedom
## Multiple R-squared:  0.00144,    Adjusted R-squared:  0.00143 
## F-statistic:  323 on 1 and 224398 DF,  p-value: <2e-16

Coeficiente de correlação entre o S-score e o betInt_min

# Correlation (cor()'s default method) between the S-score and betInt_min.
cor(sscore, centralidades[["betInt_min"]])
## [1] -0.03791

3 - Sub-conjunto estatisticamente confiavel de GI

Geracao das tabelas para S-score, Z-score e Log-score

# Load the high-confidence GI table from the working directory.
highScoreGI <- read.csv("scoreHighConfidenceCentralidades2.csv")

# Build one table per score. Each table drops "gene1", "gene2", "scoreNominal"
# and the two other score columns, keeping a single score column plus all
# remaining columns.
# NOTE: sscore and zscore were numeric vectors in the previous section; from
# here on they are data frames.
excluiVarsScore <- names(highScoreGI) %in% c("gene1", "gene2", "Zscore", "Log",
    "scoreNominal")
sscore <- highScoreGI[!excluiVarsScore]

excluiVarsZcore <- names(highScoreGI) %in% c("gene1", "gene2", "Sscore", "Log",
    "scoreNominal")
zscore <- highScoreGI[!excluiVarsZcore]

excluiVarsLog <- names(highScoreGI) %in% c("gene1", "gene2", "Sscore", "Zscore",
    "scoreNominal")
logscore <- highScoreGI[!excluiVarsLog]

Linear Model

# Scatter plot of Sscore (vertical axis) against min_betInt (horizontal axis)
# for the high-confidence subset. No xlab/ylab are given, so the axis labels
# come from the formula terms.
plot(sscore$Sscore ~ sscore$min_betInt)

plot of chunk unnamed-chunk-10

# Simple linear regression of Sscore on min_betInt for the high-confidence
# subset. This overwrites the lmS object fitted on the full data set earlier.
lmS <- lm(sscore$Sscore ~ sscore$min_betInt)
summary(lmS)
## 
## Call:
## lm(formula = sscore$Sscore ~ sscore$min_betInt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.909  -2.161  -0.012   1.476  20.423 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -4.080      0.149  -27.43   <2e-16 ***
## sscore$min_betInt  175.798    391.254    0.45     0.65    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.7 on 1108 degrees of freedom
## Multiple R-squared:  0.000182,   Adjusted R-squared:  -0.00072 
## F-statistic: 0.202 on 1 and 1108 DF,  p-value: 0.653

Correlacao entre o S-score e o min_betInt

# Correlation (cor()'s default method) between Sscore and min_betInt in the
# high-confidence subset.
with(sscore, cor(Sscore, min_betInt))
## [1] 0.0135

Observação: pelo menos para os dados com alta significância, as árvores do Weka deram bons resultados: AUC = 69% na validação cruzada (cross-validation) e AUC = 74% no conjunto de treinamento.