library(knitr)
library(ggplot2)
library(caret)
library(class)
library(gmodels)
library(kableExtra)
library(neuralnet)
library(kernlab)
library(rpart)
library(rpart.plot)
library(randomForest)
library(rattle)
library(RColorBrewer)
workingDir <- getwd()
dataDir <- file.path(workingDir,"../Data/")
dir(path = dataDir)
[1] "allpairsfeta.rds" "SGA10_BremKruglyak2005_v1.rds"
[3] "SGA16_BremKruglyak2005_v1.rds"
Cargamos los datos
SGA10 <- readRDS(paste0(dataDir, "SGA10_BremKruglyak2005_v1.rds"))
SGA16 <- readRDS(paste0(dataDir, "SGA16_BremKruglyak2005_v1.rds"))
allpaired <- readRDS(paste0(dataDir, "allpairsfeta.rds"))
dim(SGA10)
[1] 227 5
head(SGA10)
summary(SGA10)
GENE1 GENE2 SGAsco
Length:227 Length:227 Min. :-0.92210
Class :character Class :character 1st Qu.:-0.14750
Mode :character Mode :character Median :-0.06810
Mean :-0.13711
3rd Qu.:-0.03765
Max. :-0.00720
PCC NPC
Min. :-0.04256 Min. :0.01352
1st Qu.: 0.19316 1st Qu.:0.02127
Median : 0.28407 Median :0.02784
Mean : 0.34374 Mean :0.03239
3rd Qu.: 0.44519 3rd Qu.:0.03732
Max. : 0.93761 Max. :0.13079
str(SGA10)
'data.frame': 227 obs. of 5 variables:
$ GENE1 : chr "YPL045W" "YLR085C" "YPL091W" "YPR120C" ...
$ GENE2 : chr "YER114C" "YMR167W" "YOR237W" "YHR031C" ...
$ SGAsco: num -0.0072 -0.2766 -0.0201 -0.3233 -0.3202 ...
$ PCC : num 0.495 0.409 0.275 0.238 0.404 ...
$ NPC : num 0.0464 0.0479 0.0151 0.0198 0.0197 ...
dim(SGA16)
[1] 757 5
head(SGA16)
str(SGA16)
'data.frame': 757 obs. of 5 variables:
$ GENE1 : chr "YDR311W" "YIL039W" "YDR228C" "YJR093C" ...
$ GENE2 : chr "YDR131C" "YDR108W" "YLR277C" "YPR114W" ...
$ SGAsco: num -0.0571 -0.1693 -0.1522 -0.1016 -0.1578 ...
$ PCC : num 0.585 0.434 0.594 0.204 0.406 ...
$ NPC : num 0.0483 0.0335 0.026 0.0254 0.033 ...
dim(allpaired)
[1] 17461095 14
head(allpaired)
tail(allpaired)
str(allpaired)
'data.frame': 17461095 obs. of 14 variables:
$ V1 : chr "Q0045" "Q0045" "Q0045" "Q0045" ...
$ V2 : chr "Q0050" "Q0055" "Q0060" "Q0065" ...
$ Homology : chr "1" "1" "1" "1" ...
$ PhysicalInt : chr "0" "0" "0" "0" ...
$ CommonReg : chr "0" "0" "0" "0" ...
$ Colocalization: chr "0" "0" "0" "0" ...
$ Phenotype : chr "0" "0" "0" "0" ...
$ Ohnology : chr "0" "0" "0" "0" ...
$ Complexes : chr "0" "0" "0" "0" ...
$ SameFunction : chr "0" "0" "0" "0" ...
$ SameProtein : chr "0" "0" "0" "0" ...
$ fcBP : num 0.294 0.345 0.392 0.392 0.387 ...
$ fcCC : num 0.324 0.432 0.432 0.432 0.432 ...
$ fcMF : num 0.485 0.818 0.871 0.871 0.871 ...
allpaired$V1 <- NULL
allpaired$V2 <- NULL
allpaired[] <- lapply(allpaired, as.numeric)
str(allpaired)
'data.frame': 17461095 obs. of 12 variables:
$ Homology : num 1 1 1 1 1 0 0 0 0 0 ...
$ PhysicalInt : num 0 0 0 0 0 0 1 1 1 0 ...
$ CommonReg : num 0 0 0 0 0 0 0 0 0 0 ...
$ Colocalization: num 0 0 0 0 0 0 0 0 0 0 ...
$ Phenotype : num 0 0 0 0 0 0 0 0 0 0 ...
$ Ohnology : num 0 0 0 0 0 0 0 0 0 0 ...
$ Complexes : num 0 0 0 0 0 0 0 0 0 0 ...
$ SameFunction : num 0 0 0 0 0 0 0 0 0 0 ...
$ SameProtein : num 0 0 0 0 0 0 0 0 0 0 ...
$ fcBP : num 0.294 0.345 0.392 0.392 0.387 ...
$ fcCC : num 0.324 0.432 0.432 0.432 0.432 ...
$ fcMF : num 0.485 0.818 0.871 0.871 0.871 ...
allpaired_10 <- allpaired[row.names(allpaired) %in% row.names(SGA10), ]
dim(allpaired_10)
[1] 190 12
Vemos que en lugar de 227 registros hay 190, debido a que algunos genes han cambiado de identificador
allpaired_16 <- allpaired[row.names(allpaired) %in% row.names(SGA16), ]
dim(allpaired_16)
[1] 635 12
Vemos que en lugar de 757 registros, hay 635, debido a que algunos genes han cambiado de identificador
SGA10_full <- merge(SGA10, allpaired_10, by=0)
rownames(SGA10_full) <- SGA10_full$Row.names
SGA10_full$Row.names <- NULL
dim(SGA10_full)
[1] 190 17
head(SGA10_full)
SGA16_full <- merge(SGA16, allpaired_16, by=0)
rownames(SGA16_full) <- SGA16_full$Row.names
SGA16_full$Row.names <- NULL
dim(SGA16_full)
[1] 635 17
head(SGA16_full)
SGA10_full$GENE1 <- NULL
SGA10_full$GENE2 <- NULL
SGA16_full$GENE1 <- NULL
SGA16_full$GENE2 <- NULL
mod1 <- lm(SGAsco ~. -NPC -fcMF , SGA10_full)
summary(mod1)
Call:
lm(formula = SGAsco ~ . - NPC - fcMF, data = SGA10_full)
Residuals:
Min 1Q Median 3Q Max
-0.49977 -0.04257 0.02682 0.06667 0.41775
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.06514 0.02438 -2.671 0.008261 **
PCC 0.06176 0.05360 1.152 0.250823
Homology 0.04954 0.06646 0.745 0.457038
PhysicalInt 0.05644 0.08797 0.642 0.521982
CommonReg -0.01943 0.03538 -0.549 0.583582
Colocalization -0.11032 0.04828 -2.285 0.023504 *
Phenotype -0.06806 0.06928 -0.982 0.327226
Ohnology -0.20133 0.08714 -2.310 0.022018 *
Complexes 0.15479 0.08802 1.759 0.080385 .
SameFunction -0.15191 0.04264 -3.563 0.000472 ***
SameProtein -0.28151 0.07911 -3.559 0.000479 ***
fcBP -0.16859 0.07486 -2.252 0.025549 *
fcCC -0.10993 0.05058 -2.173 0.031071 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1271 on 177 degrees of freedom
Multiple R-squared: 0.5301, Adjusted R-squared: 0.4982
F-statistic: 16.64 on 12 and 177 DF, p-value: < 2.2e-16
(RMSE.lm1 <- sqrt(mean(mod1$residuals^2)))
[1] 0.1226716
mod2 <- lm(SGAsco ~ Ohnology + Complexes + SameProtein +
SameFunction + fcBP + fcCC + NPC, SGA10_full)
summary(mod2)
Call:
lm(formula = SGAsco ~ Ohnology + Complexes + SameProtein + SameFunction +
fcBP + fcCC + NPC, data = SGA10_full)
Residuals:
Min 1Q Median 3Q Max
-0.52410 -0.03507 0.02359 0.06385 0.35944
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.003326 0.024082 0.138 0.890296
Ohnology -0.207477 0.071485 -2.902 0.004161 **
Complexes 0.166463 0.071605 2.325 0.021190 *
SameProtein -0.253821 0.070119 -3.620 0.000382 ***
SameFunction -0.114987 0.041052 -2.801 0.005645 **
fcBP -0.118014 0.061202 -1.928 0.055379 .
fcCC -0.124799 0.048765 -2.559 0.011305 *
NPC -1.754990 0.579212 -3.030 0.002802 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1251 on 182 degrees of freedom
Multiple R-squared: 0.5321, Adjusted R-squared: 0.5141
F-statistic: 29.57 on 7 and 182 DF, p-value: < 2.2e-16
(RMSE.lm2 <- sqrt(mean(mod2$residuals^2)))
[1] 0.1224099
Los resultados, si bien no son idénticos, son muy parecidos.
bound_10 <- floor(nrow(SGA10_full)*0.67)
bound_16 <- floor(nrow(SGA16_full)*0.67)
seed <- c(12345)
set.seed(seed)
row_train_10 <- sample(seq_len(nrow(SGA10_full)), size = bound_10)
set.seed(seed)
row_train_16 <- sample(seq_len(nrow(SGA16_full)), size = bound_16)
SGA10_train <- SGA10_full[row_train_10, ]
SGA10_test <- SGA10_full[-row_train_10, ]
SGA16_train <- SGA16_full[row_train_16, ]
SGA16_test <- SGA16_full[-row_train_16, ]
set.seed(seed)
knn_model_PCC_1 <- train(SGAsco~. -NPC, data=SGA10_train, method="knn")
plot(knn_model_PCC_1)
k_PCC_1 <- rownames(knn_model_PCC_1$bestTune)
knn_model_PCC_1$results[k_PCC_1,]
pred_PCC_1 <- knn_model_PCC_1 %>% predict(SGA10_test)
tabla_knn <- data.frame(Option = c("Default", "Param"), PCC = NA, NPC = NA)
(tabla_knn[1,2] <- RMSE(pred_PCC_1, SGA10_test$SGAsco))
[1] 0.1465124
set.seed(seed)
knn_model_NPC_1 <- train(SGAsco~. -PCC, data=SGA10_train, method="knn")
plot(knn_model_NPC_1)
k_NPC_1 <- rownames(knn_model_NPC_1$bestTune)
knn_model_NPC_1$results[k_NPC_1,]
pred_NPC_1 <- knn_model_NPC_1 %>% predict(SGA10_test)
(tabla_knn[1,3] <- RMSE(pred_NPC_1, SGA10_test$SGAsco))
[1] 0.1417423
set.seed(seed)
knn_model_PCC_2 <- train(SGAsco~. -NPC, data=SGA10_train, method="knn",
trControl = trainControl("cv", number = 10),
preProcess = c("center","scale"),
tuneLength = 15)
plot(knn_model_PCC_2)
k_PCC_2 <- rownames(knn_model_PCC_2$bestTune)
knn_model_PCC_2$results[k_PCC_2,]
pred_PCC_2 <- knn_model_PCC_2 %>% predict(SGA10_test)
(tabla_knn[2,2] <- RMSE(pred_PCC_2, SGA10_test$SGAsco))
[1] 0.1488071
set.seed(seed)
knn_model_NPC_2 <- train(SGAsco~. -PCC, data=SGA10_train, method="knn",
trControl = trainControl("cv", number = 10),
preProcess = c("center","scale"),
tuneLength = 15)
plot(knn_model_NPC_2)
k_NPC_2 <- rownames(knn_model_NPC_2$bestTune)
knn_model_NPC_2$results[k_NPC_2,]
pred_NPC_2 <- knn_model_NPC_2 %>% predict(SGA10_test)
(tabla_knn[2,3] <- RMSE(pred_NPC_2, SGA10_test$SGAsco))
[1] 0.1522432
kable(tabla_knn) %>%
kable_styling(full_width = F, position = "left")
Option | PCC | NPC |
---|---|---|
Default | 0.1465124 | 0.1417423 |
Param | 0.1488071 | 0.1522432 |
set.seed(seed)
ann_model_1_PCC <- neuralnet(SGAsco ~. -NPC, data=SGA10_train)
plot(ann_model_1_PCC, rep="best")
set.seed(seed)
ann_model_1_NPC <- neuralnet(SGAsco ~. -PCC, data=SGA10_train)
plot(ann_model_1_NPC, rep="best")
tabla_ann <- data.frame(hidden = c(1,2,3), PCC = NA, NPC = NA)
ann_results_1_PCC <- compute(ann_model_1_PCC, SGA10_test)
ann_pred_1_PCC <- ann_results_1_PCC$net.result
(tabla_ann[1,2] <- RMSE(ann_pred_1_PCC, SGA10_test$SGAsco))
[1] 0.2053081
ann_results_1_NPC <- compute(ann_model_1_NPC, SGA10_test)
ann_pred_1_NPC <- ann_results_1_NPC$net.result
(tabla_ann[1,3] <- RMSE(ann_pred_1_NPC, SGA10_test$SGAsco))
[1] 0.2159961
kable(tabla_ann) %>%
kable_styling(full_width = F, position = "left")
hidden | PCC | NPC |
---|---|---|
1 | 0.2053081 | 0.2159961 |
2 | 0.1745996 | 0.1867948 |
3 | 0.1942752 | 0.2170448 |
vanilladot
)set.seed(seed)
svm_model_PCC_1 <- ksvm(SGAsco ~. -NPC, data=SGA10_train, kernel = "vanilladot")
Setting default kernel parameters
set.seed(seed)
svm_model_NPC_1 <- ksvm(SGAsco ~. -PCC, data=SGA10_train, kernel = "vanilladot")
Setting default kernel parameters
tabla_svm <- data.frame(kernel = c("vanilladot", "rbfdot", "laplacedot" ), PCC = NA, NPC = NA)
svm_pred_PCC_1 <- predict(svm_model_PCC_1, SGA10_test)
(tabla_svm[1,2] <- RMSE(svm_pred_PCC_1, SGA10_test$SGAsco))
[1] 0.1653479
svm_pred_NPC_1 <- predict(svm_model_NPC_1, SGA10_test)
(tabla_svm[1,3] <- RMSE(svm_pred_NPC_1, SGA10_test$SGAsco))
[1] 0.1654833
rbfdot
(Radial Basis kernel “Gaussian”)set.seed(seed)
svm_model_PCC_2 <- ksvm(SGAsco ~. -NPC, data=SGA10_train, kernel = "rbfdot")
svm_pred_PCC_2 <- predict(svm_model_PCC_2, SGA10_test)
(tabla_svm[2,2] <- RMSE(svm_pred_PCC_2, SGA10_test$SGAsco))
[1] 0.1832892
set.seed(seed)
svm_model_NPC_2 <- ksvm(SGAsco ~. -PCC, data=SGA10_train, kernel = "rbfdot")
svm_pred_NPC_2 <- predict(svm_model_NPC_2, SGA10_test)
(tabla_svm[2,3] <- RMSE(svm_pred_NPC_2, SGA10_test$SGAsco))
[1] 0.1861551
laplacedot
(Laplacian kernel)set.seed(seed)
svm_model_PCC_3 <- ksvm(SGAsco~. -NPC, data=SGA10_train, kernel = "laplacedot")
svm_pred_PCC_3 <- predict(svm_model_PCC_3, SGA10_test)
(tabla_svm[3,2] <- RMSE(svm_pred_PCC_3, SGA10_test$SGAsco))
[1] 0.1691735
set.seed(seed)
svm_model_NPC_3 <- ksvm(SGAsco ~. -PCC, data=SGA10_train, kernel = "laplacedot")
svm_pred_NPC_3 <- predict(svm_model_NPC_3, SGA10_test)
(tabla_svm[3,3] <- RMSE(svm_pred_NPC_3, SGA10_test$SGAsco))
[1] 0.1790633
kable(tabla_svm) %>%
kable_styling(full_width = F, position = "left")
kernel | PCC | NPC |
---|---|---|
vanilladot | 0.1653479 | 0.1654833 |
rbfdot | 0.1832892 | 0.1861551 |
laplacedot | 0.1691735 | 0.1790633 |
set.seed(seed)
ad_model_PCC_1 <- rpart(SGAsco ~. -NPC, data=SGA10_train,)
set.seed(seed)
ad_model_NPC_1 <- rpart(SGAsco ~. -PCC, data=SGA10_train)
fancyRpartPlot(ad_model_PCC_1, caption = NULL)
fancyRpartPlot(ad_model_NPC_1, caption = NULL)
tabla_ad <- data.frame(Option = c("Default", "Param"), PCC = NA, NPC = NA)
ad_pred_PCC_1 <- predict(ad_model_PCC_1, SGA10_test)
(tabla_ad[1,2] <- RMSE(ad_pred_PCC_1, SGA10_test$SGAsco))
[1] 0.165828
ad_pred_NPC_1 <- predict(ad_model_NPC_1, SGA10_test)
(tabla_ad[1,3] <- RMSE(ad_pred_NPC_1, SGA10_test$SGAsco))
[1] 0.1667751
set.seed(seed)
ad_model_PCC_2 <- rpart(SGAsco ~. -NPC, data=SGA10_train,
minsplit = 2, minbucket = 1)
fancyRpartPlot(ad_model_PCC_2, caption = NULL)
ad_pred_PCC_2 <- predict(ad_model_PCC_2, SGA10_test)
(tabla_ad[2,2] <- RMSE(ad_pred_PCC_2, SGA10_test$SGAsco))
[1] 0.1418558
set.seed(seed)
ad_model_NPC_2 <- rpart(SGAsco ~. -PCC, data=SGA10_train,
minsplit = 2, minbucket = 1)
fancyRpartPlot(ad_model_NPC_2, caption = NULL)
ad_pred_NPC_2 <- predict(ad_model_NPC_2, SGA10_test)
(tabla_ad[2,3] <- RMSE(ad_pred_NPC_2, SGA10_test$SGAsco))
[1] 0.1376482
kable(tabla_ad) %>%
kable_styling(full_width = F, position = "left")
Option | PCC | NPC |
---|---|---|
Default | 0.1658280 | 0.1667751 |
Param | 0.1418558 | 0.1376482 |
set.seed(seed)
rf_model_PCC_1<- randomForest(SGAsco ~. -NPC, data=SGA10_train)
set.seed(seed)
rf_model_NPC_1<- randomForest(SGAsco ~. -PCC, data=SGA10_train)
plot(rf_model_PCC_1)
plot(rf_model_NPC_1)
varImpPlot(rf_model_PCC_1)
varImpPlot(rf_model_NPC_1)
tabla_rf <- data.frame(Option = c("Default", "Param"), PCC = NA, NPC = NA)
rf_pred_PCC_1 <- predict(rf_model_PCC_1, SGA10_test)
(tabla_rf[1,2] <- RMSE(rf_pred_PCC_1, SGA10_test$SGAsco))
[1] 0.1420198
rf_pred_NPC_1 <- predict(rf_model_NPC_1, SGA10_test)
(tabla_rf[1,3] <- RMSE(rf_pred_NPC_1, SGA10_test$SGAsco))
[1] 0.1431044
set.seed(seed)
rf_model_PCC_1000 <- randomForest(SGAsco ~. -NPC, data=SGA10_train, ntree=1000)
rf_pred_PCC_1000 <- predict(rf_model_PCC_1000, SGA10_test)
(tabla_rf[2,2] <- RMSE(rf_pred_PCC_1000, SGA10_test$SGAsco))
[1] 0.1432169
set.seed(seed)
rf_model_NPC_1000 <- randomForest(SGAsco ~. -PCC, data=SGA10_train, ntree=1000)
rf_pred_NPC_1000 <- predict(rf_model_NPC_1000, SGA10_test)
(tabla_rf[2,3] <- RMSE(rf_pred_NPC_1000, SGA10_test$SGAsco))
[1] 0.1425534
kable(tabla_rf) %>%
kable_styling(full_width = F, position = "left")
Option | PCC | NPC |
---|---|---|
Default | 0.1420198 | 0.1431044 |
Param | 0.1432169 | 0.1425534 |
RMSE.lm1
[1] 0.1226716
Algoritmo k-nn
kable(tabla_knn) %>%
kable_styling(full_width = F, position = "left")
Option | PCC | NPC |
---|---|---|
Default | 0.1465124 | 0.1417423 |
Param | 0.1488071 | 0.1522432 |
Algoritmo Artificial Neural Networks
kable(tabla_ann) %>%
kable_styling(full_width = F, position = "left")
hidden | PCC | NPC |
---|---|---|
1 | 0.2053081 | 0.2159961 |
2 | 0.1745996 | 0.1867948 |
3 | 0.1942752 | 0.2170448 |
Algoritmo Support Vector Machine
kable(tabla_svm) %>%
kable_styling(full_width = F, position = "left")
kernel | PCC | NPC |
---|---|---|
vanilladot | 0.1653479 | 0.1654833 |
rbfdot | 0.1832892 | 0.1861551 |
laplacedot | 0.1691735 | 0.1790633 |
Algoritmo Árbol de decisión
kable(tabla_ad) %>%
kable_styling(full_width = F, position = "left")
Option | PCC | NPC |
---|---|---|
Default | 0.1658280 | 0.1667751 |
Param | 0.1418558 | 0.1376482 |
Algoritmo Random Forest
kable(tabla_rf) %>%
kable_styling(full_width = F, position = "left")
Option | PCC | NPC |
---|---|---|
Default | 0.1420198 | 0.1431044 |
Param | 0.1432169 | 0.1425534 |
set.seed(seed)
ad_model_NPC_16 <- rpart(SGAsco ~. -PCC, data=SGA16_train,
minsplit = 2, minbucket = 1)
fancyRpartPlot(ad_model_NPC_16, caption = NULL)
ad_pred_NPC_16 <- predict(ad_model_NPC_16, SGA16_test)
RMSE(ad_pred_NPC_16, SGA16_test$SGAsco)
[1] 0.1561926