setwd('/Users/anjideng1/Desktop/MALDI/output_Maaike')
ref_df <- read.csv('Maaike_ref_group.csv')
unk_df <- read.csv('Maaike_unk_group.csv')
pearson_df <- read.csv('Maaike_group_pearson.csv')
ref_df$group <- 'ref'
unk_df$group <- 'unk'
head(pearson_df)
## unk_index ref_index pearson
## 1 1 1 0.99944020
## 2 1 2 0.05420860
## 3 1 3 0.05241749
## 4 1 4 0.00000000
## 5 1 5 0.00000000
## 6 1 6 0.00000000
head(ref_df)
## mzvalues intensity group_index group
## 1 22.90899 0.9287653 1 ref
## 2 23.03312 0.9391848 1 ref
## 3 23.15724 0.9540300 1 ref
## 4 23.28137 0.9611189 1 ref
## 5 23.40549 0.9539884 1 ref
## 6 23.52962 0.9410457 1 ref
sum(pearson_df$pearson != 0)
## [1] 769
pearson_df_filtered <- pearson_df[pearson_df$pearson != 0, ]
pearson_df_filtered <- pearson_df_filtered[order(-pearson_df_filtered$pearson), ]
leverage_mz <- NULL
for (i in 1:nrow(pearson_df_filtered)) {
ref_index <- pearson_df_filtered[i, 'ref_index']
unk_index <- pearson_df_filtered[i, 'unk_index']
ref_grp <- ref_df[ref_df$group_index == ref_index, c(1, 2, 4)]
unk_grp <- unk_df[unk_df$group_index == unk_index, c(1, 2, 4)]
df <- rbind(ref_grp, unk_grp)
mod <- lm(intensity ~ mzvalues + group, data = df)
leverage_or_not <- hatvalues(mod) > 3 * mean(hatvalues(mod)) # : 191
# leverage_or_not <- hatvalues(mod) > 2 * mean(hatvalues(mod)) : 528
# leverage_or_not <- hatvalues(mod) > 2 / nrow(df) : 15164
if (any(leverage_or_not)){
leverage <- df[which(leverage_or_not), 'mzvalues']
leverage_mz <- c(leverage_mz, leverage)
}
}
length(leverage_mz)
## [1] 191
leverage_mz
## [1] 800.1794 800.3035 800.4277 838.4099 838.5341 838.6581 800.1794 800.3035
## [9] 800.4277 754.1291 754.2532 754.3773 754.5015 754.6255 526.2357 526.3598
## [17] 526.4839 225.9774 226.1015 226.2257 792.3596 792.4837 792.6078 838.4099
## [25] 838.5341 838.6581 814.3297 814.4538 814.5779 814.7020 327.3875 327.5116
## [33] 327.6357 623.6738 623.7979 623.9220 788.7599 788.8840 789.0082 790.3735
## [41] 790.4977 790.6218 575.0168 575.1409 575.2650 505.2585 505.3827 505.5068
## [49] 513.8232 513.9473 514.0714 526.2357 526.3598 526.4839 187.6228 187.7469
## [57] 187.8710 187.9952 188.1193 800.1794 800.3035 800.4277 749.6606 749.7847
## [65] 749.9088 750.0330 750.1571 225.9774 226.1015 226.2257 385.7262 385.8503
## [73] 385.9745 386.0986 386.2227 792.3596 792.4837 792.6078 526.2357 526.3598
## [81] 526.4839 366.7351 366.8592 366.9834 623.6738 623.7979 623.9220 187.6228
## [89] 187.7469 187.8710 187.9952 188.1193 508.3617 508.4858 508.6099 754.1291
## [97] 754.2532 754.5015 754.6255 526.2357 526.3598 526.4839 510.3477 510.4718
## [105] 510.5959 814.3297 814.4538 814.5779 814.7020 165.7768 165.9009 166.0251
## [113] 166.1492 166.2733 790.3735 790.4977 790.6218 327.3875 327.5116 327.6357
## [121] 604.9309 605.0550 605.1791 345.7580 345.8821 346.0062 346.1304 346.2545
## [129] 749.6606 749.7847 749.9088 750.0330 750.1571 385.7262 385.8503 385.9745
## [137] 386.0986 386.2227 788.7599 788.8840 789.0082 575.0168 575.1409 575.2650
## [145] 601.7036 601.8278 601.9519 187.6228 187.7469 187.8710 187.9952 188.1193
## [153] 800.1794 800.3035 800.4277 366.7351 366.8592 366.9834 513.8232 513.9473
## [161] 514.0714 505.2585 505.3827 505.5068 604.9309 605.0550 605.1791 165.7768
## [169] 165.9009 166.0251 166.1492 166.2733 508.3617 508.4858 508.6099 510.3477
## [177] 510.4718 510.5959 345.7580 345.8821 346.0062 346.1304 346.2545 187.6228
## [185] 187.7469 187.8710 187.9952 188.1193 601.7036 601.8278 601.9519
# all group combinations (including zero pearson's coefficient)
all_leverage <- NULL
for (i in 1:nrow(pearson_df)) {
ref_index <- pearson_df[i, 'ref_index']
unk_index <- pearson_df[i, 'unk_index']
ref_grp <- ref_df[ref_df$group_index == ref_index, c(1, 2, 4)]
unk_grp <- unk_df[unk_df$group_index == unk_index, c(1, 2, 4)]
df <- rbind(ref_grp, unk_grp)
mod <- lm(intensity ~ mzvalues + group, data = df)
leverage_or_not <- hatvalues(mod) > 3 * mean(hatvalues(mod))
all_leverage <- c(all_leverage, any(leverage_or_not))
if (any(leverage_or_not)){
leverage <- df[which(leverage_or_not), 'mzvalues']
all_leverage <- c(all_leverage, leverage)
}
}
length(all_leverage)
## [1] 32918
# fit the whole spectrum
unk_df_filtered <- unk_df[unk_df$group_index %in% pearson_df_filtered$unk_index, ]
ref_df_filtered <- ref_df[ref_df$group_index %in% pearson_df_filtered$ref_index, ]
df_filtered <- rbind(ref_df_filtered, unk_df_filtered)
mod_all <- lm(intensity ~ mzvalues + group, data = df_filtered)
df_filtered[which(hatvalues(mod_all) > 2 * mean(hatvalues(mod_all))), 'mzvalues']
## numeric(0)