Step 1: Reinstall Graphormer
Since your environment has inconsistencies, let’s properly clean and
reinstall Graphormer.
1A. Remove Previous Installations
# Remove every package that could conflict with a clean Graphormer install,
# then empty pip's cache so no stale download is reused afterwards.
pip uninstall --yes \
  graphormer \
  fairseq fairseq2 fairseq2n \
  torch torchvision torchaudio
pip cache purge
1B. Install Dependencies
# Core deep-learning stack.
pip install torch torchvision torchaudio
# Graph/chemistry helpers. RDKit is published on PyPI as "rdkit"; the former
# "rdkit-pypi" name is deprecated and no longer receives new releases.
pip install numpy networkx rdkit
# Install Graphormer from the upstream repository, bypassing the pip cache.
pip install --no-cache-dir --force-reinstall git+https://github.com/microsoft/Graphormer.git
1C. Verify Installation
library(reticulate)
# import() raises an R error when the Python module cannot be loaded, so
# reaching the next statement already shows the package is importable.
graphormer <- import("graphormer")
# Not every Python package defines `__version__`; fall back to a placeholder
# rather than failing the check on a missing attribute.
graphormer_version <- if (py_has_attr(graphormer, "__version__")) {
  graphormer$`__version__`
} else {
  "unknown"
}
print(paste("Graphormer version:", graphormer_version))
If no errors appear, installation is successful.
Step 2: Capstone Training Setup Recap
Your Capstone project focuses on:
- Quantum Computing + GIS
- Molecular, fossil fuel, and biological particle datasets
- Autoencoders, anomaly detection, and time-series GIS generation
- Tracking molecular migration and fossil fuel movement
- Predicting geological/environmental shifts using Graphormer
Previously, incorrect Fairseq versions prevented successful model
training.
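Step 3: Prepare Data for Graphormer Training
3A. Ensure Data Format (OGB Format Required)
Graphormer requires datasets in Open Graph Benchmark (OGB) format. If
your dataset is not in OGB format, conversion is required.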
3B. Example Training Code
library(reticulate)
torch <- import("torch")
graphormer <- import("graphormer")
# NOTE(review): the `graphormer$data$dataset$load_ogb_dataset` and
# `graphormer$models$graphormer$Graphormer` paths are taken as given here;
# confirm them against the Graphormer package that is actually installed.
load_ogb_dataset <- graphormer$data$dataset$load_ogb_dataset

# Define model parameters.
# reticulate converts R doubles (e.g. 100) to Python floats, so every count
# or size is written as an integer literal (100L) to arrive as a Python int.
model <- graphormer$models$graphormer$Graphormer(
  num_atoms = 100L,
  num_bonds = 5L,
  num_classes = 2L,
  num_layers = 12L,
  num_heads = 8L,
  hidden_dim = 256L
)

# Load dataset
dataset <- load_ogb_dataset("ogbg-molhiv")
data_loaders <- dataset$get_dataloader(batch_size = 32L)
# presumably ordered (train, validation, test) -- verify against the loader API
train_loader <- data_loaders[[1]]
val_loader <- data_loaders[[2]]
test_loader <- data_loaders[[3]]

# Define optimizer
optimizer <- torch$optim$Adam(model$parameters(), lr = 1e-4)

# Training loop
n_epochs <- 10L
for (epoch in seq_len(n_epochs)) {
  model$train()
  # A Python iterable is not an R vector, so it cannot be used directly as a
  # `for` sequence. Request a fresh iterator each epoch and pull batches with
  # iter_next(), which returns NULL once the iterator is exhausted.
  batches <- as_iterator(train_loader)
  loss <- NULL
  while (!is.null(batch <- iter_next(batches))) {
    optimizer$zero_grad()
    output <- model(batch)
    loss <- torch$nn$functional$cross_entropy(output, batch$y)
    loss$backward()
    optimizer$step()
  }
  # Without this guard an empty loader would fail below on an undefined loss.
  if (is.null(loss)) {
    stop("train_loader produced no batches", call. = FALSE)
  }
  # Reports the loss of the last batch seen in this epoch.
  print(paste("Epoch", epoch, "Loss:", loss$item()))
}
print("Training complete.")
Step 4: Embedding Text for Vector Storage
To embed chunks of text into a vector store effectively, follow these
steps:
4A. Chunk the Text
Divide your text into manageable pieces. Depending on your use case,
you can chunk by sentences, paragraphs, or specific character counts.
This helps ensure that each chunk contains complete thoughts and makes
the embeddings more meaningful.
4B. Generate Embeddings
Use a pre-trained model to convert text chunks into embeddings.
Libraries like Hugging Face’s Transformers or OpenAI’s models can be
useful here. For instance, with Hugging Face’s Transformers, you can use
models like `distilbert` for sentence embeddings:
library(reticulate)
torch <- import("torch")
transformers <- import("transformers")
# Full Hugging Face Hub id: this checkpoint is published under the
# sentence-transformers organisation, so the namespace prefix is required.
model_name <- "sentence-transformers/distilbert-base-nli-mean-tokens"
model <- transformers$AutoModel$from_pretrained(model_name)
tokenizer <- transformers$AutoTokenizer$from_pretrained(model_name)

#' Embed text chunks as mean-pooled token vectors
#'
#' @param text_chunks Character vector or list of strings to embed.
#' @return A torch tensor holding the mean of the model's last hidden state
#'   over the token axis, one row per chunk.
embed_text <- function(text_chunks) {
  # as.list() guarantees a Python list even for a single string, so the
  # tokenizer always receives a batch.
  inputs <- tokenizer(
    as.list(text_chunks),
    padding = TRUE,
    truncation = TRUE,
    return_tensors = "pt"
  )
  with(torch$no_grad(), {
    # R has no `**kwargs` unpacking; pass the tokenizer outputs by name.
    outputs <- model(
      input_ids = inputs$input_ids,
      attention_mask = inputs$attention_mask
    )
    # Mean pooling over the token axis. `1L` keeps `dim` a Python int (a bare
    # 1 would arrive as a float). Padded positions are included in the mean.
    embeddings <- outputs$last_hidden_state$mean(dim = 1L)
  })
  embeddings
}
4C. Store Embeddings in a Vector Store
Choose a vector database like FAISS or Pinecone. These databases are
optimized for storing and retrieving high-dimensional vectors
efficiently. Here’s an example of how to store embeddings with
FAISS:
library(reticulate)
faiss <- import("faiss")
numpy <- import("numpy")
# embed_text() returns a torch tensor (one row per chunk), not a NumPy array.
embeddings <- embed_text(list_of_chunks)
# The index needs the vector dimensionality: the size of the tensor's second
# axis (Python axis 1). Indexing `shape` with R's 1-based `[1]` would pick the
# number of chunks instead.
embedding_dim <- embeddings$size(1L)
index <- faiss$IndexFlatL2(embedding_dim) # L2 distance index
# FAISS expects a C-contiguous float32 matrix. np_array() builds one and keeps
# it as a Python object, so it is not converted back to an R double matrix.
vectors <- np_array(
  embeddings$detach()$cpu()$numpy(),
  dtype = "float32",
  order = "C"
)
index$add(vectors) # Add embeddings to the index
4D. Search and Retrieve
To search for similar chunks, convert the query text to an embedding
using the same model and then use the vector store to find the nearest
neighbors:
query_embedding <- embed_text(list(query_text))
# FAISS expects a C-contiguous float32 query matrix; build it explicitly so
# reticulate does not hand over an R double (float64) matrix.
query_vectors <- reticulate::np_array(
  query_embedding$detach()$cpu()$numpy(),
  dtype = "float32",
  order = "C"
)
# k is the number of nearest neighbors; as.integer() makes it a Python int.
# FAISS returns distances first, then row ids; the ids are 0-based positions
# in insertion order, so add 1 before indexing R objects with them.
result <- index$search(query_vectors, as.integer(k))
LS0tDQp0aXRsZTogIlRyYWluaW5nIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyMgU3RlcCAxOiBSZWluc3RhbGwgR3JhcGhvcm1lcg0KU2luY2UgeW91ciBlbnZpcm9ubWVudCBoYXMgaW5jb25zaXN0ZW5jaWVzLCBsZXQncyBwcm9wZXJseSBjbGVhbiBhbmQgcmVpbnN0YWxsIEdyYXBob3JtZXIuDQoNCiMjIyAxQS4gUmVtb3ZlIFByZXZpb3VzIEluc3RhbGxhdGlvbnMNCmBgYHtiYXNofQ0KcGlwIHVuaW5zdGFsbCAteSBncmFwaG9ybWVyIGZhaXJzZXEgZmFpcnNlcTIgZmFpcnNlcTJuIHRvcmNoIHRvcmNodmlzaW9uIHRvcmNoYXVkaW8NCnBpcCBjYWNoZSBwdXJnZQ0KYGBgDQoNCiMjIyAxQi4gSW5zdGFsbCBEZXBlbmRlbmNpZXMNCmBgYHtiYXNofQ0KcGlwIGluc3RhbGwgdG9yY2ggdG9yY2h2aXNpb24gdG9yY2hhdWRpbw0KcGlwIGluc3RhbGwgbnVtcHkgbmV0d29ya3ggcmRraXQtcHlwaQ0KcGlwIGluc3RhbGwgLS1uby1jYWNoZS1kaXIgLS1mb3JjZS1yZWluc3RhbGwgZ2l0K2h0dHBzOi8vZ2l0aHViLmNvbS9taWNyb3NvZnQvR3JhcGhvcm1lci5naXQNCmBgYA0KDQojIyMgMUMuIFZlcmlmeSBJbnN0YWxsYXRpb24NCmBgYHtyfQ0KbGlicmFyeShyZXRpY3VsYXRlKQ0KZ3JhcGhvcm1lciA8LSBpbXBvcnQoImdyYXBob3JtZXIiKQ0KcHJpbnQocGFzdGUoIkdyYXBob3JtZXIgdmVyc2lvbjoiLCBncmFwaG9ybWVyJGBfX3ZlcnNpb25fX2ApKQ0KYGBgDQpJZiBubyBlcnJvcnMgYXBwZWFyLCBpbnN0YWxsYXRpb24gaXMgc3VjY2Vzc2Z1bC4NCg0KLS0tDQoNCiMjIFN0ZXAgMjogQ2Fwc3RvbmUgVHJhaW5pbmcgU2V0dXAgUmVjYXANCllvdXIgQ2Fwc3RvbmUgcHJvamVjdCBmb2N1c2VzIG9uOg0KLSBRdWFudHVtIENvbXB1dGluZyArIEdJUw0KLSBNb2xlY3VsYXIsIGZvc3NpbCBmdWVsLCBhbmQgYmlvbG9naWNhbCBwYXJ0aWNsZSBkYXRhc2V0cw0KLSBBdXRvZW5jb2RlcnMsIGFub21hbHkgZGV0ZWN0aW9uLCBhbmQgdGltZS1zZXJpZXMgR0lTIGdlbmVyYXRpb24NCi0gVHJhY2tpbmcgbW9sZWN1bGFyIG1pZ3JhdGlvbiBhbmQgZm9zc2lsIGZ1ZWwgbW92ZW1lbnQNCi0gUHJlZGljdGluZyBnZW9sb2dpY2FsL2Vudmlyb25tZW50YWwgc2hpZnRzIHVzaW5nIEdyYXBob3JtZXINCg0KUHJldmlvdXNseSwgaW5jb3JyZWN0IEZhaXJzZXEgdmVyc2lvbnMgcHJldmVudGVkIHN1Y2Nlc3NmdWwgbW9kZWwgdHJhaW5pbmcuDQoNCi0tLQ0KDQojIyBTdGVwIDM6IFByZXBhcmUgRGF0YSBmb3IgR3JhcGhvcm1lciBUcmFpbmluZw0KDQojIyMgM0EuIEVuc3VyZSBEYXRhIEZvcm1hdCAoT0dCIEZvcm1hdCBSZXF1aXJlZCkNCkdyYXBob3JtZXIgcmVxdWlyZXMgZGF0YXNldHMgaW4gT3BlbiBHcmFwaCBCZW5jaG1hcmsgKE9HQikgZm9ybWF0LiBJZiB5b3VyIGRhdGFzZXQgaXMgbm90IGluIE9HQiBmb3JtYXQsIGNvbnZlcnNpb24gaXMgcmVxdWlyZWQuDQoNCiMjIyAzQi4gRXhhbXBsZSBUcmFpbmluZyBDb2RlDQpgYGB7cn0NCmxpYnJh
cnkocmV0aWN1bGF0ZSkNCnRvcmNoIDwtIGltcG9ydCgidG9yY2giKQ0KZ3JhcGhvcm1lciA8LSBpbXBvcnQoImdyYXBob3JtZXIiKQ0KbG9hZF9vZ2JfZGF0YXNldCA8LSBncmFwaG9ybWVyJGRhdGEkZGF0YXNldCRsb2FkX29nYl9kYXRhc2V0DQoNCiMgRGVmaW5lIG1vZGVsIHBhcmFtZXRlcnMNCm1vZGVsIDwtIGdyYXBob3JtZXIkbW9kZWxzJGdyYXBob3JtZXIkR3JhcGhvcm1lcigNCiAgICBudW1fYXRvbXM9MTAwLA0KICAgIG51bV9ib25kcz01LA0KICAgIG51bV9jbGFzc2VzPTIsDQogICAgbnVtX2xheWVycz0xMiwNCiAgICBudW1faGVhZHM9OCwNCiAgICBoaWRkZW5fZGltPTI1Ng0KKQ0KDQojIExvYWQgZGF0YXNldA0KZGF0YXNldCA8LSBsb2FkX29nYl9kYXRhc2V0KCJvZ2JnLW1vbGhpdiIpDQpkYXRhX2xvYWRlcnMgPC0gZGF0YXNldCRnZXRfZGF0YWxvYWRlcihiYXRjaF9zaXplPTMyKQ0KdHJhaW5fbG9hZGVyIDwtIGRhdGFfbG9hZGVyc1tbMV1dDQp2YWxfbG9hZGVyIDwtIGRhdGFfbG9hZGVyc1tbMl1dDQp0ZXN0X2xvYWRlciA8LSBkYXRhX2xvYWRlcnNbWzNdXQ0KDQojIERlZmluZSBvcHRpbWl6ZXINCm9wdGltaXplciA8LSB0b3JjaCRvcHRpbSRBZGFtKG1vZGVsJHBhcmFtZXRlcnMoKSwgbHI9MWUtNCkNCg0KIyBUcmFpbmluZyBsb29wDQpmb3IgKGVwb2NoIGluIDE6MTApIHsNCiAgICBtb2RlbCR0cmFpbigpDQogICAgZm9yIChiYXRjaCBpbiB0cmFpbl9sb2FkZXIpIHsNCiAgICAgICAgb3B0aW1pemVyJHplcm9fZ3JhZCgpDQogICAgICAgIG91dHB1dCA8LSBtb2RlbChiYXRjaCkNCiAgICAgICAgbG9zcyA8LSB0b3JjaCRubiRmdW5jdGlvbmFsJGNyb3NzX2VudHJvcHkob3V0cHV0LCBiYXRjaCR5KQ0KICAgICAgICBsb3NzJGJhY2t3YXJkKCkNCiAgICAgICAgb3B0aW1pemVyJHN0ZXAoKQ0KICAgIH0NCiAgICBwcmludChwYXN0ZSgiRXBvY2giLCBlcG9jaCwgIkxvc3M6IiwgbG9zcyRpdGVtKCkpKQ0KfQ0KDQpwcmludCgiVHJhaW5pbmcgY29tcGxldGUuIikNCmBgYA0KDQotLS0NCg0KIyMgU3RlcCA0OiBFbWJlZGRpbmcgVGV4dCBmb3IgVmVjdG9yIFN0b3JhZ2UNClRvIGVtYmVkIGNodW5rcyBvZiB0ZXh0IGludG8gYSB2ZWN0b3Igc3RvcmUgZWZmZWN0aXZlbHksIGZvbGxvdyB0aGVzZSBzdGVwczoNCg0KIyMjIDRBLiBDaHVuayB0aGUgVGV4dA0KRGl2aWRlIHlvdXIgdGV4dCBpbnRvIG1hbmFnZWFibGUgcGllY2VzLiBEZXBlbmRpbmcgb24geW91ciB1c2UgY2FzZSwgeW91IGNhbiBjaHVuayBieSBzZW50ZW5jZXMsIHBhcmFncmFwaHMsIG9yIHNwZWNpZmljIGNoYXJhY3RlciBjb3VudHMuIFRoaXMgaGVscHMgZW5zdXJlIHRoYXQgZWFjaCBjaHVuayBjb250YWlucyBjb21wbGV0ZSB0aG91Z2h0cyBhbmQgbWFrZXMgdGhlIGVtYmVkZGluZ3MgbW9yZSBtZWFuaW5nZnVsLg0KDQojIyMgNEIuIEdlbmVyYXRlIEVtYmVkZGluZ3MNClVzZSBhIHByZS10cmFpbmVkIG1vZGVsIHRvIGNvbnZlcnQgdGV4dCBjaHVua3Mg
aW50byBlbWJlZGRpbmdzLiBMaWJyYXJpZXMgbGlrZSBIdWdnaW5nIEZhY2XigJlzIFRyYW5zZm9ybWVycyBvciBPcGVuQUnigJlzIG1vZGVscyBjYW4gYmUgdXNlZnVsIGhlcmUuIEZvciBpbnN0YW5jZSwgd2l0aCBIdWdnaW5nIEZhY2UncyBUcmFuc2Zvcm1lcnMsIHlvdSBjYW4gdXNlIG1vZGVscyBsaWtlIGBkaXN0aWxiZXJ0YCBmb3Igc2VudGVuY2UgZW1iZWRkaW5nczoNCg0KYGBge3J9DQpsaWJyYXJ5KHJldGljdWxhdGUpDQp0b3JjaCA8LSBpbXBvcnQoInRvcmNoIikNCnRyYW5zZm9ybWVycyA8LSBpbXBvcnQoInRyYW5zZm9ybWVycyIpDQoNCm1vZGVsX25hbWUgPC0gImRpc3RpbGJlcnQtYmFzZS1ubGktbWVhbi10b2tlbnMiDQptb2RlbCA8LSB0cmFuc2Zvcm1lcnMkQXV0b01vZGVsJGZyb21fcHJldHJhaW5lZChtb2RlbF9uYW1lKQ0KdG9rZW5pemVyIDwtIHRyYW5zZm9ybWVycyRBdXRvVG9rZW5pemVyJGZyb21fcHJldHJhaW5lZChtb2RlbF9uYW1lKQ0KDQplbWJlZF90ZXh0IDwtIGZ1bmN0aW9uKHRleHRfY2h1bmtzKSB7DQogICAgaW5wdXRzIDwtIHRva2VuaXplcih0ZXh0X2NodW5rcywgcGFkZGluZz1UUlVFLCB0cnVuY2F0aW9uPVRSVUUsIHJldHVybl90ZW5zb3JzPSdwdCcpDQogICAgd2l0aCh0b3JjaCRub19ncmFkKCksIHsNCiAgICAgICAgZW1iZWRkaW5ncyA8LSBtb2RlbCgqKmlucHV0cykkbGFzdF9oaWRkZW5fc3RhdGUkbWVhbihkaW09MSkgICMgTWVhbiBwb29saW5nDQogICAgfSkNCiAgICByZXR1cm4oZW1iZWRkaW5ncykNCn0NCmBgYA0KDQojIyMgNEMuIFN0b3JlIEVtYmVkZGluZ3MgaW4gYSBWZWN0b3IgU3RvcmUNCkNob29zZSBhIHZlY3RvciBkYXRhYmFzZSBsaWtlIEZBSVNTIG9yIFBpbmVjb25lLiBUaGVzZSBkYXRhYmFzZXMgYXJlIG9wdGltaXplZCBmb3Igc3RvcmluZyBhbmQgcmV0cmlldmluZyBoaWdoLWRpbWVuc2lvbmFsIHZlY3RvcnMgZWZmaWNpZW50bHkuIEhlcmXigJlzIGFuIGV4YW1wbGUgb2YgaG93IHRvIHN0b3JlIGVtYmVkZGluZ3Mgd2l0aCBGQUlTUzoNCg0KYGBge3J9DQpsaWJyYXJ5KHJldGljdWxhdGUpDQpmYWlzcyA8LSBpbXBvcnQoImZhaXNzIikNCm51bXB5IDwtIGltcG9ydCgibnVtcHkiKQ0KDQplbWJlZGRpbmdzIDwtIGVtYmVkX3RleHQobGlzdF9vZl9jaHVua3MpICAjIFRoaXMgbXVzdCByZXR1cm4gYSBOdW1QeSBhcnJheQ0KaW5kZXggPC0gZmFpc3MkSW5kZXhGbGF0TDIoZW1iZWRkaW5ncyRzaGFwZVsxXSkgICMgTDIgZGlzdGFuY2UgaW5kZXgNCmluZGV4JGFkZChlbWJlZGRpbmdzJG51bXB5KCkpICAjIEFkZCBlbWJlZGRpbmdzIHRvIHRoZSBpbmRleA0KYGBgDQoNCiMjIyA0RC4gU2VhcmNoIGFuZCBSZXRyaWV2ZQ0KVG8gc2VhcmNoIGZvciBzaW1pbGFyIGNodW5rcywgY29udmVydCB0aGUgcXVlcnkgdGV4dCB0byBhbiBlbWJlZGRpbmcgdXNpbmcgdGhlIHNhbWUgbW9kZWwgYW5kIHRoZW4gdXNlIHRoZSB2ZWN0b3Igc3RvcmUgdG8gZmluZCB0aGUgbmVhcmVzdCBuZWln
aGJvcnM6DQoNCmBgYHtyfQ0KcXVlcnlfZW1iZWRkaW5nIDwtIGVtYmVkX3RleHQobGlzdChxdWVyeV90ZXh0KSkNCnJlc3VsdCA8LSBpbmRleCRzZWFyY2gocXVlcnlfZW1iZWRkaW5nJG51bXB5KCksIGspICAjIGsgaXMgdGhlIG51bWJlciBvZiBuZWFyZXN0IG5laWdoYm9ycw0KYGBgDQoNCi0tLQ0KDQojIyBTdGVwIDU6IE5leHQgU3RlcHMNCi0gKipWZXJpZnkgR3JhcGhvcm1lciBpbnN0YWxsYXRpb24qKiAoYGltcG9ydCBncmFwaG9ybWVyYCBzaG91bGQgd29yaykNCi0gKipDb25maXJtIGRhdGFzZXQgZm9ybWF0KiogKENvbnZlcnNpb24gdG8gT0dCIG1heSBiZSBuZWVkZWQpDQotICoqRXhlY3V0ZSB0cmFpbmluZyBzY3JpcHQqKiB1c2luZyB5b3VyIGRhdGFzZXQNCg0KSWYgaXNzdWVzIHBlcnNpc3QsIHByb3ZpZGUgZXJyb3IgbWVzc2FnZXMgZm9yIGRlYnVnZ2luZy4NCg0KIyMjIEFkZGl0aW9uYWwgQ29uc2lkZXJhdGlvbnM6DQoxLiBEbyB5b3UgbmVlZCBkYXRhc2V0IGNvbnZlcnNpb24/DQoyLiBTaG91bGQgR3JhcGhvcm1lciBiZSBhZGp1c3RlZCBmb3IgcXVhbnR1bSBmZWF0dXJlcz8NCjMuIERvIHlvdSByZXF1aXJlIG11bHRpLUdQVSB0cmFpbmluZz8NCg0KLS0tDQoNCiMjIyBSZWNhcDogRml4ZWQgSXNzdWVzDQrinIUgUmVtb3ZlZCBjb25mbGljdGluZyBwYWNrYWdlcw0K4pyFIEluc3RhbGxlZCBjb3JyZWN0IFB5VG9yY2ggJiBDVURBIHZlcnNpb25zDQrinIUgSW5zdGFsbGVkIEdyYXBob3JtZXIgcHJvcGVybHkNCuKchSBFbnN1cmVkIFFNOSBkYXRhc2V0IHJlYWRpbmVzcw0K4pyFIEluaXRpYXRlZCBHcmFwaG9ybWVyIHRyYWluaW5nDQoNCg==