# Example drug descriptions: drug name -> free-text mechanism-of-action paragraph.
# Each value is later passed unchanged to summarize_moa(), which embeds it in the LLM prompt.
# NOTE(review): the Acarbose text contains what look like leftover citation markers
# ("absorbed.4", "oligosaccharides.7", "isomaltase.7"), and every value keeps the newline
# and four-space indent of its triple-quoted literal - confirm that is intended.
drug_descriptions = {
    "Acetazolamide": """
    The anticonvulsant activity of Acetazolamide may depend on a direct inhibition of carbonic anhydrase in the CNS, which decreases carbon dioxide tension in the pulmonary alveoli, thus increasing arterial oxygen tension. The diuretic effect depends on the inhibition of carbonic anhydrase, causing a reduction in the availability of hydrogen ions for active transport in the renal tubule lumen. This leads to alkaline urine and an increase in the excretion of bicarbonate, sodium, potassium, and water.
    """,
    "Acarbose": """
    Alpha-glucosidase enzymes are located in the brush-border of the intestinal mucosa and serve to metabolize oligo-, tri-, and disaccharides (e.g. sucrose) into smaller monosaccharides (e.g. glucose, fructose) which are more readily absorbed.4 These work in conjunction with pancreatic alpha-amylase, an enzyme found in the intestinal lumen that hydrolyzes complex starches to oligosaccharides.7
    Acarbose is a complex oligosaccharide that competitively and reversibly inhibits both pancreatic alpha-amylase and membrane-bound alpha-glucosidases - of the alpha-glucosidases, inhibitory potency appears to follow a rank order of glucoamylase > sucrase > maltase > isomaltase.7 By preventing the metabolism and subsequent absorption of dietary carbohydrates, acarbose reduces postprandial blood glucose and insulin levels.
    """
}

# Function to summarize MoA using LLM
def summarize_moa(drug_name, description):
    """Ask the local Ollama ``llama2`` model for a one-phrase mechanism-of-action summary.

    Args:
        drug_name: Drug name interpolated into the prompt.
        description: Free-text mechanism-of-action description to summarize.

    Returns:
        The ``response`` field of the model's JSON reply, with surrounding
        whitespace removed.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    # Imported locally so this function works even when it is run before the
    # file's later module-level `import requests`.
    import requests

    ollama_url = "http://localhost:11434/api/generate"
    prompt = f"Summarize the mechanism of action of the drug {drug_name} in a single phrase:\n\n{description}\n\nSummary:"
    payload = {
        "model": "llama2",
        "prompt": prompt,
        "stream": False,  # the reply is parsed below as a single JSON document
    }
    response = requests.post(ollama_url, json=payload)
    # Fail with the HTTP error itself rather than a KeyError on ['response']
    # when the server reports a problem (e.g. the model is not available).
    response.raise_for_status()
    return response.json()['response'].strip()

# Summarize every description with the LLM, keyed by drug name, then show the result.
summarized_moas = {
    name: summarize_moa(name, text)
    for name, text in drug_descriptions.items()
}
print(summarized_moas)

{'Acetazolamide': "Acetazolamide's mechanism of action involves inhibiting carbonic anhydrase in the CNS to decrease carbon dioxide tension in the lungs, leading to increased arterial oxygen tension, as well as inhibiting carbonic anhydrase in the kidneys to reduce hydrogen ion availability for active transport, resulting in alkaline urine and increased excretion of bicarbonate, sodium, potassium, and water.", 'Acarbose': 'Acarbose inhibits both pancreatic alpha-amylase and membrane-bound alpha-glucosidases, leading to reduced absorption of dietary carbohydrates and decreased postprandial blood glucose and insulin levels.'}

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# use mxbai-embed-large model
import ollama
# test_embed = ollama.embeddings(model='mxbai-embed-large', prompt='Represent this sentence for searching relevant passages: The sky is blue because of Rayleigh scattering')
# test_embed['embedding']
def get_embedding(text):
    """Embed *text* with ``mxbai-embed-large``, prefixed by the search instruction.

    The prompt sent to the model is the sentence
    "Represent this sentence for searching relevant passages: " followed by *text*.
    Returns the ``embedding`` entry of the Ollama reply.
    """
    search_prompt = f'Represent this sentence for searching relevant passages: {text}'
    reply = ollama.embeddings(model='mxbai-embed-large', prompt=search_prompt)
    return reply['embedding']
def get_embedding_clear(text):
    """Embed *text* with ``mxbai-embed-large`` without any instruction prefix.

    Returns the ``embedding`` entry of the Ollama reply.
    """
    # format(text) produces the same string as the f-string f'{text}'.
    plain_prompt = format(text)
    reply = ollama.embeddings(model='mxbai-embed-large', prompt=plain_prompt)
    return reply['embedding']

# Embed the same phrase with and without the instruction prefix (kept for inspection).
a = get_embedding("a good idea")
b = get_embedding_clear("a good idea")
# One un-prefixed embedding per summarized mechanism of action.
embeddings = {
    name: get_embedding_clear(moa)
    for name, moa in summarized_moas.items()
}
# Create similarity matrix: rows follow the dict's key order.
drug_names = list(embeddings)
embedding_matrix = np.array([embeddings[name] for name in drug_names])
similarity_matrix = cosine_similarity(embedding_matrix)

# Convert the similarity matrix into a labelled, more readable table
import pandas as pd
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=drug_names,
    columns=drug_names,
)
print("Similarity Matrix:")
print(similarity_df)

Similarity Matrix:
               Acetazolamide  Acarbose
Acetazolamide       1.000000  0.633546
Acarbose            0.633546  1.000000

import os
import getpass
from neo4j import GraphDatabase
import requests

# Connection settings for the Neo4j instance reached over Bolt on localhost.
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
# Prompt for the password at run time instead of storing it in the file.
NEO4J_PASSWORD = getpass.getpass(prompt='Enter your Neo4j password: ')

# Module-level driver used by the query helpers defined later in this file.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
# Bare expression: shows the driver object when run as a notebook cell.
driver

<neo4j._sync.driver.BoltDriver at 0x74769aba9c60>

# Function to query Neo4j database
def query_neo4j(cypher_query, parameters=None):
    """Run a Cypher statement on the module-level ``driver`` and return its rows.

    Args:
        cypher_query: Cypher text to execute.
        parameters: Optional dict of query parameters (``$name`` placeholders
            in the Cypher text). Prefer this over formatting values into the
            query string. Defaults to None, i.e. no parameters.

    Returns:
        A list with one ``record.data()`` dict per result row.
    """
    with driver.session() as session:
        result = session.run(cypher_query, parameters)
        # Build the list inside the `with` so the result is consumed while
        # the session is still open.
        return [record.data() for record in result]
query_neo4j("MATCH (n) RETURN n LIMIT 1")

[{'n': {'hgnc_id': 'HGNC:36587',
   'uniprotswissprot': '',
   'taxonomy_id': '9606',
   'ensembl_gene_id': '',
   'name': 'ribosomal protein L13 pseudogene 9',
   'hgnc_symbol': 'RPL13P9',
   'entrezgene_id': '100270927',
   'alias_name': '',
   'taxonomy_name': 'Homo sapiens',
   'uuid': 'be00bf9f-db9e-469b-8d28-eae274c1ae02',
   'alias_symbol': ''}}]

# Prerequisite: run `ollama serve` on the default port and `ollama run llama2` (or gemma:7b)
ollama_url = "http://localhost:11434/api/generate"
# Define the payload (data to be sent in the POST request)
nl_query = "Why is the sky blue?"
# The next assignment overwrites the one above, so only this prompt is sent.
nl_query = "Please introduce Cypher language."
payload = {
    "model": "llama2",
    "prompt": nl_query,
    "stream": False
}
# Send the POST request to the Ollama API
response = requests.post(ollama_url, json=payload)

# Print the model's answer on HTTP 200; otherwise print the status code and raw body
if response.status_code == 200:
    print("Response from Ollama model:")
    print(response.json()['response'])
else:
    print(f"Error: {response.status_code}")
    print(response.text)
# The same endpoint called from the shell:
# curl http://localhost:11434/api/generate -d '{  "model": "llama2",  "prompt": "Why is the sky blue?",  "stream": false }'

Response from Ollama model:

Cypher is a programming language used for querying and manipulating data in Neo4j, a popular graph database management system. It is a declarative language, meaning that you describe what you want to do with your data, rather than how to do it. This makes it easier to work with complex queries and algorithms, as Cypher abstracts away the underlying details of how the query will be executed.

Here are some key features of Cypher:

1. Graph-based query language: Cypher is designed specifically for querying graph data structures, such as those stored in Neo4j. It provides a set of operators and functions that can be used to traverse and manipulate nodes and relationships in a graph.
2. Declarative syntax: Cypher uses a declarative syntax, which means that you describe what you want to do with your data, rather than how to do it. This makes it easier to write and maintain complex queries.
3. Pattern matching: Cypher provides a powerful pattern-matching capability, allowing you to match on specific patterns in your graph data and perform actions based on those matches.
4. Aggregation and filtering: Cypher includes built-in support for aggregation and filtering operations, such as grouping nodes together or finding nodes that satisfy certain conditions.
5. Integration with other languages: Cypher can be used in conjunction with other programming languages, such as Java and Python, to create more complex data processing pipelines.
6. Extensive library: Cypher has an extensive library of functions and operators that can be used to perform a wide range of tasks, from simple queries to complex data analysis.
7. Ease of use: Cypher is designed to be easy to learn and use, with a simple syntax and intuitive query language.
8. Performance: Cypher is optimized for performance, allowing you to query large graphs quickly and efficiently.
9. Scalability: Cypher can handle large graphs with millions of nodes and relationships, making it a great choice for scalable data processing.
10. Support: Neo4j provides extensive documentation, tutorials, and support for Cypher, making it easy to get started and find help when needed.

Overall, Cypher is a powerful and flexible language that makes it easy to query and manipulate graph data structures in Neo4j. Its declarative syntax and extensive library of functions make it a popular choice among developers and data scientists working with graph data.

# Function to interact with the Ollama llama2 model
def get_answer_from_natural_language(nl_query):
    """Send *nl_query* as the prompt to the local Ollama ``llama2`` model.

    Args:
        nl_query: Prompt text forwarded unchanged to the model.

    Returns:
        The ``response`` field of the model's JSON reply (not stripped).

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    ollama_url = "http://localhost:11434/api/generate"
    payload = {
        "model": "llama2",
        "prompt": nl_query,
        "stream": False,  # the reply is parsed below as a single JSON document
    }
    response = requests.post(ollama_url, json=payload)
    # Report HTTP failures explicitly, in line with
    # get_cypher_from_natural_language, instead of failing later with a
    # KeyError on the missing 'response' field.
    response.raise_for_status()
    response_data = response.json()
    return response_data['response']
get_answer_from_natural_language("Please introduce Cypher language")

'\nCypher is a programming language designed specifically for querying and manipulating data stored in graph databases. It is a declarative language, meaning that you describe what you want to retrieve or transform, rather than how to do it. This makes it easier to write queries that are concise, readable, and maintainable.\n\nHere are some key features of Cypher:\n\n1. Queries are expressed as graphs: In Cypher, queries are represented as graphs, with nodes representing tables or collections, and edges representing relationships between them. This allows for a more visual and intuitive way of expressing queries.\n2. Declarative syntax: Cypher uses a declarative syntax, which means that you describe what you want to retrieve or transform, rather than how to do it. This makes it easier to write concise and readable queries.\n3. Pattern matching: Cypher supports pattern matching, which allows you to extract different values from a single query result based on the structure of the data. For example, you can use pattern matching to extract specific fields from a JSON object.\n4. Aggregations: Cypher provides built-in support for aggregation functions, such as grouping, sorting, and filtering data. These functions can be used to perform complex data transformations without having to write custom code.\n5. Gremlin API compatibility: Cypher is designed to be compatible with the Gremlin API, which means that you can use Cypher queries to retrieve data from a Graph Database, and then use the resulting data in your application.\n6. Support for various data sources: Cypher can be used to query data from a variety of sources, including graph databases, relational databases, and file systems.\n7. Extensive libraries and tools: Cypher has an extensive library of built-in functions and operators, as well as a number of third-party tools and plugins available to extend its functionality.\n8. 
Flexible data modeling: Cypher allows for flexible data modeling, allowing you to define your own data models and schemas.\n9. Support for transactional queries: Cypher provides support for transactional queries, which allow you to perform multiple operations on a Graph Database in a single, atomic operation.\n10. Integration with other tools and languages: Cypher can be integrated with other tools and languages, such as Python, Java, and JavaScript, making it easy to use in a variety of contexts.\n\nOverall, Cypher is a powerful and flexible language that allows developers to easily query and manipulate data stored in graph databases. Its declarative syntax and built-in support for aggregations and pattern matching make it an ideal choice for working with complex data structures.'

from neo4j import GraphDatabase

# NOTE(review): this rebinds `driver` to a second driver built from the same settings;
# the driver created earlier in the file is not closed in the code shown.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def _primary_label(node):
    """Return the first label of *node*, or None if it has no labels or is None."""
    labels = list(node.labels) if node is not None else []
    return labels[0] if labels else None


def get_schema_description():
    """Describe the database schema as text for use in an LLM prompt.

    Runs ``CALL db.schema.visualization()`` on the module-level ``driver`` and
    formats the single returned row as two bulleted sections: one line per
    schema node (label plus the node's property keys) and one line per schema
    relationship written as ``(:Start)-[:TYPE]->(:End)``.

    Returns:
        The "Nodes" section followed by the "Relationships" section as one
        string. Both sections are empty (headers only) if the procedure
        returns no row.

    NOTE(review): the listed properties are whatever keys the returned schema
    nodes carry; confirm against the database that these are the data
    properties the prompt should advertise.
    """
    query = """
    CALL db.schema.visualization() YIELD nodes, relationships
    RETURN nodes, relationships
    """

    with driver.session() as session:
        record = session.run(query).single()

        # single() yields None when there is no row; treat that as an empty schema.
        if record is None:
            nodes, relationships = [], []
        else:
            nodes, relationships = record["nodes"], record["relationships"]

        # Extract node labels and their property keys
        node_lines = [
            f"  - {_primary_label(node)}: Properties: {', '.join(node.keys())}\n"
            for node in nodes
        ]

        # Extract relationship types with their endpoint labels, instead of
        # embedding the relationship object's full repr in the prompt.
        rel_lines = [
            f"  - (:{_primary_label(rel.start_node)})"
            f"-[:{rel.type}]->"
            f"(:{_primary_label(rel.end_node)})\n"
            for rel in relationships
        ]

        nodes_info = "\n- Nodes:\n" + "".join(node_lines)
        rel_info = "\n- Relationships:\n" + "".join(rel_lines)
        return nodes_info + rel_info

# Get the schema description
schema_description = get_schema_description()

def get_cypher_from_natural_language(nl_query, schema_description, examples):
    """Ask the local Ollama ``llama2`` model to turn a question into Cypher.

    The prompt combines the schema text, the worked examples and the question.

    Args:
        nl_query: Natural-language question to convert.
        schema_description: Schema text inserted into the prompt.
        examples: Example question/Cypher pairs inserted into the prompt.

    Returns:
        The reply's ``response`` field with surrounding whitespace removed
        (an empty string when the field is missing).

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    instructions = f"""
    You are an expert in converting natural language queries into Cypher queries for a Neo4j database. Here is the schema of the database:

    {schema_description}

    Convert the following natural language queries into Cypher queries:

    {examples}

    Now, convert the following natural language query into a Cypher query:

    Natural Language Query: "{nl_query}"
    Cypher Query:
    """

    reply = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "llama2", "prompt": instructions, "stream": False},
    )

    # Guard clause: report anything other than 200 with its status and body.
    if reply.status_code != 200:
        raise Exception(f"Failed to generate Cypher query: {reply.status_code}, {reply.text}")

    return reply.json().get("response", "").strip()

# Schema description
# NOTE: this self-assignment is a no-op; `schema_description` keeps the value
# returned by get_schema_description() earlier in the file.
schema_description = schema_description

# Examples of natural language queries and corresponding Cypher queries.
# The whole string is interpolated verbatim into the prompt built by
# get_cypher_from_natural_language().
# NOTE(review): the example questions contain typos ("paird", "there genes",
# "eiplepsy") that reach the model as written - confirm whether they are intentional.
examples = """
1. Natural Language Query: "How many drugs are in the Neo4j database?"
   Cypher Query: "MATCH (d:DRUG) RETURN COUNT(d) AS drug_count"

2. Natural Language Query: "List all the synthetic lethality genes paird with CALM1, and get the drugs target to all there genes."
   Cypher Query: "MATCH p1=(gene1:GENE)-[r:synthetic_lethality]->(gene2:GENE)
WHERE gene1.hgnc_symbol ='CALM1'
MATCH p2=(gene1)-[r2]-(path1:REACTOME)
MATCH p3=(gene2)-[r3]-(path1)
with p1, p2, p3, COLLECT (gene1) + COLLECT (gene2) AS gall
UNWIND gall AS gallw
OPTIONAL MATCH p4=(c:DRUG)-[ {source: 'DrugBank'}]->(gallw)
with p1, p2, p3, p4
ORDER BY rand() LIMIT 100 
RETURN p1, p2, p3, p4"

3. Natural Language Query: "Find all the diseases related to 'children eiplepsy', and show all the variants and genes related with the disease. "
   Cypher Query: "MATCH (d1:DISEASE)-[r1]-(d2:DISEASE)
WHERE toLower(d1.Name) CONTAINS 'child' AND toLower(d1.Name) CONTAINS 'epilepsy'
OPTIONAL MATCH (d1)-[r2]-(v:VARIANT)-[r3]-(g1:GENE)
OPTIONAL MATCH (d1)-[r4]-(g2:GENE)
RETURN r1, r2, r3, r4"
"""

# Only the second assignment takes effect; the first question is overwritten before use.
nl_query = "How many diseases are in the Neo4j database?"
nl_query = "Find all the diseases related to 'children eiplepsy' and not extract variants and genes"
# Ask the model for a Cypher translation and print its raw reply.
cypher_query = get_cypher_from_natural_language(nl_query, schema_description, examples)
print("Generated Cypher Query:", cypher_query)

Generated Cypher Query: To convert the natural language query "Find all the diseases related to 'children eiplepsy' and not extract variants and genes" into a Cypher query, we can use the `MATCH` statement with various clauses. Here is one way to write this query in Cypher:
```cypher
MATCH (d1:DISEASE)-[r1]-(d2:DISEASE)
WHERE toLower(d1.Name) CONTAINS 'child' AND toLower(d1.Name) CONTAINS 'eiplepsy'
RETURN d1, d2
Explanation:

MATCH is the statement that matches nodes and
relationships in the graph.
(d1:DISEASE)-[r1]-(d2:DISEASE) matches all diseases
(d1) that are connected to other diseases (d2)
through a relationship (r1).
WHERE is used to filter the results based on a
condition. In this case, we want to find diseases that contain the words
“child” and “eiplepsy” in their names.
RETURN d1, d2 returns the matched nodes (diseases) in
the query results.

Note: This is just one possible way to write this query in Cypher. There
may be other ways to accomplish the same task, depending on the specific
requirements of your use case.

# Extract the Cypher text from the model's reply. The reply may wrap the query
# in a ```cypher fenced block surrounded by prose, or may be the bare query.
fence_open = "```cypher"
fence_pos = cypher_query.find(fence_open)
if fence_pos == -1:
    # No fenced block: str.find returned -1, so fall back to the whole reply
    # instead of slicing from a meaningless offset.
    start_index, end_index = 0, len(cypher_query)
else:
    start_index = fence_pos + len(fence_open)
    end_index = cypher_query.find("```", start_index)
    if end_index == -1:
        # Opening fence without a closing one: take everything after it
        # rather than silently dropping the last character.
        end_index = len(cypher_query)
# Extract the Cypher query text
extracted_cypher_query = cypher_query[start_index:end_index].strip()
extracted_cypher_query

"MATCH (d1:DISEASE)-[r1]-(d2:DISEASE)\nWHERE toLower(d1.Name) CONTAINS 'child' AND toLower(d1.Name) CONTAINS 'eiplepsy'\nRETURN d1, d2"

query_neo4j(extracted_cypher_query)

[]

import networkx as nx
import matplotlib.pyplot as plt


def query_neo4j(cypher_query):
    """Run *cypher_query* on the module-level ``driver`` and return its records.

    This redefines the earlier ``query_neo4j``: it returns the driver's record
    objects themselves in a list, not ``record.data()`` dicts.
    """
    with driver.session() as session:
        result = session.run(cypher_query)
        return list(result)  # Collect the result records into a list


# Example Cypher query
cypher_query = "MATCH (n) RETURN n LIMIT 10"
# Execute the Cypher query
result = query_neo4j(cypher_query)


def generate_network(result):
    """Build an undirected graph from records that expose a node under key 'n'.

    Each record contributes one graph node, identified by the node's
    'hgnc_symbol' value and carrying all of the node's properties as
    attributes. An edge is then added between every pair of distinct ids.

    Args:
        result: Re-iterable collection of records (it is traversed more than once).

    Returns:
        The populated ``networkx.Graph``.
    """
    G = nx.Graph()
    for record in result:
        # Extract node properties
        node_data = record['n']
        node_id = node_data['hgnc_symbol']  # Assuming 'hgnc_symbol' uniquely identifies nodes
        # Add node with properties to the graph
        G.add_node(node_id, **node_data)

    # Add edges to the graph
    for record in result:
        # Assuming you want to connect nodes based on their 'hgnc_symbol'
        node_id1 = record['n']['hgnc_symbol']
        for record_inner in result:
            node_id2 = record_inner['n']['hgnc_symbol']
            if node_id1 != node_id2:
                G.add_edge(node_id1, node_id2)

    return G


G = generate_network(result)


# nx.draw(G)
def visualize_network(G):
    """Draw *G* with a spring layout and bold node labels, then show the figure."""
    plt.figure(figsize=(10, 6))
    pos = nx.spring_layout(G)
    nx.draw(G, pos, with_labels=True, font_weight='bold')
    plt.show()


# Visualize the network
visualize_network(G)

result

[<Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:0' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36587', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': '', 'name': 'ribosomal protein L13 pseudogene 9', 'hgnc_symbol': 'RPL13P9', 'entrezgene_id': '100270927', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': 'be00bf9f-db9e-469b-8d28-eae274c1ae02', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:1' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36588', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': 'ENSG00000241307', 'name': 'ribosomal protein L7a pseudogene 24', 'hgnc_symbol': 'RPL7AP24', 'entrezgene_id': '100271041', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': '1e430b09-c8b6-4535-922d-f7a40036fdf7', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:2' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36589', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': '', 'name': 'ribosomal protein S28 pseudogene 3', 'hgnc_symbol': 'RPS28P3', 'entrezgene_id': '644862', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': '81e22ae5-7271-4600-8398-ad894451c275', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:3' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:3659', 'uniprotswissprot': 'Q99689', 'taxonomy_id': '9606', 'ensembl_gene_id': 'ENSG00000149557', 'name': 'fasciculation and elongation protein zeta 1', 'hgnc_symbol': 'FEZ1', 'entrezgene_id': '9638', 'alias_name': 'zygin I', 'taxonomy_name': 'Homo sapiens', 'uuid': '6944af3f-a769-496c-9625-e222c42c1a5f', 'alias_symbol': 'UNC-76'}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:4' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36590', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': '', 'name': 'ribosomal protein lateral stalk subunit P2 pseudogene 2', 'hgnc_symbol': 'RPLP2P2', 'entrezgene_id': '644198', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': '1107808b-e720-4e01-9c5a-c4835d1dfd7b', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:5' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36591', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': 'ENSG00000242899', 'name': 'ribosomal protein L7 pseudogene 16', 'hgnc_symbol': 'RPL7P16', 'entrezgene_id': '729677', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': 'd91235a9-9d96-424b-81e8-8254122cb624', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:6' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36592', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': 'ENSG00000227331', 'name': 'ribosomal protein L7a pseudogene 22', 'hgnc_symbol': 'RPL7AP22', 'entrezgene_id': '100271039', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': '4608d30d-ac49-4c90-b6e7-d7687c3d852a', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:7' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36593', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': 'ENSG00000241746', 'name': 'ribosomal protein L13a pseudogene 18', 'hgnc_symbol': 'RPL13AP18', 'entrezgene_id': '402342', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': 'bfcf876c-937b-4fa7-accd-2667707d5d23', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:8' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36594', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': 'ENSG00000231622', 'name': 'ribosomal protein S29 pseudogene 7', 'hgnc_symbol': 'RPS29P7', 'entrezgene_id': '100128377', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': '3b59b694-af4f-454d-b154-b18f3918ba0f', 'alias_symbol': ''}>>,
 <Record n=<Node element_id='4:93f2c4f5-2aa5-4f3d-bb2a-4810c83102d1:9' labels=frozenset({'GENE'}) properties={'hgnc_id': 'HGNC:36595', 'uniprotswissprot': '', 'taxonomy_id': '9606', 'ensembl_gene_id': 'ENSG00000215184', 'name': 'ribosomal protein S12 pseudogene 16', 'hgnc_symbol': 'RPS12P16', 'entrezgene_id': '100271352', 'alias_name': '', 'taxonomy_name': 'Homo sapiens', 'uuid': 'b5116bf4-fb54-4d28-956b-e427d483a739', 'alias_symbol': ''}>>]

type(result); len(result)

10

G.nodes()

NodeView(('RPL13P9', 'RPL7AP24', 'RPS28P3', 'FEZ1', 'RPLP2P2', 'RPL7P16', 'RPL7AP22', 'RPL13AP18', 'RPS29P7', 'RPS12P16'))

MoA similarity

2024-05-22

Python demo

Summarize MoA with an LLM

Calculate similarities

This demo shows:

1. Querying Neo4j from Python

2. Interacting with a locally running llama2 LLM model

3. Generating Cypher based on the given Neo4j database schema

Query Neo4j using natural language

Network visualization