Commit 1e122dbc authored by Yaroslav Zdravomyslov's avatar Yaroslav Zdravomyslov
Browse files

wip

parent e7ba75c0
Pipeline #1682 passed with stage
in 31 seconds
......@@ -18,7 +18,7 @@ class DEFAULT(ConfigBase):
MP_HP_EQUAL_RELATIONSHIP_LABEL = "HAS_CROSS_SPECIES_TERM"
MAYBE_RELATIONSHIP_NAME = "_MAYBE_SIMILAR"
UPHENO_RELATIONSHIP_NAME = "_UPHENO_SIMILAR"
UPHENO_RELATIONSHIP_NAME = "_UPHENO_SIMILAR_WEB"
FORBIDDEN_WORDS = [
"frequency",
......@@ -44,4 +44,6 @@ class DEFAULT(ConfigBase):
"central",
]
UPHENO_RAW_TSV_URL = "https://raw.githubusercontent.com/obophenotype/upheno/master/mappings/hp-to-mp-bestmatches.tsv"
NEO4J: dict = {}
from py2neo import Graph
from py2neo.integration import Table
from graphio import RelationshipSet
import csv
import requests
from requests import Response
from Configs import getConfig
from config import DEFAULT
import csv
config: DEFAULT = getConfig()
def get_mp_terms():
def get_mp_terms() -> Table:
"""get (mp_rdfs, mp_sid) tupels by using cypher"""
neo4j_graph = Graph(**config.NEO4J)
neo4j_graph: Graph = Graph(**config.NEO4J)
query = (
query: str = (
"match (n:"
+ config.MP_NODE_LABEL
+ ") WHERE NOT (n)-[:"
......@@ -22,18 +27,19 @@ def get_mp_terms():
+ config.MP_NODE_SID_CLEAN_ATTRIBUTE
)
print("query:\n", query)
print(f"MP-HP-DISTANCE: get MP rdfs & sid list\n{query}")
query_output = neo4j_graph.run(query).to_table()
query_output: Table = neo4j_graph.run(query).to_table()
return query_output
def get_hp_terms():
def get_hp_terms() -> Table:
"""get (hp_rdfs, hp_sid) tupels by using cypher"""
neo4j_graph = Graph(**config.NEO4J)
neo4j_graph: Graph = Graph(**config.NEO4J)
query = (
query: str = (
"match (n:"
+ config.HP_NODE_LABEL
+ ") WHERE NOT (n)-[:"
......@@ -44,37 +50,44 @@ def get_hp_terms():
+ config.HP_NODE_SID_CLEAN_ATTRIBUTE
)
print("query:\n", query)
print(f"MP-HP-DISTANCE: get HP rdfs & sid list\n{query}")
query_output = neo4j_graph.run(query).to_table()
query_output: Table = neo4j_graph.run(query).to_table()
return query_output
def get_equal_names(mp_rdfs_id: list, hp_rdfs_id: list) -> list:
    """Pair MP and HP terms whose lower-cased rdfs names are identical.

    Rows with an empty/missing name are ignored, and so are names listed in
    ``config.FORBIDDEN_WORDS`` (compared against the lower-cased name).

    Args:
        mp_rdfs_id: rows of MP terms, indexable as (rdfs name, id).
        hp_rdfs_id: rows of HP terms, indexable as (rdfs name, id).

    Returns:
        A list of ``(mp id, hp id)`` tuples, ordered by MP row and then by
        HP row, one tuple per matching MP/HP combination.
    """
    print("MP-HP-DISTANCE: search for equal rdfs names")
    forbidden_words = frozenset(config.FORBIDDEN_WORDS)

    # Index the HP ids by name once, so every MP row needs a single dict
    # lookup instead of a scan over all HP rows.
    hp_ids_by_name: dict = {}
    for hp_entry in hp_rdfs_id:
        if not hp_entry[0]:
            continue
        hp_name = hp_entry[0].lower()
        if hp_name in forbidden_words:
            continue
        hp_ids_by_name.setdefault(hp_name, []).append(hp_entry[1])

    found_equals: list = []
    for mp_entry in mp_rdfs_id:
        if not mp_entry[0]:
            continue
        # Forbidden names were never indexed, so they cannot match here.
        matching_hp_ids = hp_ids_by_name.get(mp_entry[0].lower(), ())
        found_equals.extend((mp_entry[1], hp_id) for hp_id in matching_hp_ids)

    print(f"MP-HP-DISTANCE: found {len(found_equals)} equal rdfs names")
    return found_equals
def create_maybe_same_relationship(tuple_list):
def create_maybe_relationship(equal_names: list) -> None:
"""create new relationships between given parameter names"""
maybe_relation = RelationshipSet(
maybe_relation_set: RelationshipSet = RelationshipSet(
config.MAYBE_RELATIONSHIP_NAME,
[config.MP_NODE_LABEL],
[config.HP_NODE_LABEL],
......@@ -82,22 +95,26 @@ def create_maybe_same_relationship(tuple_list):
[config.HP_NODE_SID_CLEAN_ATTRIBUTE],
)
for tuple in tuple_list:
for name_tuple in equal_names:
maybe_relation.add_relationship(
{config.MP_NODE_SID_CLEAN_ATTRIBUTE: tuple[0]},
{config.HP_NODE_SID_CLEAN_ATTRIBUTE: tuple[1]},
maybe_relation_set.add_relationship(
{config.MP_NODE_SID_CLEAN_ATTRIBUTE: name_tuple[0]},
{config.HP_NODE_SID_CLEAN_ATTRIBUTE: name_tuple[1]},
)
neo4j_graph = Graph(**config.NEO4J)
print(f"MP-HP-DISTANCE: create {config.MAYBE_RELATIONSHIP_NAME} relationships")
neo4j_graph: Graph = Graph(**config.NEO4J)
maybe_relation.create_index(neo4j_graph)
maybe_relation.merge(neo4j_graph)
maybe_relation_set.create_index(neo4j_graph)
maybe_relation_set.merge(neo4j_graph)
def create_upheno_relationship():
# method using local stored file (here called 'upheno.tsv') as input
def create_upheno_relationship_local() -> None:
"""create new relationships between hp/mp by upheno approach"""
upheno_relation = RelationshipSet(
upheno_relation_set: RelationshipSet = RelationshipSet(
config.UPHENO_RELATIONSHIP_NAME,
[config.MP_NODE_LABEL],
[config.HP_NODE_LABEL],
......@@ -105,28 +122,77 @@ def create_upheno_relationship():
[config.HP_NODE_SID_ATTRIBUTE],
)
count = 0
count: int = 0
with open("upheno.tsv") as file:
with open("MP-HP-DISTANCE/upheno.tsv") as file:
tsv_file = csv.reader(file, delimiter="\t")
for line in tsv_file:
if line:
count += 1
upheno_relation.add_relationship(
upheno_relation_set.add_relationship(
{config.MP_NODE_SID_ATTRIBUTE: line[2]},
{config.HP_NODE_SID_ATTRIBUTE: line[0]},
)
neo4j_graph = Graph(**config.NEO4J)
print(
f"MP-HP-DISTANCE: create {count} {config.UPHENO_RELATIONSHIP_NAME} relationships"
)
neo4j_graph: Graph = Graph(**config.NEO4J)
upheno_relation_set.create_index(neo4j_graph)
upheno_relation_set.merge(neo4j_graph)
# Variant that pulls the upheno TSV file from the web as input.
# NOTE: the whole response is held in memory. If the file ever gets large,
# a streamed download (e.g. with shutil.copyfileobj) may be the better option:
# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests
def create_upheno_relationship_web() -> None:
    """Create MP -> HP relationships from the upheno mapping file on the web.

    Downloads the TSV at ``config.UPHENO_RAW_TSV_URL``, takes from every row
    the first column starting with ``MP:`` and the first one starting with
    ``HP:``, and merges one ``config.UPHENO_RELATIONSHIP_NAME`` relationship
    per row into the Neo4j instance described by ``config.NEO4J``.

    Raises:
        requests.HTTPError: if the download answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    upheno_relation_set: RelationshipSet = RelationshipSet(
        config.UPHENO_RELATIONSHIP_NAME,
        [config.MP_NODE_LABEL],
        [config.HP_NODE_LABEL],
        [config.MP_NODE_SID_ATTRIBUTE],
        [config.HP_NODE_SID_ATTRIBUTE],
    )

    # A timeout keeps the job from hanging forever on a stalled connection;
    # raise_for_status() prevents an HTML error page from being parsed as TSV.
    tsv_request: Response = requests.get(config.UPHENO_RAW_TSV_URL, timeout=60)
    tsv_request.raise_for_status()

    count: int = 0
    # splitlines() also copes with CRLF line endings, which would otherwise
    # leave a stray "\r" on the last column.
    for tsv_entry in tsv_request.text.splitlines():
        columns = tsv_entry.split("\t")
        mp_id = next((col for col in columns if col.startswith("MP:")), None)
        hp_id = next((col for col in columns if col.startswith("HP:")), None)
        if mp_id is None or hp_id is None:
            # Blank line, header or otherwise incomplete row: nothing to link.
            continue
        count += 1
        upheno_relation_set.add_relationship(
            {config.MP_NODE_SID_ATTRIBUTE: mp_id},
            {config.HP_NODE_SID_ATTRIBUTE: hp_id},
        )

    print(
        f"MP-HP-DISTANCE: create {count} {config.UPHENO_RELATIONSHIP_NAME} relationships"
    )
    neo4j_graph: Graph = Graph(**config.NEO4J)
    upheno_relation_set.create_index(neo4j_graph)
    upheno_relation_set.merge(neo4j_graph)
if __name__ == "__main__":
    # Step 1: link MP and HP terms that share an identical cleaned rdfs name.
    mp_rdfs_id = get_mp_terms()
    hp_rdfs_id = get_hp_terms()
    equal_names = get_equal_names(mp_rdfs_id, hp_rdfs_id)
    create_maybe_relationship(equal_names)

    # Step 2: link MP and HP terms based on the upheno mapping file.
    # create_upheno_relationship_local() does the same from a local copy of
    # the TSV file and can be called here instead of the web variant.
    create_upheno_relationship_web()
Table of content
[[_TOC_]]
# Introduction
HP - MP distance relationship
Maintainer: angela.dedie@helmholtz-muenchen.de, zdravomyslov@helmholtz-muenchen.de
Status: alpha (don't try this at home)
HP: Human Phenotype - `HP_Term` as node label, with cleaned rdfs names as `rdfs__label_clean` property
MP: Mammalian Phenotype - `MP_Term` as node label, with cleaned rdfs names as `rdfs__label_clean` property
HP - MP distance, this project is intended to identify more connections between nodes in hope of discovering similarity.
The following two approaches are taken:
1) connections based on identical `rdfs__label_clean`
2) connections based on the obophenotype/upheno approach
Obophenotype - Upheno (https://github.com/obophenotype/upheno)
hp-to-mp-bestmatches.tsv - generated fuzzy matchings between ontologies (https://github.com/obophenotype/upheno/tree/master/mappings)
All nodes/properties were used/created in our graph db (neo4j - bolt://neo4j0X.connect.dzd-ev.de)
* **What?**
- written in python3, using py2neo & graphio libs
- goal: create more connectivity between HP & MP Term nodes, as similarity approach, by identical stringnames & upheno
* **Why?**
- there are more similarities, and thus connections, than our db currently holds; the aim here is to find additional links, i.e. ones that do not exist in our db yet
- greater coverage of relationships
- initial approach was based on string distance, to identify cluster, which could lead to similarity - no longer exists
* **How?**
- by connecting identical rdfs strings
- by using mappings of obophenotype/upheno
- tldr: search for connections of non connected nodes by "HAS_CROSS_SPECIES_TERM" relationship
# How to run
src: https://git.connect.dzd-ev.de/dzdtools/mp-hp-distance
there are 2 options to run this software:
- git pull & executing locally
- run docker image
**What do i need to run the software?**
Requirements:
* docker (optional)
* git
* running neo4j instance
* set up reqs.txt for python requirements (without docker)
**How do I install the software**
get the latest release
- docker:
`docker pull registry-gl.connect.dzd-ev.de:443/dzdtools/mp-hp-distance:latest`
- alternative:
`ssh://git@git.connect.dzd-ev.de:22022/dzdtools/mp-hp-distance.git`
**How do I start the software**
Start the container with the following docker run command:
`docker run registry-gl.connect.dzd-ev.de:443/dzdtools/mp-hp-distance:latest`
(do you want some `sudo`?)
**How do i inspect the result**
check your neo4j instance for new "_MAYBE_SIMILAR" & "_UPHENO_SIMILAR" relationships
e.g.
`match p=(m:MP_Term)-[r:_UPHENO_SIMILAR]-(h:HP_Term) return p`
# Config
**Why do they matter?**
defining/using your node names/relationships (and neo4j instance)
**What configuration variables are available?**
docker:
- configurable docker variables will follow soon
non docker:
- HP_NODE_LABEL = label name of HP Term nodes
- HP_NODE_RDFS_ATTRIBUTE = property name of cleaned rdfs name of HP Term nodes
- HP_NODE_SID_CLEAN_ATTRIBUTE = property name of HP Term node's cleaned ID
- HP_NODE_SID_ATTRIBUTE = property name of HP Term node's ID
- MP_NODE_LABEL = label name of MP Term nodes
- MP_NODE_RDFS_ATTRIBUTE = property name of cleaned rdfs name of MP Term nodes
- MP_NODE_SID_CLEAN_ATTRIBUTE = property name of MP Term node's cleaned ID
- MP_NODE_SID_ATTRIBUTE = property name of MP Term node's ID
- MP_HP_EQUAL_RELATIONSHIP_LABEL = already existing relationship name in our neo4j db
- MAYBE_RELATIONSHIP_NAME = name of relationship which will be created, based on identical rdfs names
- UPHENO_RELATIONSHIP_NAME = name of relationship which will be created, based on obophenotype/upheno
- FORBIDDEN_WORDS = list of rdfs label names to exclude - why? because names like "left"/"central" would always lead to mismatches
- NEO4J = neo4j instance
- UPHENO_RAW_TSV_URL = link to the upheno tsv file, for download purposes (not the local tsv)
**How do I manipulate them?**
all vars are listed in `config.py`
# Datascheme
`match (h:HP_Term)-[r:HAS_CROSS_SPECIES_TERM]-(m:MP_Term) return count(r)`
small visualisation for mp-hp-distance datascheme (as of April 20th, 2022). For zoom or modification find the schema also on [Miro Board DZD - irgendwohingepackt](https://miro.com/app/board/uXjVOWO7Jxc=/?moveToWidget=3458764523657099100&cot=14)
![MP_HP_visualization](MP-HP-DISTANCE/mp_hp_data_scheme.png)
1) 1132 equal names (867 marked in neo4j01) => 265 found (RDFS == RDFS) 227 without forbidden words
2) asdsad
- there are rdfs labels like "left", "right", "position", "generalized", "frequency"... (╯°□°)╯︵ ┻━┻
-
\ No newline at end of file
todo
- use logging instead of print?
- more logging for such a small project?
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment