Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-8692wgmkm: Add neo4J connector that was removed from MedCAT #13

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions helper/neo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
This folder contains the code removed from MedCAT with the following PR:
https://github.com/CogStack/MedCAT/pull/356

It may be worth rewriting this with some other dependency. But we'll keep it here for now.
231 changes: 231 additions & 0 deletions helper/neo/data_preparation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
import os
import pandas as pd


def get_index_queries():
"""Run before everything to speed up things."""
return ['CREATE INDEX patientId FOR (p:Patient) ON (p.patientId);',
'CREATE INDEX conceptId FOR (c:Concept) ON (c.conceptId);',
'CREATE INDEX documentId FOR (d:Document) ON (d.documentId);']


def create_neo_csv(data, columns, output_dir='/etc/lib/neo4j/import/',
base_name='patients'):
"""Creates a patients CSV for neo4j load csv function

Args:
data:
A dataframe or path to a dataframe with the required data.
columns:
What data to use from the dataframe.
output_dir:
Where to save the CSVs, should be the neo4j imports path if possible.
base_name:
Name of the csv.
"""
if isinstance(data, pd.DataFrame):
df = data
else:
df = pd.read_csv(data)

# Remove duplicates
df = df.drop_duplicates(subset=columns)

out_df = df[columns]
data_path = os.path.join(output_dir, f"{base_name}.csv")
out_df.to_csv(data_path, index=False)


def create_patients_csv(data, output_dir='/etc/lib/neo4j/import/',
base_name='patients'):
"""Creates a patients CSV for neo4j load csv function

Args:
data:
A dataframe or path to a dataframe with the required data: patientId,
sex, ethnicity, dob.
output_dir:
Where to save the CSVs, should be the neo4j imports path if possible,
but writing there could be only admin.

Returns:
str: The query.
"""
query = (
'USING PERIODIC COMMIT 100000 \n'
f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
'CREATE (:Patient {patientId: toString(row.patientId), \n'
' sex: toString(row.sex), \n'
' ethnicity: toString(row.ethnicity), \n'
' dob: datetime(row.dob)}) \n'
)

create_neo_csv(data=data, columns=['patientId', 'sex', 'ethnicity', 'dob'],
output_dir=output_dir, base_name=base_name)

return query


def create_documents_csv(data, output_dir='/etc/lib/neo4j/import/',
base_name='documents'):
"""Creates a patients CSV for neo4j load csv function

Args:
data:
A dataframe or path to a dataframe with the required data: documentId.
output_dir:
Where to save the CSVs, should be the neo4j imports path if possible.

Returns:
str: The query.
"""
query = (
'USING PERIODIC COMMIT 100000 \n'
f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
'CREATE (:Document {documentId: toString(row.documentId)}) \n'
)

create_neo_csv(data=data, columns=['documentId'],
output_dir=output_dir, base_name=base_name)

return query


def create_concepts_csv(data, output_dir='/etc/lib/neo4j/import/',
base_name='concepts'):
"""Creates a patients CSV for neo4j load csv function

Args:
data:
A dataframe or path to a dataframe with the required data: conceptId,
name and type.
output_dir:
Where to save the CSVs, should be the neo4j imports path if possible.
"""
query = (
'USING PERIODIC COMMIT 100000 \n'
f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
'CREATE (:Concept {conceptId: toString(row.conceptId), \n'
' type: toString(row.type), \n'
' name: toString(row.name)}) \n'
)

create_neo_csv(data=data, columns=['conceptId', 'name', 'type'],
output_dir=output_dir, base_name=base_name)

return query


def create_document2patient_csv(data, output_dir='/etc/lib/neo4j/import/',
base_name='document2patient'):

"""Creates a patients CSV for neo4j load csv function

Args:
data:
A dataframe or path to a dataframe with the required data: patientId and
documentId.
output_dir:
Where to save the CSVs, should be the neo4j imports path if possible.
"""
query = (
'USING PERIODIC COMMIT 100000 \n'
f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
'MATCH (pt:Patient {patientId: toString(row.patientId)}) \n'
'MATCH (doc:Document {documentId: toString(row.documentId)}) \n'
'CREATE (pt)-[:HAS]->(doc); \n'
)

create_neo_csv(data=data, columns=['patientId', 'documentId'],
output_dir=output_dir, base_name=base_name)

return query


def create_concept_ontology_csv(data, output_dir='/etc/lib/neo4j/import/',
base_name='concept_ontology'):

"""Creates a patients CSV for neo4j load csv function

Args:
data:
A dataframe or path to a dataframe with the required data: child, parent.
output_dir:
Where to save the CSVs, should be the neo4j imports path if possible.
"""
query = (
'USING PERIODIC COMMIT 100000 \n'
f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
'MATCH (child:Concept {conceptId: toString(row.child)}) \n'
'MATCH (parent:Concept {conceptId: toString(row.parent)}) \n'
'CREATE (child)-[:IS_A]->(parent); \n'
)

create_neo_csv(data=data, columns=['child', 'parent'],
output_dir=output_dir, base_name=base_name)

return query


def create_document2concept_csv(data, output_dir='/etc/lib/neo4j/import/',
base_name='document2concepts'):
"""Creates a patients CSV for neo4j load csv function

Args:
data:
A dataframe or path to a dataframe with the required data: 'conceptId',
'documentId', 'contextSimilarity', 'start', 'end', 'timestamp',
'metaSubject', 'metaPresence', 'metaTime'.
output_dir:
Where to save the CSVs, should be the neo4j imports path if possible.
"""
query = (
'USING PERIODIC COMMIT 100000 \n'
f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
'MATCH (doc:Document{documentId: toString(row.documentId)}) \n'
'MATCH (concept:Concept {conceptId: toString(row.conceptId)}) \n'
'CREATE (doc)-[:HAS {start: toInteger(row.start), \n'
' end: toInteger(row.end), \n'
' timestamp: toInteger(row.timestamp), \n'
' contextSimilarity: toFloat(row.contextSimilarity), \n'
' metaSubject: toString(row.metaSubject), \n'
' metaPresence: toString(row.metaPresence), \n'
' metaTime: toString(row.metaTime) \n'
' }]->(concept); \n'
)

columns = ['conceptId', 'documentId', 'contextSimilarity', 'start',
'end', 'timestamp', 'metaSubject', 'metaPresence', 'metaTime']

create_neo_csv(data=data, columns=columns,
output_dir=output_dir, base_name=base_name)

return query


def get_data_from_docs(docs, doc2pt, doc2time=None):
data = [['conceptId', 'documentId', 'contextSimilarity',
'start', 'end', 'timestamp', 'metaSubject',
'metaPresence', 'metaTime']]

for doc_id, doc in docs.items():
row = []
for ent in doc['entities'].values():
#if ent['meta_anns']['Subject']['value'] == 'Patient' and \
# ent['meta_anns']['Presence']['value'] == 'True':
if doc2time is not None:
t = doc2time[doc_id]
else:
t = ent['document_timestamp']

row = [ent['cui'], doc_id,
ent['context_similarity'],
ent['start'], ent['end'],
t,
ent['meta_anns'].get('Subject', {}).get('value', None),
ent['meta_anns'].get('Presence', {}).get('value', None),
ent['meta_anns'].get('Time', {}).get('value', None)]
data.append(row)
row = []

return data
Loading
Loading