Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-863gntc58 Umlspt2ch #322

Merged
merged 7 commits into from
Jun 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 117 additions & 9 deletions medcat/utils/preprocess_umls.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@

from typing import List, Union
import pandas as pd
import tqdm
import os
from typing import Dict, Set

_DEFAULT_COLUMNS: list = [
"CUI",
Expand All @@ -20,7 +23,7 @@
"STR",
"SRL",
"SUPPRESS",
"CVF",
"CVF",
]

_DEFAULT_SEM_TYPE_COLUMNS: list = [
Expand All @@ -32,12 +35,24 @@
"CVF",
]

# Column names applied, in order, to MRHIER.RRF when it is loaded
# (the file is read with ``header=None``, so these names are the only header).
_DEFAULT_MRHIER_COLUMNS: list = [
    "CUI", "AUI", "CXN",
    "PAUI", "SAB", "RELA",
    "PTR", "HCD", "CVF",
]

# Maps UMLS column names to the column names used in the concept CSV.
# Each key must appear only once: a repeated key in a dict literal is
# silently overwritten by the later entry.
medcat_csv_mapper: dict = {
    'CUI': 'cui',
    'STR': 'name',
    'SAB': 'ontologies',
    'ISPREF': 'name_status',
    'TUI': 'type_ids',  # from MRSTY.RRF
}


Expand All @@ -57,11 +72,13 @@ class UMLS:
def __init__(self, main_file_name: str, sem_types_file: str, allow_languages: list = ['ENG'], sep: str = '|'):
    """Store the input file locations and the parsing configuration.

    Args:
        main_file_name (str): Path to the main concept file (the one read
            with ``self.main_columns``; presumably MRCONSO.RRF).
        sem_types_file (str): Path to the semantic types file (the one read
            with ``self.sem_types_columns``; presumably MRSTY.RRF).
        allow_languages (list): Values of the ``LAT`` column to keep.
            A falsy value (``None`` or an empty list) disables the
            language filter. Defaults to ['ENG'].
        sep (str): Field separator used in the files. Defaults to '|'.
    """
    self.main_file_name = main_file_name
    self.sem_types_file = sem_types_file
    # Each instance gets its own copy of the column lists so that changing
    # them on one instance cannot alter the module-level defaults.
    self.main_columns = list(_DEFAULT_COLUMNS)
    self.sem_types_columns = list(_DEFAULT_SEM_TYPE_COLUMNS)
    self.mrhier_columns = list(_DEFAULT_MRHIER_COLUMNS)
    self.sep = sep
    # Copy the list so the shared default argument is never handed out;
    # a falsy value is stored as-is and means "do not filter".
    # NOTE: the attribute name is misspelled ("langugages") but other
    # methods read it under this exact name, so it has to stay.
    self.allow_langugages = list(
        allow_languages) if allow_languages else allow_languages

def to_concept_df(self) -> pd.DataFrame:
"""Create a concept DataFrame.
Expand All @@ -72,7 +89,8 @@ def to_concept_df(self) -> pd.DataFrame:
"""
# target columns:
# cui, name, name_status, ontologies, description_type_ids, type_ids
df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False)
df = pd.read_csv(self.main_file_name,
names=self.main_columns, sep=self.sep, index_col=False)

# filter languages
if self.allow_langugages:
Expand All @@ -82,7 +100,8 @@ def to_concept_df(self) -> pd.DataFrame:

# get TUI

sem_types = pd.read_csv(self.sem_types_file, names=self.sem_types_columns, sep=self.sep, index_col=False)
sem_types = pd.read_csv(
self.sem_types_file, names=self.sem_types_columns, sep=self.sep, index_col=False)
df = df.merge(sem_types)

# rename columns
Expand All @@ -109,7 +128,8 @@ def map_umls2snomed(self) -> pd.DataFrame:
Returns:
pd.DataFrame: Dataframe that contains the SCUI (source CUI) as well as the UMLS CUI for each applicable concept
"""
df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False, dtype={'SCUI': 'str'})
df = pd.read_csv(self.main_file_name, names=self.main_columns,
sep=self.sep, index_col=False, dtype={'SCUI': 'str'})
# get only SNOMED-CT US based concepts that have a SNOMED-CT (source) CUI
df = df[df.SAB == 'SNOMEDCT_US'][df.SCUI.notna()]
# sort by SCUI
Expand Down Expand Up @@ -154,7 +174,8 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame:
Returns:
pd.DataFrame: DataFrame that has the target source codes
"""
df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False, dtype={'CODE': 'str'})
df = pd.read_csv(self.main_file_name, names=self.main_columns,
sep=self.sep, index_col=False, dtype={'CODE': 'str'})
# get the specified source(s)
if isinstance(sources, list):
df = df[df.SAB.isin(sources)][df.CODE.notna()]
Expand All @@ -166,6 +187,76 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame:
df = df[['CODE',] + [col for col in df.columns.values if col != 'CODE']]
return df

def get_pt2ch(self) -> dict:
    """Generates a parent to children dict.

    It goes through all the 'isa' rows of MRHIER.RRF that belong to a
    preferred atom (``ISPREF == 'Y'``) of an allowed language, and resolves
    the parent atom (``PAUI``) of each such row to the CUI of that atom.

    The resulting dictionary maps a CUI to a list of CUIs that
    consider that CUI as their parent.

    Rows are left out when:
        - they have no parent atom,
        - the parent atom resolves to the concept itself,
        - the parent atom is not present in the (language filtered)
          main file, so it cannot be resolved to a CUI.

    PS:
        This expects the MRHIER.RRF file to also exist in the same folder
        as the MRCONSO.RRF file.

    Raises:
        ValueError: If the MRHIER.RRF file wasn't found

    Returns:
        dict: The dictionary of parent CUI and their children
            (each list of children is sorted and free of duplicates).
    """
    # os.path handles bare file names and platform specific separators,
    # which splitting on '/' by hand does not.
    path = os.path.dirname(self.main_file_name) or os.curdir
    hier_file = os.path.join(path, "MRHIER.RRF")

    if not os.path.exists(hier_file):
        raise ValueError(
            f'Expected MRHIER.RRF to exist within the same parent folder ({path})')

    conso_df = pd.read_csv(self.main_file_name, names=self.main_columns,
                           sep=self.sep, index_col=False)

    hier_df = pd.read_csv(hier_file, sep=self.sep, index_col=False,
                          header=None, names=self.mrhier_columns)

    # filter languages
    if self.allow_langugages:
        conso_df = conso_df[conso_df["LAT"].isin(self.allow_langugages)]

    # create a AUI -> CUI map; this has to happen before the non-preferred
    # atoms are dropped, because a parent atom need not be a preferred one
    aui_cui = dict(zip(conso_df["AUI"], conso_df["CUI"]))

    # remove non-preferred from conso
    conso_df = conso_df[conso_df['ISPREF'] == 'Y']

    # filter ISA relationships
    hier_df = hier_df[hier_df['RELA'] == 'isa']

    # merge dataframes
    merged_df = pd.merge(conso_df, hier_df, on=['AUI', 'CUI'])

    # only keep CUI and parent AUI
    cui_parent = merged_df[['CUI', 'PAUI']]
    # only include CUIs with a parent
    cui_parent = cui_parent[cui_parent['PAUI'].notna()]

    # collect the children in sets so that duplicates collapse
    children_of: Dict[str, Set[str]] = {}
    # iterating the two columns directly avoids building a Series per row
    pairs = zip(cui_parent['CUI'], cui_parent['PAUI'])
    for cur_cui, paui in tqdm.tqdm(pairs, total=len(cui_parent.index)):
        parent_cui = aui_cui.get(paui)
        # the parent atom may be missing from the filtered main file;
        # such a row cannot be resolved to a CUI and is skipped
        if parent_cui is None:
            continue
        # avoid self as parent/child
        if parent_cui == cur_cui:
            continue
        children_of.setdefault(parent_cui, set()).add(cur_cui)
    # move from set to list for consistency with SNOMED;
    # sorting makes the output reproducible between runs
    return {parent: sorted(children)
            for parent, children in children_of.items()}


if __name__ == '__main__':
import sys
Expand All @@ -187,3 +278,20 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame:
to_ICD10_man = umls.map_umls2source(sources=['ICD10'])
print('As ICD-10(MAN):')
print(to_ICD10_man.head())
pt2ch = umls.get_pt2ch()
print('Get parent-child dict', len(pt2ch),
'' if len(pt2ch) > 1_000 else pt2ch)
all_vals = [len(v) for v in pt2ch.values()]
print('LEN of VALS:', sum(all_vals), 'max',
max(all_vals), 'min', min(all_vals), 'mean', sum(all_vals) / len(all_vals))
import random
random_4_keys = random.sample(list(pt2ch.keys()), k=4)

def _get_name(cui: str) -> str:
    """Return the name stored for ``cui`` in the module-level ``df``.

    ``df`` is created earlier in this script (outside the visible part) and
    is expected to have the columns 'cui' and 'name'. The name of the first
    matching row is returned, or 'N/A' when no row matches.
    """
    is_match = df['cui'] == cui
    if not is_match.any():
        return 'N/A'  # UNKNOWN
    return df.loc[is_match, 'name'].iloc[0]
print('FF RAW ', [f"{k}:{pt2ch[k]}" for k in random_4_keys])
print('FIRST FEW', [
(f"{_get_name(key)} ({key})", [f"{_get_name(child)} ({child})" for child in pt2ch[key]]) for key in random_4_keys])