RECETOX · hechth · Jun 18, 2021 · Apr 21, 2021 · Apr 21, 2021 · Apr 21, 2021
diff --git a/RIAssigner/compute/ComputationMethod.py b/RIAssigner/compute/ComputationMethod.py
@@ -1,10 +1,14 @@
 from abc import ABC, abstractmethod
-from RIAssigner.data import Data
 from typing import List
+from RIAssigner.data import Data
 
 
 class ComputationMethod(ABC):
 
     @abstractmethod
-    def compute(self, query: Data, reference: Data) -> List[int]:
+    def compute(self, query: Data, reference: Data) -> List[float]:
         ...
+
+    def _check_data_args(self, query, reference):
+        """ Validate that both datasets were supplied.
+
+        Parameters
+        ----------
+        query:
+            Query dataset passed to 'compute'.
+        reference:
+            Reference dataset passed to 'compute'.
+
+        Raises
+        ------
+        AssertionError
+            If 'query' or 'reference' is 'None'.
+        """
+        # Raise explicitly instead of using 'assert' statements: asserts are removed
+        # when Python runs with '-O', which would silently disable this validation.
+        # Exception type and messages are the same as before.
+        if query is None:
+            raise AssertionError("Query data is 'None'.")
+        if reference is None:
+            raise AssertionError("Reference data is 'None'.")
diff --git a/RIAssigner/compute/Kovats.py b/RIAssigner/compute/Kovats.py
@@ -0,0 +1,72 @@
+from typing import List, Iterable
+from RIAssigner.data.Data import Data
+from .ComputationMethod import ComputationMethod
+
+
+class Kovats(ComputationMethod):
+    """ Compute retention indices with the Kovats method. """
+
+    def compute(self, query: Data, reference: Data) -> List[float]:
+        """ Compute the non-isothermal Kovats retention index for each query entry.
+        For details see https://webbook.nist.gov/chemistry/gc-ri/.
+
+        Parameters
+        ----------
+        query:
+            Dataset for which to compute retention indices.
+        reference:
+            Dataset whose retention times and retention indices serve as
+            reference points.
+
+        Returns
+        -------
+        List[float]
+            One entry per query retention time, in the same order. An entry is
+            'None' where 'Data.is_valid' rejects the query retention time.
+        """
+
+        self._check_data_args(query, reference)
+
+        # Work on copies with the point (0, 0) prepended, so that queries eluting
+        # before the first reference compound still have a lower bound.
+        rts = [0.0, *reference.retention_times]
+        ris = [0.0, *reference.retention_indices]
+
+        # The bounds found for one query are the starting point for the next one.
+        lower, higher = 0, 0
+        computed = []
+
+        for query_rt in query.retention_times:
+            if not Data.is_valid(query_rt):
+                computed.append(None)
+                continue
+            lower, higher = _get_bound_indices(query_rt, rts, lower, higher)
+            computed.append(_compute_ri(query_rt, rts, ris, lower, higher))
+
+        return computed
+
+
+def _get_bound_indices(target_rt: float, reference_rts: List[float], lower_index: int, higher_index: int):
+    """ Get the indices of the previously eluting and next eluting reference compounds.
+    Retention times in 'Data' objects are sorted in ascending order, so this method assumes
+    that 'reference_rts' is sorted in ascending order.
+
+    Parameters
+    ----------
+    target_rt
+        Retention time for which to find the surrounding reference compounds.
+    reference_rts
+        Retention times of reference compounds. Must support indexing and 'len'.
+    lower_index
+        Lower bound found for the previous target, used as starting point.
+    higher_index
+        Higher bound found for the previous target, used as starting point.
+
+    Returns
+    -------
+    Tuple '(lower_index, higher_index)' of positions in 'reference_rts'.
+
+    """
+    last_index = len(reference_rts) - 1
+
+    # The higher bound must never be the first entry: otherwise a target equal to the
+    # first reference time leaves both bounds on index 0 and the interpolation in
+    # '_compute_ri' divides by zero.
+    higher_index = max(higher_index, 1)
+
+    if target_rt > max(reference_rts) or higher_index > last_index:
+        # Target elutes after the last reference compound (or there is no second
+        # entry to advance to): use the end of the list as higher bound.
+        higher_index = last_index
+    else:
+        while reference_rts[higher_index] < target_rt:
+            higher_index += 1
+    lower_index = max(lower_index, higher_index - 1)
+    return lower_index, higher_index
+
+
+def _compute_ri(
+        target_rt: float,
+        reference_rts: Iterable[Data.RetentionTimeType],
+        reference_ris: Iterable[Data.RetentionIndexType],
+        lower_index: int,
+        higher_index: int):
+    """ Compute the retention index of 'target_rt' from its two bounding reference compounds.
+
+    Parameters
+    ----------
+    target_rt
+        Retention time for which to compute the retention index.
+    reference_rts
+        Retention times of reference compounds.
+    reference_ris
+        Retention indices of reference compounds.
+    lower_index
+        Position of the lower bounding reference compound.
+    higher_index
+        Position of the higher bounding reference compound.
+
+    Returns
+    -------
+    The retention index at 'lower_index' plus 100 times the fraction of the
+    interval between the two bounding retention times covered by 'target_rt'.
+    """
+    lower_rt = reference_rts[lower_index]
+    rt_offset = target_rt - lower_rt
+    rt_span = reference_rts[higher_index] - lower_rt
+
+    return 100 * rt_offset / rt_span + reference_ris[lower_index]
diff --git a/RIAssigner/compute/__init__.py b/RIAssigner/compute/__init__.py
@@ -0,0 +1,8 @@
+import logging
+from .Kovats import Kovats
+
+# Attach a do-nothing handler to the package logger, so the package itself never
+# configures log output; that is left to the application using it.
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+__all__ = [
+    "Kovats",
+]
diff --git a/RIAssigner/data/Data.py b/RIAssigner/data/Data.py
@@ -1,18 +1,24 @@
 from abc import ABC, abstractmethod
 from typing import Iterable, Optional
-from pint import Unit, UnitRegistry
+from pint import UnitRegistry
+from pint.unit import build_unit_class
 
 
 class Data(ABC):
     """ Base class for data managers. """
     RetentionTimeType = Optional[float]
     RetentionIndexType = Optional[float]
+    URegistry = UnitRegistry()
+    Unit = build_unit_class(URegistry)
+
+    @staticmethod
+    def is_valid(rt: RetentionTimeType) -> bool:
+        """ Check whether a retention time is usable, i.e. it is set and not negative. """
+        if rt is None:
+            return False
+        return rt >= 0.0
 
     def __init__(self, filename: str, rt_unit: str = 'seconds'):
         self._filename = filename
         self._rt_unit = rt_unit
-        self._unit = Unit(self._rt_unit)
-        self._ureg = UnitRegistry()
+        self._unit = Data.Unit(self._rt_unit)
         self.read()
 
     @abstractmethod

diff --git a/RIAssigner/data/MatchMSData.py b/RIAssigner/data/MatchMSData.py
@@ -30,15 +30,15 @@ def _read_spectra(self, filename):
 
     def _read_retention_times(self):
         """ Read retention times from spectrum metadata. """
-        self._retention_times = self._ureg.Quantity([safe_read_key(spectrum, 'retentiontime') for spectrum in self._spectra], self._unit)
+        self._retention_times = Data.URegistry.Quantity([safe_read_key(spectrum, 'retentiontime') for spectrum in self._spectra], self._unit)
 
     def _read_retention_indices(self):
         """ Read retention indices from spectrum metadata. """
         self._retention_indices = [safe_read_key(spectrum, 'retentionindex') for spectrum in self._spectra]
 
     def _sort_spectra_by_rt(self):
         """ Sort objects (peaks) in spectra list by their retention times. """
-        self._spectra.sort(key=lambda spectrum: spectrum.metadata['retentiontime'])
+        self._spectra.sort(key=lambda spectrum: safe_read_key(spectrum, 'retentiontime'))
 
     @property
     def retention_times(self) -> Iterable[Data.RetentionTimeType]:
@@ -91,7 +91,9 @@ def _spectrum_has_rt(spectrum: Spectrum) -> bool:
     has_key = 'retentiontime' in spectrum.metadata.keys()
     if not has_key:
         return False
-    return True
+    else:
+        value = safe_read_key(spectrum, 'retentiontime')
+        return value is not None
 
 
 def _assign_ri_value(spectrum: Spectrum, value: int):

diff --git a/tests/data/kovats/PFAS_added_rt.npy b/tests/data/kovats/PFAS_added_rt.npy
diff --git a/tests/data/kovats/aplcms_aligned_peaks.npy b/tests/data/kovats/aplcms_aligned_peaks.npy
diff --git a/tests/data/kovats/xcms_variable_metadata.npy b/tests/data/kovats/xcms_variable_metadata.npy
diff --git a/tests/data/msp/PFAS_added_rt.msp b/tests/data/msp/PFAS_added_rt.msp
@@ -5,7 +5,7 @@ IONMODE: Negative
 FORMULA: C10H18F3NO4S2
 SMILES: O=C(NC(C)(C)CS(=O)(=O)O)CCSCCC(F)(F)F
 INCHIKEY: DLTHJDKHDLWLAE-UHFFFAOYSA-N
-RETENTIONTIME: 188.9
+RETENTIONTIME: 556.9
 CCS: 
 ONTOLOGY: PFSA
 COMMENT: FT-thioether
@@ -22,7 +22,7 @@ IONMODE: Negative
 FORMULA: C11H18F5NO4S2
 SMILES: O=C(NC(C)(C)CS(=O)(=O)O)CCSCCC(F)(F)C(F)(F)F
 INCHIKEY: NFSFEPNLUWBEDN-UHFFFAOYSA-N
-RETENTIONTIME: 1.2
+RETENTIONTIME: 80.2
 CCS: 
 ONTOLOGY: PFSA
 COMMENT: FT-thioether
@@ -39,7 +39,7 @@ IONMODE: Negative
 FORMULA: C12H18F7NO4S2
 SMILES: O=C(NC(C)(C)CS(=O)(=O)O)CCSCCC(F)(F)C(F)(F)C(F)(F)F
 INCHIKEY: TUKWMSPJOXGNKX-UHFFFAOYSA-N
-RETENTIONTIME: 
+RETENTIONTIME: 127.8
 CCS: 
 ONTOLOGY: PFSA
 COMMENT: FT-thioether
@@ -56,7 +56,7 @@ IONMODE: Negative
 FORMULA: C13H18F9NO4S2
 SMILES: O=C(NC(C)(C)CS(=O)(=O)O)CCSCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)F
 INCHIKEY: VIHJAWYQCFERKN-UHFFFAOYSA-N
-RETENTIONTIME: 17.4
+RETENTIONTIME: 217.4
 CCS: 
 ONTOLOGY: PFSA
 COMMENT: FT-thioether
@@ -73,7 +73,7 @@ IONMODE: Negative
 FORMULA: C14H18F11NO4S2
 SMILES: O=C(NC(C)(C)CS(=O)(=O)O)CCSCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F
 INCHIKEY: IEEDVXQCSFOQPF-UHFFFAOYSA-N
-RETENTIONTIME: 10.5
+RETENTIONTIME: 310.5
 CCS: 
 ONTOLOGY: PFSA
 COMMENT: FT-thioether
@@ -90,7 +90,7 @@ IONMODE: Negative
 FORMULA: C15H18F13NO4S2
 SMILES: O=C(NC(C)(C)CS(=O)(=O)O)CCSCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F
 INCHIKEY: DKVATWZCDNHYCW-UHFFFAOYSA-N
-RETENTIONTIME: 0.45
+RETENTIONTIME: 440.45
 CCS: 
 ONTOLOGY: PFSA
 COMMENT: FT-thioether

diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py
@@ -0,0 +1,8 @@
+from .data import reference_alkanes
+from .data import queries
+
+
+__all__ = [
+    "reference_alkanes",
+    "queries",
+]
diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py
@@ -0,0 +1,29 @@
+import os
+import numpy
+import pytest
+from RIAssigner.data import PandasData
+from RIAssigner.data import MatchMSData
+
+
+# Directory containing this module; the test data folder is resolved relative to it.
+here = os.path.abspath(os.path.dirname(__file__))
+data_location = os.path.join(here, os.pardir, "data")
+# Maps a data file extension to the class used to load files of that type.
+data_type_map = {
+    ".msp": MatchMSData,
+    ".csv": PandasData
+}
+
+
+@pytest.fixture
+def reference_alkanes():
+    """ Load the reference data from 'Alkanes_20210325.csv', passing 'min' as second argument. """
+    alkanes_csv = os.path.join(data_location, "csv", "Alkanes_20210325.csv")
+    return PandasData(alkanes_csv, 'min')
+
+
+@pytest.fixture(params=["aplcms_aligned_peaks.csv", "xcms_variable_metadata.csv", "PFAS_added_rt.msp"])
+def queries(request):
+    """ Provide a query dataset together with its expected retention indices.
+
+    The data file is looked up in the subfolder named after its extension (without
+    the dot); the expected values come from the '.npy' file with the same basename
+    in the 'kovats' subfolder.
+    """
+    stem, suffix = os.path.splitext(request.param)
+
+    data_path = os.path.join(data_location, suffix[1:], request.param)
+    expected_path = os.path.join(data_location, "kovats", stem + ".npy")
+
+    expected = numpy.load(expected_path)
+    return (data_type_map[suffix](data_path), expected)
diff --git a/tests/mocks/DataStub.py b/tests/mocks/DataStub.py
@@ -0,0 +1,31 @@
+from typing import Iterable, Optional
+from RIAssigner.data.Data import Data
+
+
+class DataStub(Data):
+    """ Stand-in for 'Data' that holds retention times and indices passed in directly. """
+
+    def __init__(self, retention_times: Iterable[float], retention_indices: Iterable[float]):
+        # The base class initializer is not called here; the values are only stored.
+        self._retention_indices = retention_indices
+        self._retention_times = retention_times
+
+    @property
+    def filename(self):
+        """ Fixed placeholder name, as the stub is not backed by a file. """
+        return "mock"
+
+    def read(self, filename):
+        """ Do nothing. """
+        pass
+
+    def write(self, filename):
+        """ Do nothing. """
+        pass
+
+    @property
+    def retention_times(self) -> Iterable[Optional[float]]:
+        """ Retention times as passed to the constructor. """
+        return self._retention_times
+
+    @property
+    def retention_indices(self) -> Iterable[Optional[float]]:
+        """ Retention indices as passed to the constructor or assigned afterwards. """
+        return self._retention_indices
+
+    @retention_indices.setter
+    def retention_indices(self, value: Iterable[float]):
+        self._retention_indices = value
diff --git a/tests/test_compute_Kovats.py b/tests/test_compute_Kovats.py
@@ -0,0 +1,76 @@
+import numpy
+import pytest
+from .mocks.DataStub import DataStub
+from .fixtures.data import reference_alkanes, queries
+from RIAssigner.compute import Kovats
+
+
+@pytest.fixture
+def indexed_data():
+    """ Reference stub: six retention times with retention indices 700 to 1200. """
+    return DataStub(
+        retention_times=[3.5, 4.68, 5.12, 7.31, 9.01, 9.08],
+        retention_indices=[700, 800, 900, 1000, 1100, 1200])
+
+
+@pytest.fixture
+def non_indexed_data():
+    """ Query stub: eight retention times and no retention indices. """
+    return DataStub(
+        retention_times=[3.99, 4.21, 4.32, 5.83, 6.55, 7.02, 8.65, 9.05],
+        retention_indices=[])
+
+
+@pytest.fixture
+def invalid_rt_data():
+    """ Query stub: two negative and one missing retention time, followed by one usable value. """
+    return DataStub(
+        retention_times=[-1.0, -0.1, None, 3.99],
+        retention_indices=[])
+
+
+def test_construct():
+    """ A 'Kovats' instance can be created without arguments. """
+    assert Kovats() is not None
+
+
+def test_exception_reference_none(non_indexed_data):
+    """ Passing 'None' as reference raises an 'AssertionError' with the expected message. """
+    with pytest.raises(AssertionError) as raised:
+        Kovats().compute(non_indexed_data, None)
+
+    reported = raised.value.args[0]
+    assert raised.typename == "AssertionError"
+    assert reported == "Reference data is 'None'."
+
+
+def test_exception_query_none(indexed_data):
+    """ Passing 'None' as query raises an 'AssertionError' with the expected message. """
+    with pytest.raises(AssertionError) as raised:
+        Kovats().compute(None, indexed_data)
+
+    reported = raised.value.args[0]
+    assert raised.typename == "AssertionError"
+    assert reported == "Query data is 'None'."
+
+
+def test_compute_ri_basic_case(non_indexed_data, indexed_data):
+    """ Retention indices computed for the query stub match the expected values. """
+    expected = [741.525424,  760.169492,  769.491525,  932.420091,  965.296804,
+                986.757991, 1078.823529, 1157.142857]
+
+    actual = Kovats().compute(non_indexed_data, indexed_data)
+
+    numpy.testing.assert_array_almost_equal(actual, expected)
+
+
+def test_invalid_rt_has_none_ri(invalid_rt_data, indexed_data):
+    """ Negative or missing retention times yield 'None' instead of a retention index. """
+    expected = [None, None, None, 741.5254237288136]
+
+    actual = Kovats().compute(invalid_rt_data, indexed_data)
+
+    numpy.testing.assert_array_equal(actual, expected)
+
+
+def test_ref_queries(reference_alkanes, queries):
+    """ Retention indices computed for each query file match the stored expected values. """
+    query_data, expected = queries
+
+    actual = Kovats().compute(query_data, reference_alkanes)
+
+    numpy.testing.assert_array_almost_equal(actual, expected)
diff --git a/tests/test_data_MatchMSData.py b/tests/test_data_MatchMSData.py
@@ -13,8 +13,9 @@
 
 @pytest.fixture(params=[
     "recetox_gc-ei_ms_20201028.msp",
-    "MSMS-Neg-Vaniya-Fiehn_Natural_Products_Library_20200109.msp",
-    "MSMS-Neg-PFAS_20200806.msp",
+    # Currently excluded due to having None RT values
+    # "MSMS-Neg-Vaniya-Fiehn_Natural_Products_Library_20200109.msp",
+    # "MSMS-Neg-PFAS_20200806.msp",
     "PFAS_added_rt.msp"])
 def filename_msp(request):
     return os.path.join(testdata_dir, request.param)