Merge pull request #342 from uhh-lt/duplicate-finder
Duplicate finder
bigabig authored Jan 29, 2024
2 parents e12bf80 + 46731a6 commit 6412f84
Showing 10 changed files with 770 additions and 162 deletions.
2 changes: 2 additions & 0 deletions backend/environment.yml
@@ -39,5 +39,7 @@ dependencies:
- httpx=0.23.0
- anyio=3.7.1
- debugpy=1.8.0
- networkx=3.2.1
- scikit-learn=1.3.2
- pip:
- -r requirements.txt
19 changes: 19 additions & 0 deletions backend/src/api/endpoints/project.py
@@ -10,6 +10,7 @@
)
from api.util import get_object_memo_for_user, get_object_memos
from api.validation import Validate
from app.core.analysis.duplicate_finder_service import DuplicateFinderService
from app.core.authorization.authz_user import AuthzUser
from app.core.data.crud.action import crud_action
from app.core.data.crud.code import crud_code
@@ -559,3 +560,21 @@ def get_all_metadata(
db_objs = crud_project_meta.read_by_project(db=db, proj_id=proj_id)
metadata = [ProjectMetadataRead.model_validate(meta) for meta in db_objs]
return metadata


@router.post(
"/{proj_id}/find_duplicate_text_sdocs",
response_model=List[List[int]],
summary="Returns groups of duplicate sdoc ids.",
)
def find_duplicate_text_sdocs(
*,
db: Session = Depends(get_db_session),
proj_id: int,
max_different_words: int,
authz_user: AuthzUser = Depends(),
) -> List[List[int]]:
authz_user.assert_in_project(proj_id)
return DuplicateFinderService().find_duplicate_text_sdocs(
project_id=proj_id, max_different_words=max_different_words
)
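For reference, a minimal sketch of calling the new route from Python with httpx (already pinned in environment.yml). The base URL, the "/project" router prefix, and the bearer-token header are assumptions, not taken from this PR; only the path segment and the parameter names come from the endpoint above. Because max_different_words is a plain scalar on a POST route, FastAPI exposes it as a query parameter.

import httpx

BASE_URL = "http://localhost:5500"  # assumed; adjust to your deployment
HEADERS = {"Authorization": "Bearer <token>"}  # assumed auth scheme

# proj_id goes in the path, max_different_words is sent as a query parameter
response = httpx.post(
    f"{BASE_URL}/project/1/find_duplicate_text_sdocs",  # "/project" prefix is an assumption
    params={"max_different_words": 5},
    headers=HEADERS,
)
response.raise_for_status()
duplicate_groups = response.json()  # e.g. [[4, 17], [8, 9, 21]]
for group in duplicate_groups:
    print("duplicate group:", group)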
94 changes: 94 additions & 0 deletions backend/src/app/core/analysis/duplicate_finder_service.py
@@ -0,0 +1,94 @@
from typing import List

import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

from app.core.data.doc_type import DocType
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.word_frequency import WordFrequencyORM
from app.core.db.sql_service import SQLService
from app.util.singleton_meta import SingletonMeta


class DuplicateFinderService(metaclass=SingletonMeta):
def __new__(cls, *args, **kwargs):
cls.sqls = SQLService()
return super(DuplicateFinderService, cls).__new__(cls)

def find_duplicate_text_sdocs(
self, project_id: int, max_different_words: int
) -> List[List[int]]:
with self.sqls.db_session() as db:
result = (
db.query(WordFrequencyORM)
.join(WordFrequencyORM.source_document)
.filter(
SourceDocumentORM.project_id == project_id,
SourceDocumentORM.doctype == DocType.text,
)
.all()
)

# unique words in project
words = list(set([r.word for r in result]))
words.sort()
word2idx = {w: i for i, w in enumerate(words)}

# process result to map
sdoc_id2word_id2word_freq = {}
for wf in result:
if wf.sdoc_id not in sdoc_id2word_id2word_freq:
sdoc_id2word_id2word_freq[wf.sdoc_id] = {}
sdoc_id2word_id2word_freq[wf.sdoc_id][word2idx[wf.word]] = wf.count

# create document vectors
document_vectors = []
idx2sdoc_id = {}
for idx, sdoc_id in enumerate(sdoc_id2word_id2word_freq.keys()):
word_id2_word_freq = sdoc_id2word_id2word_freq[sdoc_id]
sdoc_vector = [
word_id2_word_freq[word_id] if word_id in word_id2_word_freq else 0
for word_id in range(len(words))
]
idx2sdoc_id[idx] = sdoc_id
document_vectors.append(sdoc_vector)
document_vectors = np.array(document_vectors)

# compute distances
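# the manhattan (L1) distance between raw count vectors is the total number of
# word occurrences by which two documents differ, which is what
# max_different_words thresholds below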
word_dists = manhattan_distances(document_vectors, document_vectors)

# mask out self distances and one half of the matrix
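# result: entries strictly below the diagonal keep their distance, the diagonal
# and upper triangle become -1, so each unordered pair is compared exactly once
# and self-comparisons are excluded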
minuses = np.ones_like(word_dists) * -1
zeroed_minuses = np.triu(minuses, k=0)
zeroed_word_dists = np.tril(word_dists, k=-1)
masked_word_dists = zeroed_word_dists + zeroed_minuses

# find duplicates
duplicate_pairs = np.transpose(
np.where(
(masked_word_dists <= max_different_words) & (masked_word_dists >= 0)
)
).tolist()

# map back to sdoc_ids
duplicate_sdoc_id_pairs = [
(idx2sdoc_id[pair[0]], idx2sdoc_id[pair[1]]) for pair in duplicate_pairs
]

# we now create a graph with sdocs as nodes and edges between duplicates
# we will use this graph to identify connected components, each subgraph is a group of duplicates
duplicate_sdoc_ids = list(
set(
[pair[0] for pair in duplicate_sdoc_id_pairs]
+ [pair[1] for pair in duplicate_sdoc_id_pairs]
)
)
G = nx.Graph()
G.add_nodes_from(duplicate_sdoc_ids)
G.add_edges_from(duplicate_sdoc_id_pairs)
G.to_undirected()
subgraphs = list(nx.connected_components(G))
subgraph_nodes = [list(subgraph) for subgraph in subgraphs]

return subgraph_nodes
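To make the service's approach concrete, here is a small self-contained sketch of the same idea on toy data (hypothetical document ids and word counts, not tied to the ORM models above): bag-of-words count vectors, pairwise manhattan distances, the lower-triangle mask, and connected components over the resulting duplicate pairs.

import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

# three toy "documents" as word-count vectors over the vocabulary [a, b, c]
doc_ids = [101, 102, 103]
vectors = np.array(
    [
        [2, 1, 0],  # doc 101
        [2, 1, 1],  # doc 102: differs from 101 by one word occurrence
        [0, 5, 3],  # doc 103: clearly different
    ]
)
max_different_words = 1

dists = manhattan_distances(vectors, vectors)

# keep only the strict lower triangle, mark everything else as -1
masked = np.tril(dists, k=-1) + np.triu(np.ones_like(dists) * -1, k=0)
pairs = np.transpose(np.where((masked >= 0) & (masked <= max_different_words)))

# connected components group duplicates into clusters
G = nx.Graph()
G.add_edges_from((doc_ids[i], doc_ids[j]) for i, j in pairs)
print([sorted(group) for group in nx.connected_components(G)])  # -> [[101, 102]]

Because grouping goes through connected components, near-duplicates cluster transitively: if A is within the threshold of B and B is within the threshold of C, all three land in one group even if A and C differ by more than max_different_words.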