Fix possible double writing of the uploaded client files in Upload-Multiple requests (#6952)

Fixes #6878 

In the case of big files (>2.5 MB by default), the uploaded files could be
write-appended twice, leading to bigger raw file sizes than expected. This PR
fixes the behavior by skipping the repeated writes where they were not supposed
to happen (a minimal sketch of the failure mode follows the list below).

- Fixed double append-writing of the uploaded files when Upload-Multiple
  requests are used
- Fixed potential DB/disk inconsistencies in the case of upload errors
- Added tests
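
The sketch below is a hypothetical, self-contained illustration of the reported failure mode, not CVAT's upload code: a chunked upload path appends the payload chunk by chunk, and a second code path appends the same payload again, doubling the raw file size.

    # Hypothetical illustration of the double-append bug (not CVAT's code)
    import os
    import tempfile

    payload = os.urandom(3 * 2**20)  # stands in for a "big" client file (> 2.5 MB)
    chunks = [payload[i:i + 2**20] for i in range(0, len(payload), 2**20)]

    with tempfile.NamedTemporaryFile(delete=False) as f:
        for chunk in chunks:   # intended path: each uploaded chunk is appended once
            f.write(chunk)
        f.write(payload)       # buggy extra pass: the whole payload is appended again

    assert os.path.getsize(f.name) == 2 * len(payload)  # raw size is doubled, as in #6878
    os.unlink(f.name)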
zhiltsov-max authored Oct 17, 2023
1 parent 021e58c commit e8db2c3
Showing 3 changed files with 57 additions and 4 deletions.
4 changes: 4 additions & 0 deletions changelog.d/20231006_152644_fix_6878.md
@@ -0,0 +1,4 @@
+### Fixed
+
+- Potential double-sized file writes in task data uploading
+  (<https://github.com/opencv/cvat/pull/6952>)
8 changes: 5 additions & 3 deletions cvat/apps/engine/views.py
@@ -940,10 +940,11 @@ def _maybe_append_upload_info_entry(self, filename: str):
         task_data.client_files.get_or_create(file=filename)
 
     def _append_upload_info_entries(self, client_files: List[Dict[str, Any]]):
-        # batch version without optional insertion
+        # batch version of _maybe_append_upload_info_entry() without optional insertion
         task_data = cast(Data, self._object.data)
         task_data.client_files.bulk_create([
-            ClientFile(**cf, data=task_data) for cf in client_files
+            ClientFile(file=self._prepare_upload_info_entry(cf['file'].name), data=task_data)
+            for cf in client_files
         ])
 
     def _sort_uploaded_files(self, uploaded_files: List[str], ordering: List[str]) -> List[str]:
@@ -987,6 +988,7 @@ def init_tus_upload(self, request):
         return response
 
     # UploadMixin method
+    @transaction.atomic
     def append_files(self, request):
         client_files = self._get_request_client_files(request)
         if self._is_data_uploading() and client_files:
@@ -1023,7 +1025,7 @@ def upload_finished(self, request):
 
         # Append new files to the previous ones
        if uploaded_files := serializer.validated_data.get('client_files', None):
-            self._append_upload_info_entries(uploaded_files)
+            self.append_files(request)
            serializer.validated_data['client_files'] = [] # avoid file info duplication
 
        # Refresh the db value with the updated file list and other request parameters
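
The `@transaction.atomic` decorator added to `append_files` above relates to the "DB/disk inconsistencies" item in the description: if an error occurs while a batch of uploaded files is being processed, Django rolls back the `ClientFile` rows created in that request instead of leaving records for files that were never fully written. The snippet below is a standalone sketch of that pattern using plain `sqlite3` rather than the Django ORM, with made-up table and helper names, so it can run on its own; it is an illustration, not CVAT's code.

    # Standalone sketch of the "register in DB + write to disk in one transaction" pattern.
    # Uses sqlite3 so it runs without a Django project; names are illustrative only.
    import os
    import sqlite3
    import tempfile

    def register_and_write(conn, directory, name, data):
        # Insert the bookkeeping row and write the file inside one transaction,
        # so a failed disk write leaves no dangling DB record behind.
        with conn:  # sqlite3 connection: commits on success, rolls back on exception
            conn.execute("INSERT INTO client_files(name) VALUES (?)", (name,))
            with open(os.path.join(directory, name), "xb") as f:  # "x" mode fails if the file exists
                f.write(data)

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE client_files(name TEXT)")
    tmpdir = tempfile.mkdtemp()

    register_and_write(conn, tmpdir, "a.bin", b"ok")  # row and file are both created
    try:
        register_and_write(conn, tmpdir, "a.bin", b"dup")  # the disk write fails here...
    except FileExistsError:
        pass
    assert conn.execute("SELECT COUNT(*) FROM client_files").fetchone()[0] == 1  # ...and its row was rolled back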
49 changes: 48 additions & 1 deletion tests/python/rest_api/test_tasks.py
@@ -40,7 +40,7 @@
     patch_method,
     post_method,
 )
-from shared.utils.helpers import generate_image_files, generate_manifest
+from shared.utils.helpers import generate_image_file, generate_image_files, generate_manifest
 
 from .utils import (
     CollectionSimpleFilterTestBase,
@@ -820,6 +820,53 @@ def test_can_create_task_with_exif_rotated_images(self):
             # original is 480x640 with 90/-90 degrees rotation
             assert im.height == 640 and im.width == 480
 
+    def test_can_create_task_with_big_images(self):
+        # Checks for regressions about the issue
+        # https://github.com/opencv/cvat/issues/6878
+        # In the case of big files (>2.5 MB by default),
+        # uploaded files could be write-appended twice,
+        # leading to bigger raw file sizes than expected.
+
+        task_spec = {
+            "name": f"test {self._USERNAME} to create a task with big images",
+            "labels": [
+                {
+                    "name": "car",
+                }
+            ],
+        }
+
+        # We need a big file to reproduce the problem
+        image_file = generate_image_file("big_image.bmp", size=(4000, 4000), color=(100, 200, 30))
+        image_bytes = image_file.getvalue()
+        file_size = len(image_bytes)
+        assert 10 * 2**20 < file_size
+
+        task_data = {
+            "client_files": [image_file],
+            "image_quality": 70,
+            "use_cache": False,
+            "use_zip_chunks": True,
+        }
+
+        task_id, _ = create_task(self._USERNAME, task_spec, task_data)
+
+        # check that the original chunk image have the original size
+        # this is less accurate than checking the uploaded file directly, but faster
+        with make_api_client(self._USERNAME) as api_client:
+            _, response = api_client.tasks_api.retrieve_data(
+                task_id, number=0, quality="original", type="chunk", _parse_response=False
+            )
+            chunk_file = io.BytesIO(response.data)
+
+        with zipfile.ZipFile(chunk_file) as chunk_zip:
+            infos = chunk_zip.infolist()
+            assert len(infos) == 1
+            assert infos[0].file_size == file_size
+
+            chunk_image = chunk_zip.read(infos[0])
+            assert chunk_image == image_bytes
+
     @pytest.mark.skip(reason="need to wait new Pillow release till 15 October 2023")
     def test_can_create_task_with_exif_rotated_tif_image(self):
         task_spec = {
