Fix possible double writing of the uploaded client files in Upload-Multiple requests (#6952)

Fixes #6878 

In the case of big files (>2.5 MB by default), the uploaded files could be
write-appended twice, leading to bigger raw file sizes than expected. This PR
fixes the behavior by skipping the repeated writes where they were not supposed
to happen (a minimal sketch of the failure mode follows the list below).

- Fixed double append-writing of the uploaded files when Upload-Multiple
  requests are used
- Fixed potential DB/disk inconsistencies in the case of upload errors
- Added tests
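
The sketch below is a hypothetical, self-contained illustration of the reported failure mode, not CVAT's upload code: a chunked upload path appends the payload chunk by chunk, and a second code path appends the same payload again, doubling the raw file size.

    # Hypothetical illustration of the double-append bug (not CVAT's code)
    import os
    import tempfile

    payload = os.urandom(3 * 2**20)  # stands in for a "big" client file (> 2.5 MB)
    chunks = [payload[i:i + 2**20] for i in range(0, len(payload), 2**20)]

    with tempfile.NamedTemporaryFile(delete=False) as f:
        for chunk in chunks:   # intended path: each uploaded chunk is appended once
            f.write(chunk)
        f.write(payload)       # buggy extra pass: the whole payload is appended again

    assert os.path.getsize(f.name) == 2 * len(payload)  # raw size is doubled, as in #6878
    os.unlink(f.name)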
zhiltsov-max authored Oct 17, 2023
1 parent 021e58c commit e8db2c3
Showing 3 changed files with 57 additions and 4 deletions.
4 changes: 4 additions & 0 deletions changelog.d/20231006_152644_fix_6878.md
@@ -0,0 +1,4 @@
+### Fixed
+
+- Potential double-sized file writes in task data uploading
+  (<https://github.com/opencv/cvat/pull/6952>)
8 changes: 5 additions & 3 deletions cvat/apps/engine/views.py
@@ -940,10 +940,11 @@ def _maybe_append_upload_info_entry(self, filename: str):
         task_data.client_files.get_or_create(file=filename)
 
     def _append_upload_info_entries(self, client_files: List[Dict[str, Any]]):
-        # batch version without optional insertion
+        # batch version of _maybe_append_upload_info_entry() without optional insertion
         task_data = cast(Data, self._object.data)
         task_data.client_files.bulk_create([
-            ClientFile(**cf, data=task_data) for cf in client_files
+            ClientFile(file=self._prepare_upload_info_entry(cf['file'].name), data=task_data)
+            for cf in client_files
         ])
 
     def _sort_uploaded_files(self, uploaded_files: List[str], ordering: List[str]) -> List[str]:
@@ -987,6 +988,7 @@ def init_tus_upload(self, request):
         return response
 
     # UploadMixin method
+    @transaction.atomic
     def append_files(self, request):
         client_files = self._get_request_client_files(request)
         if self._is_data_uploading() and client_files:
@@ -1023,7 +1025,7 @@ def upload_finished(self, request):
 
         # Append new files to the previous ones
        if uploaded_files := serializer.validated_data.get('client_files', None):
-            self._append_upload_info_entries(uploaded_files)
+            self.append_files(request)
            serializer.validated_data['client_files'] = [] # avoid file info duplication
 
        # Refresh the db value with the updated file list and other request parameters
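
The `@transaction.atomic` decorator added to `append_files` above relates to the "DB/disk inconsistencies" item in the description: if an error occurs while a batch of uploaded files is being processed, Django rolls back the `ClientFile` rows created in that request instead of leaving records for files that were never fully written. The snippet below is a standalone sketch of that pattern using plain `sqlite3` rather than the Django ORM, with made-up table and helper names, so it can run on its own; it is an illustration, not CVAT's code.

    # Standalone sketch of the "register in DB + write to disk in one transaction" pattern.
    # Uses sqlite3 so it runs without a Django project; names are illustrative only.
    import os
    import sqlite3
    import tempfile

    def register_and_write(conn, directory, name, data):
        # Insert the bookkeeping row and write the file inside one transaction,
        # so a failed disk write leaves no dangling DB record behind.
        with conn:  # sqlite3 connection: commits on success, rolls back on exception
            conn.execute("INSERT INTO client_files(name) VALUES (?)", (name,))
            with open(os.path.join(directory, name), "xb") as f:  # "x" mode fails if the file exists
                f.write(data)

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE client_files(name TEXT)")
    tmpdir = tempfile.mkdtemp()

    register_and_write(conn, tmpdir, "a.bin", b"ok")  # row and file are both created
    try:
        register_and_write(conn, tmpdir, "a.bin", b"dup")  # the disk write fails here...
    except FileExistsError:
        pass
    assert conn.execute("SELECT COUNT(*) FROM client_files").fetchone()[0] == 1  # ...and its row was rolled back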
49 changes: 48 additions & 1 deletion tests/python/rest_api/test_tasks.py
@@ -40,7 +40,7 @@
     patch_method,
     post_method,
 )
-from shared.utils.helpers import generate_image_files, generate_manifest
+from shared.utils.helpers import generate_image_file, generate_image_files, generate_manifest
 
 from .utils import (
     CollectionSimpleFilterTestBase,
@@ -820,6 +820,53 @@ def test_can_create_task_with_exif_rotated_images(self):
             # original is 480x640 with 90/-90 degrees rotation
             assert im.height == 640 and im.width == 480
 
+    def test_can_create_task_with_big_images(self):
+        # Checks for regressions about the issue
+        # https://github.com/opencv/cvat/issues/6878
+        # In the case of big files (>2.5 MB by default),
+        # uploaded files could be write-appended twice,
+        # leading to bigger raw file sizes than expected.
+
+        task_spec = {
+            "name": f"test {self._USERNAME} to create a task with big images",
+            "labels": [
+                {
+                    "name": "car",
+                }
+            ],
+        }
+
+        # We need a big file to reproduce the problem
+        image_file = generate_image_file("big_image.bmp", size=(4000, 4000), color=(100, 200, 30))
+        image_bytes = image_file.getvalue()
+        file_size = len(image_bytes)
+        assert 10 * 2**20 < file_size
+
+        task_data = {
+            "client_files": [image_file],
+            "image_quality": 70,
+            "use_cache": False,
+            "use_zip_chunks": True,
+        }
+
+        task_id, _ = create_task(self._USERNAME, task_spec, task_data)
+
+        # check that the original chunk image have the original size
+        # this is less accurate than checking the uploaded file directly, but faster
+        with make_api_client(self._USERNAME) as api_client:
+            _, response = api_client.tasks_api.retrieve_data(
+                task_id, number=0, quality="original", type="chunk", _parse_response=False
+            )
+            chunk_file = io.BytesIO(response.data)
+
+        with zipfile.ZipFile(chunk_file) as chunk_zip:
+            infos = chunk_zip.infolist()
+            assert len(infos) == 1
+            assert infos[0].file_size == file_size
+
+            chunk_image = chunk_zip.read(infos[0])
+            assert chunk_image == image_bytes
+
     @pytest.mark.skip(reason="need to wait new Pillow release till 15 October 2023")
     def test_can_create_task_with_exif_rotated_tif_image(self):
         task_spec = {
