-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Make Gluon download function to be atomic #12572
Changes from 7 commits
b7fc61d
657a921
353221a
32ff8cd
7ded536
b4c6b1f
901ed34
a9d40c4
91f9ca9
9ab6341
1985726
429684a
df2a9ef
602b7d0
3bed37c
f0b8c5d
47f65c2
85f9408
5a01234
8f58746
b4235f0
f0776b9
c7756df
971ecf9
1531594
4dec67c
3e799de
af5b8ec
25e21b6
f690c69
0212e5a
6af8945
a091a97
6765862
f1359be
a7b7b12
e360853
3662d38
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,9 @@ | |
'check_sha1', 'download'] | ||
|
||
import os | ||
import sys | ||
import hashlib | ||
import uuid | ||
import warnings | ||
import collections | ||
import weakref | ||
|
@@ -195,6 +197,42 @@ def check_sha1(filename, sha1_hash): | |
return sha1.hexdigest() == sha1_hash | ||
|
||
|
||
if not sys.platform.startswith('win32'): | ||
# refer to https://github.com/untitaker/python-atomicwrites | ||
def _replace_atomic(src, dst): | ||
"""Implement atomic os.replace with linux and OSX. Internal use only""" | ||
try: | ||
os.rename(src, dst) | ||
except OSError: | ||
raise OSError('os.rename failed. Might be wrong source path.') | ||
else: | ||
from ctypes import windll, WinError | ||
|
||
_MOVEFILE_REPLACE_EXISTING = 0x1 | ||
_MOVEFILE_WRITE_THROUGH = 0x8 | ||
_windows_default_flags = _MOVEFILE_WRITE_THROUGH | ||
|
||
text_type = unicode if sys.version_info[0] == 2 else str # noqa | ||
|
||
def _path_to_unicode(x): | ||
"""Handle text decoding. Internal use only""" | ||
if not isinstance(x, text_type): | ||
return x.decode(sys.getfilesystemencoding()) | ||
return x | ||
|
||
def _handle_errors(rv): | ||
"""Handle WinError. Internal use only""" | ||
if not rv: | ||
raise WinError() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems not informative. what could be the causes? what should users do when this happens? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch! will add more comments in docstring |
||
|
||
def _replace_atomic(src, dst): | ||
"""Implement atomic os.replace with windows. Internal use only""" | ||
_handle_errors(windll.kernel32.MoveFileExW( | ||
_path_to_unicode(src), _path_to_unicode(dst), | ||
_windows_default_flags | _MOVEFILE_REPLACE_EXISTING | ||
)) | ||
|
||
|
||
def download(url, path=None, overwrite=False, sha1_hash=None, retries=5, verify_ssl=True): | ||
"""Download an given URL | ||
|
||
|
@@ -231,7 +269,7 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5, verify_ | |
fname = os.path.join(path, url.split('/')[-1]) | ||
else: | ||
fname = path | ||
assert retries >= 0, "Number of retries should be at least 0" | ||
assert retries >= 0, 'Number of retries should be at least 0' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. include the value of retries in the error message. |
||
|
||
if not verify_ssl: | ||
warnings.warn( | ||
|
@@ -242,23 +280,35 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5, verify_ | |
dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname))) | ||
if not os.path.exists(dirname): | ||
os.makedirs(dirname) | ||
while retries+1 > 0: | ||
while retries + 1 > 0: | ||
# Disable pyling too broad Exception | ||
# pylint: disable=W0703 | ||
try: | ||
print('Downloading %s from %s...'%(fname, url)) | ||
print('Downloading {} from {}...'.format(fname, url)) | ||
r = requests.get(url, stream=True, verify=verify_ssl) | ||
if r.status_code != 200: | ||
raise RuntimeError("Failed downloading url %s"%url) | ||
with open(fname, 'wb') as f: | ||
raise RuntimeError('Failed downloading url {}'.format(url)) | ||
# create uuid for temporary files | ||
random_uuid = str(uuid.uuid4()) | ||
with open('{}.{}'.format(fname, random_uuid), 'wb') as f: | ||
for chunk in r.iter_content(chunk_size=1024): | ||
if chunk: # filter out keep-alive new chunks | ||
f.write(chunk) | ||
# if the target file exists(created by other processes), | ||
# delete the temporary file | ||
if os.path.exists(fname): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. check for file hash too, as the existing file might be an aborted incomplete download. |
||
os.remove('{}.{}'.format(fname, random_uuid)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here if It might also be better to first check for the presence of local file and then make the HTTP call in line 285. This way we will prevent making unnecessary calls, instead of actually downloading the file and then deleting it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will add UserWarning and I checked the target file existence before starting downloading at line 280 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so if the check exists there in line 280. why are we repeating it here. What purpose does this check serve? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Under multiprocessing environment, two processes might pass the check and one of them finish the download, rename. while the other one is still downloading or right before rename. In such case, I choose to remove instead of rename because there is a rare case where rename might be not atomic in windows based on some research on MSDN. it uses remove to mitigate the risk There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be protected within try..except? |
||
warnings.warn( | ||
'File {} exists in file system so the downloaded file is deleted'.format(fname)) | ||
else: | ||
# atmoic operation in the same file system | ||
_replace_atomic('{}.{}'.format(fname, random_uuid), fname) | ||
if sha1_hash and not check_sha1(fname, sha1_hash): | ||
raise UserWarning('File {} is downloaded but the content hash does not match.'\ | ||
' The repo may be outdated or download may be incomplete. '\ | ||
'If the "repo_url" is overridden, consider switching to '\ | ||
'the default repo.'.format(fname)) | ||
raise UserWarning( | ||
'File {} is downloaded but the content hash does not match.' | ||
' The repo may be outdated or download may be incomplete. ' | ||
'If the "repo_url" is overridden, consider switching to ' | ||
'the default repo.'.format(fname)) | ||
break | ||
except Exception as e: | ||
retries -= 1 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Error message should be more user friendly. Something like - "Moving downloaded temp file - %s, to %s failed. Please retry the download.".format(src, dst) .
Or something similar.