diff -Nru biomaj3-download-3.1.0/bin/biomaj_download_consumer.py biomaj3-download-3.2.4/bin/biomaj_download_consumer.py --- biomaj3-download-3.1.0/bin/biomaj_download_consumer.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/bin/biomaj_download_consumer.py 2020-12-23 05:49:33.000000000 +0000 @@ -18,7 +18,7 @@ config = None with open(config_file, 'r') as ymlfile: - config = yaml.load(ymlfile) + config = yaml.load(ymlfile, Loader=yaml.FullLoader) Utils.service_config_override(config) diff -Nru biomaj3-download-3.1.0/biomaj_download/biomaj_download_web.py biomaj3-download-3.2.4/biomaj_download/biomaj_download_web.py --- biomaj3-download-3.1.0/biomaj_download/biomaj_download_web.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/biomaj_download/biomaj_download_web.py 2020-12-23 05:49:33.000000000 +0000 @@ -34,7 +34,7 @@ config = None with open(config_file, 'r') as ymlfile: - config = yaml.load(ymlfile) + config = yaml.load(ymlfile, Loader=yaml.FullLoader) Utils.service_config_override(config) diff -Nru biomaj3-download-3.1.0/biomaj_download/download/curl.py biomaj3-download-3.2.4/biomaj_download/download/curl.py --- biomaj3-download-3.1.0/biomaj_download/download/curl.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/biomaj_download/download/curl.py 2020-12-23 05:49:33.000000000 +0000 @@ -1,10 +1,10 @@ -import sys -import os import re from datetime import datetime import hashlib import time import stat +from urllib.parse import urlencode +from io import BytesIO import pycurl import ftputil @@ -12,59 +12,9 @@ import humanfriendly from biomaj_core.utils import Utils -from biomaj_download.download.interface import DownloadInterface - -if sys.version_info[0] < 3: - from urllib import urlencode -else: - from urllib.parse import urlencode - -try: - from io import BytesIO -except ImportError: - from StringIO import StringIO as BytesIO - -# We use stat.filemode to convert from mode octal value to string. -# In python < 3.3, stat.filmode is not defined. -# This code is copied from the current implementation of stat.filemode. 
-if 'filemode' not in stat.__dict__: - _filemode_table = ( - ((stat.S_IFLNK, "l"), # noqa: E241 - (stat.S_IFREG, "-"), # noqa: E241 - (stat.S_IFBLK, "b"), # noqa: E241 - (stat.S_IFDIR, "d"), # noqa: E241 - (stat.S_IFCHR, "c"), # noqa: E241 - (stat.S_IFIFO, "p")), # noqa: E241 - ((stat.S_IRUSR, "r"),), # noqa: E241 - ((stat.S_IWUSR, "w"),), # noqa: E241 - ((stat.S_IXUSR | stat.S_ISUID, "s"), # noqa: E241 - (stat.S_ISUID, "S"), # noqa: E241 - (stat.S_IXUSR, "x")), # noqa: E241 - ((stat.S_IRGRP, "r"),), # noqa: E241 - ((stat.S_IWGRP, "w"),), # noqa: E241 - ((stat.S_IXGRP | stat.S_ISGID, "s"), # noqa: E241 - (stat.S_ISGID, "S"), # noqa: E241 - (stat.S_IXGRP, "x")), # noqa: E241 - ((stat.S_IROTH, "r"),), # noqa: E241 - ((stat.S_IWOTH, "w"),), # noqa: E241 - ((stat.S_IXOTH | stat.S_ISVTX, "t"), # noqa: E241 - (stat.S_ISVTX, "T"), # noqa: E241 - (stat.S_IXOTH, "x")) # noqa: E241 - ) - - def _filemode(mode): - """Convert a file's mode to a string of the form '-rwxrwxrwx'.""" - perm = [] - for table in _filemode_table: - for bit, char in table: - if mode & bit == bit: - perm.append(char) - break - else: - perm.append("-") - return "".join(perm) +from biomaj_core.config import BiomajConfig - stat.filemode = _filemode +from biomaj_download.download.interface import DownloadInterface class HTTPParse(object): @@ -113,6 +63,21 @@ ftputil.stat.MSParser(), ] + # Valid values for ftp_method options as string and int + VALID_FTP_FILEMETHOD = { + "default": pycurl.FTPMETHOD_DEFAULT, + "multicwd": pycurl.FTPMETHOD_MULTICWD, + "nocwd": pycurl.FTPMETHOD_NOCWD, + "singlecwd": pycurl.FTPMETHOD_SINGLECWD, + } + + # Valid values for ssh_new_host options as string and int + VALID_SSH_NEW_HOST = { + "reject": pycurl.KHSTAT_REJECT, + "accept": pycurl.KHSTAT_FINE, + "add": pycurl.KHSTAT_FINE_ADD_TO_FILE, + } + def __init__(self, curl_protocol, host, rootdir, http_parse=None): """ Initialize a CurlDownloader. @@ -131,6 +96,19 @@ """ DownloadInterface.__init__(self) self.logger.debug('Download') + + # Check for ssh support + curl_opts_info = pycurl.version_info() + curl_opts = [] + for opt in curl_opts_info: + if isinstance(opt, tuple): + for o in opt: + curl_opts.append(o) + else: + curl_opts.append(opt) + if 'sftp' not in curl_opts: + CurlDownload.ALL_PROTOCOLS = CurlDownload.FTP_PROTOCOL_FAMILY + CurlDownload.HTTP_PROTOCOL_FAMILY + self.logger.warning("sftp not supported by curl: %s" % str(curl_opts_info)) # Initialize curl_protocol. # Note that we don't change that field in set_protocol since this # method uses the protocol from the configuration file. It's not clear @@ -143,15 +121,15 @@ if self.curl_protocol in self.FTP_PROTOCOL_FAMILY: self.protocol_family = "ftp" self._parse_result = self._ftp_parse_result - self.ERRCODE_OK = 226 + self.ERRCODE_OK = [221, 226] elif self.curl_protocol in self.HTTP_PROTOCOL_FAMILY: self.protocol_family = "http" self._parse_result = self._http_parse_result - self.ERRCODE_OK = 200 + self.ERRCODE_OK = [200] elif self.curl_protocol in self.SFTP_PROTOCOL_FAMILY: self.protocol_family = "sftp" self._parse_result = self._ftp_parse_result - self.ERRCODE_OK = 0 + self.ERRCODE_OK = [0] else: # Should not happen since we check before raise ValueError("Unknown protocol") self.rootdir = rootdir @@ -162,7 +140,9 @@ # This object is shared by all operations to use the cache. # Before using it, call method:`_basic_curl_configuration`. 
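
The sftp capability check added in `__init__` above can be reproduced on its own. A minimal standalone sketch (outside the class) that flattens pycurl.version_info() the same way, since that tuple mixes scalar fields with a nested tuple of protocol names:

    import pycurl

    # Flatten pycurl.version_info(), which mixes scalars and a nested
    # tuple of supported protocols, then test for sftp support as the
    # downloader's __init__ does.
    curl_opts = []
    for opt in pycurl.version_info():
        if isinstance(opt, tuple):
            curl_opts.extend(opt)
        else:
            curl_opts.append(opt)

    if 'sftp' not in curl_opts:
        print('this libcurl build has no sftp support')
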
self.crl = pycurl.Curl()
+ #
+ # Initialize options
+ #
# Should we skip SSL verification (cURL -k/--insecure option)
self.ssl_verifyhost = True
self.ssl_verifypeer = True
@@ -170,11 +150,34 @@
self.ssl_server_cert = None
# Keep alive
self.tcp_keepalive = 0
+ # FTP method (cURL --ftp-method option)
+ self.ftp_method = pycurl.FTPMETHOD_DEFAULT # Use cURL default
+ # TODO: Don't store default values in BiomajConfig.DEFAULTS for
+ # ssh_hosts_file and ssh_new_host
+ # known_hosts file
+ self.ssh_hosts_file = BiomajConfig.DEFAULTS["ssh_hosts_file"]
+ # How to treat unknown host
+ self.ssh_new_host = self.VALID_SSH_NEW_HOST[BiomajConfig.DEFAULTS["ssh_new_host"]]
+ # Allow redirections
+ self.allow_redirections = True
+
+ def _accept_new_hosts(self, known_key, found_key, match):
+ # Key found in file: we can accept it
+ # Don't use KHSTAT_FINE_ADD_TO_FILE because the key would be duplicated
+ # See https://github.com/curl/curl/issues/4953.
+ if match == pycurl.KHMATCH_OK:
+ return pycurl.KHSTAT_FINE
+ # Key not found in file: use the ssh_new_host option
+ elif match == pycurl.KHMATCH_MISSING:
+ return self.ssh_new_host
+ # Key mismatch: the best option is to reject it
+ else:
+ return pycurl.KHSTAT_REJECT
- def _basic_curl_configuration(self):
+ def _network_configuration(self):
"""
Perform basic configuration (i.e. that doesn't depend on the
- operation: _download or list). This method shoulmd be called before any
+ operation: _download or list). This method should be called before any
operation.
"""
# Reset cURL options before setting them
@@ -188,11 +191,19 @@
if self.credentials is not None:
self.crl.setopt(pycurl.USERPWD, self.credentials)
+ # Hosts file & function to decide for new hosts
+ if self.curl_protocol in self.SFTP_PROTOCOL_FAMILY:
+ self.crl.setopt(pycurl.SSH_KNOWNHOSTS, self.ssh_hosts_file)
+ self.crl.setopt(pycurl.SSH_KEYFUNCTION, self._accept_new_hosts)
+
# Configure TCP keepalive
if self.tcp_keepalive:
- self.crl.setopt(pycurl.TCP_KEEPALIVE, True)
- self.crl.setopt(pycurl.TCP_KEEPIDLE, self.tcp_keepalive * 2)
- self.crl.setopt(pycurl.TCP_KEEPINTVL, self.tcp_keepalive)
+ try:
+ self.crl.setopt(pycurl.TCP_KEEPALIVE, True)
+ self.crl.setopt(pycurl.TCP_KEEPIDLE, self.tcp_keepalive * 2)
+ self.crl.setopt(pycurl.TCP_KEEPINTVL, self.tcp_keepalive)
+ except Exception as e:
+ self.logger.exception("TCP keepalive option failed: " + str(e))
# Configure SSL verification (on some platforms, disabling
# SSL_VERIFYPEER implies disabling SSL_VERIFYHOST so we set
@@ -208,6 +219,12 @@
# CURLOPT_CAPATH is for a directory of certificates.
self.crl.setopt(pycurl.CAINFO, self.ssl_server_cert)
+ # Configure ftp method
+ self.crl.setopt(pycurl.FTP_FILEMETHOD, self.ftp_method)
+
+ # Configure redirections
+ self.crl.setopt(pycurl.FOLLOWLOCATION, self.allow_redirections)
+
# Configure timeouts
self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
self.crl.setopt(pycurl.TIMEOUT, self.timeout)
@@ -248,16 +265,32 @@
super(CurlDownload, self).set_server(server)
self.url = self.curl_protocol + '://' + self.server
- def set_options(self, protocol_options):
- super(CurlDownload, self).set_options(protocol_options)
- if "ssl_verifyhost" in protocol_options:
- self.ssl_verifyhost = Utils.to_bool(protocol_options["ssl_verifyhost"])
- if "ssl_verifypeer" in protocol_options:
- self.ssl_verifypeer = Utils.to_bool(protocol_options["ssl_verifypeer"])
- if "ssl_server_cert" in protocol_options:
- self.ssl_server_cert = protocol_options["ssl_server_cert"]
- if "tcp_keepalive" in protocol_options:
- self.tcp_keepalive = Utils.to_int(protocol_options["tcp_keepalive"])
+ def set_options(self, options):
+ super(CurlDownload, self).set_options(options)
+ if "ssl_verifyhost" in options:
+ self.ssl_verifyhost = Utils.to_bool(options["ssl_verifyhost"])
+ if "ssl_verifypeer" in options:
+ self.ssl_verifypeer = Utils.to_bool(options["ssl_verifypeer"])
+ if "ssl_server_cert" in options:
+ self.ssl_server_cert = options["ssl_server_cert"]
+ if "tcp_keepalive" in options:
+ self.tcp_keepalive = Utils.to_int(options["tcp_keepalive"])
+ if "ftp_method" in options:
+ # raw_val is a string containing the method name as used in the cURL CLI.
+ # We convert it to the corresponding pycurl integer constant.
+ raw_val = options["ftp_method"].lower()
+ if raw_val not in self.VALID_FTP_FILEMETHOD:
+ raise ValueError("Invalid value for ftp_method")
+ self.ftp_method = self.VALID_FTP_FILEMETHOD[raw_val]
+ if "ssh_hosts_file" in options:
+ self.ssh_hosts_file = options["ssh_hosts_file"]
+ if "ssh_new_host" in options:
+ raw_val = options["ssh_new_host"].lower()
+ if raw_val not in self.VALID_SSH_NEW_HOST:
+ raise ValueError("Invalid value for ssh_new_host")
+ self.ssh_new_host = self.VALID_SSH_NEW_HOST[raw_val]
+ if "allow_redirections" in options:
+ self.allow_redirections = Utils.to_bool(options["allow_redirections"])
def _append_file_to_download(self, rfile):
# Add url and root to the file if needed (for safety)
@@ -270,66 +303,74 @@
def _file_url(self, rfile):
# rfile['root'] is set to self.rootdir if needed but may be different.
# We don't use os.path.join because rfile['name'] may start with /
- return self.url + '/' + rfile['root'] + rfile['name']
+ url = self.url + '/' + rfile['root'] + rfile['name']
+ url_elts = url.split('://')
+ if len(url_elts) == 2:
+ url_elts[1] = re.sub("/{2,}", "/", url_elts[1])
+ return '://'.join(url_elts)
+ return re.sub("/{2,}", "/", url)
def _download(self, file_path, rfile):
"""
+ Download one file and return False in case of success and True
+ otherwise.
+
+ This method is designed to work for FTP(S), HTTP(S) and SFTP.
"""
error = True
- nbtry = 1
# Forge URL of remote file
file_url = self._file_url(rfile)
- while(error is True and nbtry < 3):
-
- self._basic_curl_configuration()
- try:
- self.crl.setopt(pycurl.URL, file_url)
- except Exception:
- self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))
-
- # Create file and assign it to the pycurl object
- fp = open(file_path, "wb")
- self.crl.setopt(pycurl.WRITEFUNCTION, fp.write)
-
- # This is specific to HTTP
- if self.method == 'POST':
- # Form data must be provided already urlencoded.
- postfields = urlencode(self.param)
- # Sets request method to POST,
- # Content-Type header to application/x-www-form-urlencoded
- # and data to send in request body.
- self.crl.setopt(pycurl.POSTFIELDS, postfields)
+ try:
+ self.crl.setopt(pycurl.URL, file_url)
+ except Exception:
+ self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))
- # Try download
- try:
- self.crl.perform()
- errcode = self.crl.getinfo(pycurl.RESPONSE_CODE)
- if int(errcode) != self.ERRCODE_OK:
- error = True
- self.logger.error('Error while downloading ' + file_url + ' - ' + str(errcode))
- else:
- error = False
- except Exception as e:
- self.logger.error('Could not get errcode:' + str(e))
+ # Create file and assign it to the pycurl object
+ fp = open(file_path, "wb")
+ self.crl.setopt(pycurl.WRITEFUNCTION, fp.write)
+
+ # This is specific to HTTP
+ if self.method == 'POST':
+ # Form data must be provided already urlencoded.
+ postfields = urlencode(self.param)
+ # Sets request method to POST,
+ # Content-Type header to application/x-www-form-urlencoded
+ # and data to send in request body.
+ self.crl.setopt(pycurl.POSTFIELDS, postfields)
+
+ # Try download (we don't raise errors here since it's the return value
+ # ('error') that matters for the calling method; it is set to False
+ # only in case of success).
+ try:
+ self.crl.perform()
+ errcode = self.crl.getinfo(pycurl.RESPONSE_CODE)
+ if int(errcode) not in self.ERRCODE_OK:
+ error = True
+ self.logger.error('Error while downloading ' + file_url + ' - ' + str(errcode))
+ else:
+ error = False
+ except Exception as e:
+ self.logger.error('Error while downloading ' + file_url + ' - ' + str(e))
- # Close file
- fp.close()
+ # Check if we were redirected
+ if self.curl_protocol in self.HTTP_PROTOCOL_FAMILY:
+ n_redirect = self.crl.getinfo(pycurl.REDIRECT_COUNT)
+ if n_redirect:
+ real_url = self.crl.getinfo(pycurl.EFFECTIVE_URL)
+ redirect_time = self.crl.getinfo(pycurl.REDIRECT_TIME)
+ msg_fmt = 'Download was redirected to %s (%i redirection(s), took %ss)'
+ msg = msg_fmt % (real_url, n_redirect, redirect_time)
+ self.logger.info(msg)
- # Check that the archive is correct
- if not error and not self.skip_check_uncompress:
- archive_status = Utils.archive_check(file_path)
- if not archive_status:
- self.logger.error('Archive is invalid or corrupted, deleting file and retrying download')
- error = True
- if os.path.exists(file_path):
- os.remove(file_path)
+ # Close file
+ fp.close()
- # Increment retry counter
- nbtry += 1
+ if error:
+ return error
- return error
+ # Our part is done so call parent _download
+ return super(CurlDownload, self)._download(file_path, rfile)
def list(self, directory=''):
'''
@@ -340,10 +381,11 @@
This is a generic method for HTTP and FTP. The protocol-specific
parts are done in __parse_result.
''' - dir_url = self.url + self.rootdir + directory + dirbase = re.sub('//+', '/', self.rootdir + directory) + dir_url = self.url + dirbase self.logger.debug('Download:List:' + dir_url) - self._basic_curl_configuration() + self._network_configuration() try: self.crl.setopt(pycurl.URL, dir_url) @@ -357,8 +399,25 @@ # Try to list try: self.crl.perform() + errcode = self.crl.getinfo(pycurl.RESPONSE_CODE) + if int(errcode) not in self.ERRCODE_OK: + msg = 'Error while listing ' + dir_url + ' - ' + str(errcode) + self.logger.error(msg) + raise Exception(msg) except Exception as e: - self.logger.error('Could not get errcode:' + str(e)) + msg = 'Error while listing ' + dir_url + ' - ' + str(e) + self.logger.error(msg) + raise e + + # Check if we were redirected + if self.curl_protocol in self.HTTP_PROTOCOL_FAMILY: + n_redirect = self.crl.getinfo(pycurl.REDIRECT_COUNT) + if n_redirect: + real_url = self.crl.getinfo(pycurl.EFFECTIVE_URL) + redirect_time = self.crl.getinfo(pycurl.REDIRECT_TIME) + msg_fmt = 'Download was redirected to %s (%i redirection(s), took %ss)' + msg = msg_fmt % (real_url, n_redirect, redirect_time) + self.logger.info(msg) # Figure out what encoding was sent with the response, if any. # Check against lowercased header name. diff -Nru biomaj3-download-3.1.0/biomaj_download/download/direct.py biomaj3-download-3.2.4/biomaj_download/download/direct.py --- biomaj3-download-3.1.0/biomaj_download/download/direct.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/biomaj_download/download/direct.py 2020-12-23 05:49:33.000000000 +0000 @@ -1,9 +1,9 @@ """ Subclasses for direct download (i.e. downloading without regexp). The usage is a bit different: instead of calling method:`list` and method:`match`, client -code explicitely calls method:`set_files_to_download` (passing a list +code explicitly calls method:`set_files_to_download` (passing a list containing only the file name). method:`list` is used to get more information -about the file (if possile). method:`match` matches everything. +about the file (if possible). method:`match` matches everything. Also client code can use method:`set_save_as` to indicate the name of the file to save. @@ -21,21 +21,13 @@ import pycurl import re import hashlib -import sys +import os +from urllib.parse import urlencode +from io import BytesIO from biomaj_download.download.curl import CurlDownload from biomaj_core.utils import Utils -if sys.version_info[0] < 3: - from urllib import urlencode -else: - from urllib.parse import urlencode - -try: - from io import BytesIO -except ImportError: - from StringIO import StringIO as BytesIO - class DirectFTPDownload(CurlDownload): ''' @@ -44,29 +36,38 @@ ALL_PROTOCOLS = ["ftp", "ftps"] - def _append_file_to_download(self, filename): + def _append_file_to_download(self, rfile): ''' Initialize the files in list with today as last-modification date. Size is also preset to zero. 
'''
+ filename = None
+ # Workaround to handle either a file dict or a bare file name.
+ # This is dirty; we expect to handle dicts now and the biomaj
+ # workflow should fix this.
+ if isinstance(rfile, dict):
+ filename = rfile['name']
+ else:
+ # direct protocols send the filename directly
+ filename = rfile
today = datetime.date.today()
- rfile = {}
- rfile['root'] = self.rootdir
- rfile['permissions'] = ''
- rfile['group'] = ''
- rfile['user'] = ''
- rfile['size'] = 0
- rfile['month'] = today.month
- rfile['day'] = today.day
- rfile['year'] = today.year
+ new_rfile = {}
+ new_rfile['root'] = self.rootdir
+ new_rfile['permissions'] = ''
+ new_rfile['group'] = ''
+ new_rfile['user'] = ''
+ new_rfile['size'] = 0
+ new_rfile['month'] = today.month
+ new_rfile['day'] = today.day
+ new_rfile['year'] = today.year
if filename.endswith('/'):
- rfile['name'] = filename[:-1]
+ new_rfile['name'] = filename[:-1]
else:
- rfile['name'] = filename
- rfile['hash'] = None
+ new_rfile['name'] = filename
+ new_rfile['hash'] = None
# Use self.save_as even if we use it in list(). This is important.
- rfile['save_as'] = self.save_as
- super(DirectFTPDownload, self)._append_file_to_download(rfile)
+ new_rfile['save_as'] = self.save_as
+ super(DirectFTPDownload, self)._append_file_to_download(new_rfile)
def set_files_to_download(self, files_to_download):
if len(files_to_download) > 1:
@@ -76,11 +77,63 @@
raise ValueError(msg)
return super(DirectFTPDownload, self).set_files_to_download(files_to_download)
+ def _file_url(self, rfile):
+ # rfile['root'] is set to self.rootdir if needed but may be different.
+ # We don't use os.path.join because rfile['name'] may start with /
+ url = self.url + '/' + rfile['root'] + rfile['name']
+ url_elts = url.split('://')
+ url_elts[1] = re.sub("/{2,}", "/", url_elts[1])
+ return '://'.join(url_elts)
+
def list(self, directory=''):
'''
FTP protocol does not give us the possibility to get file date from remote url
'''
- # TODO: are we sure about this implementation ?
+ self._network_configuration()
+ # Specific configuration
+ # With those options, cURL will issue a sequence of commands (SIZE,
+ # MDTM) to get the file size and last modification time and then issue
+ # a REST command. This usually ends with code 350. Therefore we
+ # explicitly handle this in this method.
+ # Note that very old servers may not support the MDTM command.
+ # Therefore, cURL will raise an error (although we can probably
+ # download the file).
+ self.crl.setopt(pycurl.OPT_FILETIME, True)
+ self.crl.setopt(pycurl.NOBODY, True)
+ for rfile in self.files_to_download:
+ if self.save_as is None:
+ self.save_as = os.path.basename(rfile['name'])
+ rfile['save_as'] = self.save_as
+ file_url = self._file_url(rfile)
+ try:
+ self.crl.setopt(pycurl.URL, file_url)
+ except Exception:
+ self.crl.setopt(pycurl.URL, file_url.encode('ascii', 'ignore'))
+
+ try:
+ self.crl.perform()
+ errcode = int(self.crl.getinfo(pycurl.RESPONSE_CODE))
+ # As explained, 350 is correct. We check against ERRCODE_OK
+ # just in case.
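
Outside the downloader, the SIZE/MDTM probe that this `list()` implements can be sketched with plain pycurl (the URL below is a placeholder, not taken from the package):

    import datetime
    import pycurl

    # Ask cURL for the remote file's modification time and size without
    # downloading the body: OPT_FILETIME triggers MDTM, NOBODY skips the
    # transfer, as in the list() method above.
    crl = pycurl.Curl()
    crl.setopt(pycurl.URL, 'ftp://ftp.example.org/pub/file.txt')  # placeholder URL
    crl.setopt(pycurl.OPT_FILETIME, True)
    crl.setopt(pycurl.NOBODY, True)
    crl.perform()
    timestamp = crl.getinfo(pycurl.INFO_FILETIME)
    size = int(crl.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD))
    crl.close()
    print(datetime.datetime.fromtimestamp(timestamp), size)
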
+ if errcode != 350 and errcode not in self.ERRCODE_OK:
+ msg = 'Error while listing ' + file_url + ' - ' + str(errcode)
+ self.logger.error(msg)
+ raise Exception(msg)
+ except Exception as e:
+ msg = 'Error while listing ' + file_url + ' - ' + str(e)
+ self.logger.error(msg)
+ raise e
+
+ timestamp = self.crl.getinfo(pycurl.INFO_FILETIME)
+ dt = datetime.datetime.fromtimestamp(timestamp)
+ size_file = int(self.crl.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD))
+
+ rfile['year'] = dt.year
+ rfile['month'] = dt.month
+ rfile['day'] = dt.day
+ rfile['size'] = size_file
+ rfile['hash'] = hashlib.md5(str(timestamp).encode('utf-8')).hexdigest()
return (self.files_to_download, [])
def match(self, patterns, file_list, dir_list=None, prefix='', submatch=False):
@@ -109,8 +162,15 @@
'''
Try to get file headers to get last_modification and size
'''
- self._basic_curl_configuration()
+ self._network_configuration()
# Specific configuration
+ # With those options, cURL will issue a HEAD request. This may not be
+ # supported especially on resources that are accessed using POST. In
+ # this case, HTTP will return code 405. We explicitly handle this case
+ # in this method.
+ # Note also that in many cases, there is no Last-Modified field in
+ # headers since this is usually dynamic content (Content-Length is
+ # usually present).
self.crl.setopt(pycurl.HEADER, True)
self.crl.setopt(pycurl.NOBODY, True)
for rfile in self.files_to_download:
@@ -129,7 +189,24 @@
output = BytesIO()
self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
- self.crl.perform()
+ try:
+ self.crl.perform()
+ errcode = int(self.crl.getinfo(pycurl.RESPONSE_CODE))
+ if errcode == 405:
+ # HEAD not supported by the server for this URL so we can
+ # skip the rest of the loop (we won't have metadata about
+ # the file but biomaj should be fine).
+ msg = 'Listing ' + file_url + ' not supported. This is fine, continuing.'
+ self.logger.info(msg)
+ continue
+ elif errcode not in self.ERRCODE_OK:
+ msg = 'Error while listing ' + file_url + ' - ' + str(errcode)
+ self.logger.error(msg)
+ raise Exception(msg)
+ except Exception as e:
+ msg = 'Error while listing ' + file_url + ' - ' + str(e)
+ self.logger.error(msg)
+ raise e
# Figure out what encoding was sent with the response, if any.
# Check against lowercased header name.
diff -Nru biomaj3-download-3.1.0/biomaj_download/download/interface.py biomaj3-download-3.2.4/biomaj_download/download/interface.py
--- biomaj3-download-3.1.0/biomaj_download/download/interface.py 2019-10-21 06:47:18.000000000 +0000
+++ biomaj3-download-3.2.4/biomaj_download/download/interface.py 2020-12-23 05:49:33.000000000 +0000
@@ -3,8 +3,13 @@
import datetime
import time
import re
+import copy
+
+import tenacity
+from simpleeval import simple_eval, ast
from biomaj_core.utils import Utils
+from biomaj_core.config import BiomajConfig
class _FakeLock(object):
@@ -43,6 +48,96 @@
files_num_threads = 4
+ #
+ # Constants to parse retryer
+ #
+ # Note that due to the current implementation of operators, tenacity allows
+ # nonsensical operations. For example the following snippets are valid:
+ # stop_after_attempt(1, 2) + 4
+ # stop_after_attempt(1, 2) + stop_never.
+ # Of course, trying to use those stop conditions will raise cryptic errors.
+ # The situation is similar for wait policies.
+ # See https://github.com/jd/tenacity/issues/211.
+ # To avoid such errors, we test the objects in _set_retryer.
+ #
+ # Another confusing issue is that stop_never is an object (instance of the
+ # class _stop_never).
For parsing, if we consider stop_never as a
+ # function then both "stop_never" and "stop_never()" are parsed correctly
+ # but the latter raises an error. Treating it as a name is slightly
+ # clearer (since then we must write "stop_never" as we do when we use tenacity
+ # directly). For consistency, we create a name for wait_none (as an
+ # instance of the class wait_none).
+ #
+
+ # Functions available when parsing stop condition: those are constructors
+ # of stop conditions classes (then using them will create objects). Note
+ # that there is an exception for stop_never.
+ ALL_STOP_CONDITIONS = {
+ # "stop_never": tenacity.stop._stop_never, # In case we want to use it like a function (see above)
+ "stop_when_event_set": tenacity.stop_when_event_set,
+ "stop_after_attempt": tenacity.stop_after_attempt,
+ "stop_after_delay": tenacity.stop_after_delay,
+ "stop_any": tenacity.stop_any, # Similar to |
+ "stop_all": tenacity.stop_all, # Similar to &
+ }
+
+ # tenacity.stop_never is an instance of _stop_never, not a class so we
+ # import it as a name.
+ ALL_STOP_NAMES = {
+ "stop_never": tenacity.stop_never,
+ }
+
+ # Operators for stop conditions: | means to stop if one of the conditions
+ # is True, & means to stop if all the conditions are True.
+ ALL_STOP_OPERATORS = {
+ ast.BitOr: tenacity.stop.stop_base.__or__,
+ ast.BitAnd: tenacity.stop.stop_base.__and__,
+ }
+
+ # Functions available when parsing wait policy: those are constructors
+ # of wait policies classes (then using them will create objects). Note
+ # that there is an exception for wait_none.
+ ALL_WAIT_POLICIES = {
+ # "wait_none": tenacity.wait_none, # In case we want to use it like a function (see above)
+ "wait_fixed": tenacity.wait_fixed,
+ "wait_random": tenacity.wait_random,
+ "wait_incrementing": tenacity.wait_incrementing,
+ "wait_exponential": tenacity.wait_exponential,
+ "wait_random_exponential": tenacity.wait_random_exponential,
+ "wait_combine": tenacity.wait_combine, # Sum of wait policies (similar to +)
+ "wait_chain": tenacity.wait_chain, # Give a list of wait policies (one for each attempt)
+ }
+
+ # Create an instance of wait_none to use it like a constant.
+ ALL_WAIT_NAMES = {
+ "wait_none": tenacity.wait.wait_none()
+ }
+
+ # Operators for wait policies: + means to sum waiting times of wait
+ # policies.
+ ALL_WAIT_OPERATORS = {
+ ast.Add: tenacity.wait.wait_base.__add__
+ }
+
+ @staticmethod
+ def is_true(download_error):
+ """Method used by the retryer to determine if we should retry to download a
+ file based on the return value of method:`_download` (passed as the
+ argument): we must retry while this value is True.
+
+ See method:`_set_retryer`.
+ """
+ return download_error is True
+
+ @staticmethod
+ def return_last_value(retry_state):
+ """Method used by the retryer to determine the return value of the
+ retryer: we return the result of the last attempt.
+
+ See method:`_set_retryer`.
+ """
+ return retry_state.outcome.result()
+
def __init__(self):
# This variable defines the protocol as passed by the config file (i.e.
# this is directftp for DirectFTPDownload).
It is used by the workflow
@@ -68,8 +163,15 @@
self.server = None
self.offline_dir = None
# Options
- self.protocol_options = {}
+ self.options = {}
# This field is used to forge the download message
self.skip_check_uncompress = False
+ # TODO: Don't store default values in BiomajConfig.DEFAULTS for
+ # wait_policy and stop_condition
+ # Construct default retryer (may be replaced in set_options)
+ self._set_retryer(
+ BiomajConfig.DEFAULTS["stop_condition"],
+ BiomajConfig.DEFAULTS["wait_policy"]
+ )
#
# Setters for downloader
@@ -128,16 +230,86 @@
'''
self.credentials = userpwd
- def set_options(self, protocol_options):
+ def set_options(self, options):
+ """
+ Set download options.
+
+ Subclasses that override this method must call this implementation.
"""
- Set protocol specific options.
+ # Store the options dict
+ self.options = options
+ if "skip_check_uncompress" in options:
+ self.skip_check_uncompress = Utils.to_bool(options["skip_check_uncompress"])
+ # If stop_condition or wait_policy is specified, we reconstruct the retryer
+ if "stop_condition" in options or "wait_policy" in options:
+ stop_condition = options.get("stop_condition", BiomajConfig.DEFAULTS["stop_condition"])
+ wait_policy = options.get("wait_policy", BiomajConfig.DEFAULTS["wait_policy"])
+ self._set_retryer(stop_condition, wait_policy)
- Subclasses that override this method must call the
- parent implementation.
+ def _set_retryer(self, stop_condition, wait_policy):
"""
- self.protocol_options = protocol_options
- if "skip_check_uncompress" in protocol_options:
- self.skip_check_uncompress = Utils.to_bool(protocol_options["skip_check_uncompress"])
+ Add a retryer to retry the current download if it fails.
+ """
+ # Try to construct stop condition
+ if isinstance(stop_condition, tenacity.stop.stop_base):
+ # Use the value directly
+ stop_cond = stop_condition
+ elif isinstance(stop_condition, str):
+ # Try to parse the string
+ try:
+ stop_cond = simple_eval(stop_condition,
+ functions=self.ALL_STOP_CONDITIONS,
+ operators=self.ALL_STOP_OPERATORS,
+ names=self.ALL_STOP_NAMES)
+ # Check that it is an instance of stop_base
+ if not isinstance(stop_cond, tenacity.stop.stop_base):
+ raise ValueError(stop_condition + " doesn't yield a stop condition")
+ # Test that this is a correct stop condition by calling it.
+ # We use a deepcopy to be sure to not alter the object (even
+ # if it seems that calling a stop condition doesn't modify it).
+ try:
+ s = copy.deepcopy(stop_cond)
+ s(tenacity.compat.make_retry_state(0, 0))
+ except Exception:
+ raise ValueError(stop_condition + " doesn't yield a stop condition")
+ except Exception as e:
+ raise ValueError("Error while parsing stop condition: %s" % e)
+ else:
+ raise TypeError("Expected tenacity.stop.stop_base or string, got %s" % type(stop_condition))
+ # Try to construct wait policy
+ if isinstance(wait_policy, tenacity.wait.wait_base):
+ # Use the value directly
+ wait_pol = wait_policy
+ elif isinstance(wait_policy, str):
+ # Try to parse the string
+ try:
+ wait_pol = simple_eval(wait_policy,
+ functions=self.ALL_WAIT_POLICIES,
+ operators=self.ALL_WAIT_OPERATORS,
+ names=self.ALL_WAIT_NAMES)
+ # Check that it is an instance of wait_base
+ if not isinstance(wait_pol, tenacity.wait.wait_base):
+ raise ValueError(wait_policy + " doesn't yield a wait policy")
+ # Test that this is a correct wait policy by calling it.
+ # We use a deepcopy to be sure to not alter the object (even
+ # if it seems that calling a wait policy doesn't modify it).
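
The string-parsing branch above can be exercised in isolation; a minimal sketch using a subset of the tables defined earlier (the expression is an example, not a shipped default):

    import tenacity
    from simpleeval import simple_eval, ast

    # Parse a stop-condition string the way _set_retryer does: constructors
    # are exposed as functions, stop_never as a name, and | maps to
    # stop_base.__or__ (i.e. stop when either condition is met).
    stop_cond = simple_eval(
        "stop_after_attempt(5) | stop_after_delay(60)",
        functions={
            "stop_after_attempt": tenacity.stop_after_attempt,
            "stop_after_delay": tenacity.stop_after_delay,
        },
        operators={ast.BitOr: tenacity.stop.stop_base.__or__},
        names={"stop_never": tenacity.stop_never},
    )
    assert isinstance(stop_cond, tenacity.stop.stop_base)
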
+ try:
+ w = copy.deepcopy(wait_pol)
+ w(tenacity.compat.make_retry_state(0, 0))
+ except Exception:
+ raise ValueError(wait_policy + " doesn't yield a wait policy")
+ except Exception as e:
+ raise ValueError("Error while parsing wait policy: %s" % e)
+ else:
+ raise TypeError("Expected tenacity.wait.wait_base or string, got %s" % type(wait_policy))
+
+ self.retryer = tenacity.Retrying(
+ stop=stop_cond,
+ wait=wait_pol,
+ retry_error_callback=self.return_last_value,
+ retry=tenacity.retry_if_result(self.is_true),
+ reraise=True
+ )
#
# File operations (match, list, download) and associated hook methods
@@ -157,6 +329,8 @@
if self.param:
if 'param' not in rfile or not rfile['param']:
rfile['param'] = self.param
+ # Remove duplicate '/' (due to regexp parsing) if any
+ rfile['name'] = re.sub('//+', '/', rfile['name'])
self.files_to_download.append(rfile)
def set_files_to_download(self, files):
@@ -183,7 +357,6 @@
:type submatch: bool
'''
self.logger.debug('Download:File:RegExp:' + str(patterns))
-
if dir_list is None:
dir_list = []
@@ -232,6 +405,7 @@
rfile['name'] = prefix + '/' + rfile['name']
self._append_file_to_download(rfile)
self.logger.debug('Download:File:MatchRegExp:' + rfile['name'])
+
if not submatch and len(self.files_to_download) == 0:
raise Exception('no file found matching expressions')
@@ -280,11 +454,13 @@
for dfile in self.files_to_download:
if index < len(new_or_modified_files) and \
dfile['name'] == new_or_modified_files[index][0]:
-
new_files_to_download.append(dfile)
index += 1
else:
- if not check_exists or os.path.exists(os.path.join(root_dir, dfile['name'])):
+ fileName = dfile["name"]
+ if dfile["name"].startswith('/'):
+ fileName = dfile["name"][1:]
+ if not check_exists or os.path.exists(os.path.join(root_dir, fileName)):
dfile['root'] = root_dir
self.logger.debug('Copy file instead of downloading it: %s' % (os.path.join(root_dir, dfile['name'])))
self.files_to_copy.append(dfile)
@@ -293,7 +469,10 @@
else:
# Copy everything
for dfile in self.files_to_download:
- if not check_exists or os.path.exists(os.path.join(root_dir, dfile['name'])):
+ fileName = dfile["name"]
+ if dfile["name"].startswith('/'):
+ fileName = dfile["name"][1:]
+ if not check_exists or os.path.exists(os.path.join(root_dir, fileName)):
dfile['root'] = root_dir
self.files_to_copy.append(dfile)
else:
@@ -304,7 +483,28 @@
def _download(self, file_path, rfile):
'''
Download one file and return False in case of success and True
- otherwise. This must be implemented in subclasses.
+ otherwise.
+
+ Subclasses that override this method must call this implementation
+ at the end to perform the archive check.
+
+ Note that this method is executed inside a retryer.
+ '''
+ error = False
+ # Check that the archive is correct
+ if not self.skip_check_uncompress:
+ archive_status = Utils.archive_check(file_path)
+ if not archive_status:
+ self.logger.error('Archive is invalid or corrupted, deleting file and retrying download')
+ error = True
+ if os.path.exists(file_path):
+ os.remove(file_path)
+ return error
+
+ def _network_configuration(self):
+ '''
+ Perform some configuration before network operations (list and
+ download). This must be implemented in subclasses.
'''
raise NotImplementedError()
@@ -319,6 +519,7 @@
:return: list of downloaded files
'''
self.logger.debug(self.__class__.__name__ + ':Download')
+ self._network_configuration()
nb_files = len(self.files_to_download)
cur_files = 1
self.offline_dir = local_dir
@@ -346,7 +547,7 @@
cur_files += 1
start_time = datetime.datetime.now()
start_time = time.mktime(start_time.timetuple())
- error = self._download(file_path, rfile)
+ error = self.retryer(self._download, file_path, rfile)
if error:
rfile['download_time'] = 0
rfile['error'] = True
diff -Nru biomaj3-download-3.1.0/biomaj_download/download/localcopy.py biomaj3-download-3.2.4/biomaj_download/download/localcopy.py
--- biomaj3-download-3.1.0/biomaj_download/download/localcopy.py 2019-10-21 06:47:18.000000000 +0000
+++ biomaj3-download-3.2.4/biomaj_download/download/localcopy.py 2020-12-23 05:49:33.000000000 +0000
@@ -8,13 +8,16 @@
class LocalDownload(DownloadInterface):
'''
- Base class to copy file from local system
+ Base class to copy files from the local system.
protocol=cp
server=localhost
remote.dir=/blast/db/FASTA/
remote.files=^alu.*\\.gz$
+
+ Note that we redefine download and list in such a way that we don't need to
+ define _download and _network_configuration.
'''
def __init__(self, rootdir, use_hardlinks=False):
@@ -57,7 +60,12 @@
rfiles = []
rdirs = []
- files = [f for f in os.listdir(self.rootdir + directory)]
+ try:
+ files = [f for f in os.listdir(self.rootdir + directory)]
+ except Exception as e:
+ msg = 'Error while listing ' + self.rootdir + ' - ' + str(e)
+ self.logger.error(msg)
+ raise e
for file_in_files in files:
rfile = {}
fstat = os.stat(os.path.join(self.rootdir + directory, file_in_files))
diff -Nru biomaj3-download-3.1.0/biomaj_download/download/protocolirods.py biomaj3-download-3.2.4/biomaj_download/download/protocolirods.py
--- biomaj3-download-3.1.0/biomaj_download/download/protocolirods.py 2019-10-21 06:47:18.000000000 +0000
+++ biomaj3-download-3.2.4/biomaj_download/download/protocolirods.py 2020-12-23 05:49:33.000000000 +0000
@@ -1,5 +1,6 @@
from biomaj_download.download.interface import DownloadInterface
from irods.session import iRODSSession
+from irods.exception import iRODSException
from irods.models import DataObject, User
@@ -34,37 +35,48 @@
self.port = int(param['port'])
def list(self, directory=''):
- session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone)
+ self._network_configuration()
rfiles = []
rdirs = []
rfile = {}
date = None
- query = session.query(DataObject.name, DataObject.size,
- DataObject.owner_name, DataObject.modify_time)
- results = query.filter(User.name == self.user).get_results()
- for result in results:
- # Avoid duplication
- if rfile != {} and rfile['name'] == str(result[DataObject.name]) \
- and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'):
- continue
- rfile = {}
- date = str(result[DataObject.modify_time]).split(" ")[0].split('-')
- rfile['permissions'] = "-rwxr-xr-x"
- rfile['size'] = int(result[DataObject.size])
- rfile['month'] = int(date[1])
- rfile['day'] = int(date[2])
- rfile['year'] = int(date[0])
- rfile['name'] = str(result[DataObject.name])
- rfiles.append(rfile)
- session.cleanup()
+ # Note that iRODS raises errors when trying to use the results
+ # and not after query(). Therefore, the whole loop is inside
+ # try/except.
+ try: + query = self.session.query(DataObject.name, DataObject.size, + DataObject.owner_name, DataObject.modify_time) + results = query.filter(User.name == self.user).get_results() + for result in results: + # Avoid duplication + if rfile != {} and rfile['name'] == str(result[DataObject.name]) \ + and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'): + continue + rfile = {} + date = str(result[DataObject.modify_time]).split(" ")[0].split('-') + rfile['permissions'] = "-rwxr-xr-x" + rfile['size'] = int(result[DataObject.size]) + rfile['month'] = int(date[1]) + rfile['day'] = int(date[2]) + rfile['year'] = int(date[0]) + rfile['name'] = str(result[DataObject.name]) + rfiles.append(rfile) + except Exception as e: + msg = 'Error while listing ' + self.remote_dir + ' - ' + repr(e) + self.logger.error(msg) + raise e + finally: + self.session.cleanup() return (rfiles, rdirs) - def _download(self, file_dir, rfile): + def _network_configuration(self): + self.session = iRODSSession(host=self.server, port=self.port, + user=self.user, password=self.password, + zone=self.zone) + + def _download(self, file_path, rfile): error = False self.logger.debug('IRODS:IRODS DOWNLOAD') - session = iRODSSession(host=self.server, port=self.port, - user=self.user, password=self.password, - zone=self.zone) try: # iRODS don't like multiple "/" if rfile['root'][-1] == "/": @@ -73,17 +85,14 @@ file_to_get = rfile['root'] + "/" + rfile['name'] # Write the file to download in the wanted file_dir with the # python-irods iget - obj = session.data_objects.get(file_to_get, file_dir) - except ExceptionIRODS as e: - self.logger.error(self.__class__.__name__ + ":Download:Error:Can't get irods object " + str(obj)) - self.logger.error(self.__class__.__name__ + ":Download:Error:" + str(e)) - session.cleanup() - return(error) - + self.session.data_objects.get(file_to_get, file_path) + except iRODSException as e: + error = True + self.logger.error(self.__class__.__name__ + ":Download:Error:Can't get irods object " + file_to_get) + self.logger.error(self.__class__.__name__ + ":Download:Error:" + repr(e)) -class ExceptionIRODS(Exception): - def __init__(self, exception_reason): - self.exception_reason = exception_reason + if error: + return error - def __str__(self): - return self.exception_reason + # Our part is done so call parent _download + return super(IRODSDownload, self)._download(file_path, rfile) diff -Nru biomaj3-download-3.1.0/biomaj_download/download/rsync.py biomaj3-download-3.2.4/biomaj_download/download/rsync.py --- biomaj3-download-3.1.0/biomaj_download/download/rsync.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/biomaj_download/download/rsync.py 2020-12-23 05:49:33.000000000 +0000 @@ -2,7 +2,6 @@ # standard_library.install_aliases() # from builtins import str import re -import os import subprocess from biomaj_download.download.interface import DownloadInterface @@ -33,12 +32,6 @@ else: self.server = None self.rootdir = server - # give a working directory to run rsync - if self.local_mode: - try: - os.chdir(self.rootdir) - except TypeError: - self.logger.error("RSYNC:Could not find local dir " + self.rootdir) def _append_file_to_download(self, rfile): if 'root' not in rfile or not rfile['root']: @@ -51,7 +44,14 @@ url = rfile['root'] + "/" + rfile['name'] if not self.local_mode: url = self.server + ":" + url - return url + return re.sub("/{2,}", "/", url) + + def _network_configuration(self): + ''' + Perform some configuration before network operations (list and + download). 
+ ''' + pass def _download(self, file_path, rfile): error = False @@ -62,7 +62,7 @@ cmd = str(self.real_protocol) + " " + str(self.credentials) + "@" + url + " " + str(file_path) else: cmd = str(self.real_protocol) + " " + url + " " + str(file_path) - self.logger.debug('RSYNC:RSYNC DOwNLOAD:' + cmd) + self.logger.debug('RSYNC:RSYNC DOWNLOAD:' + cmd) # Launch the command (we are in offline_dir) try: p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) @@ -75,7 +75,11 @@ if err_code != 0: self.logger.error('Error while downloading ' + rfile["name"] + ' - ' + str(err_code)) error = True - return(error) + if error: + return error + + # Our part is done so call parent _download + return super(RSYNCDownload, self)._download(file_path, rfile) def test_stderr_rsync_error(self, stderr): stderr = str(stderr.decode('utf-8')) @@ -105,18 +109,21 @@ remote = str(self.server) + ":" + str(self.rootdir) + str(directory) if self.credentials: remote = str(self.credentials) + "@" + remote - cmd = str(self.real_protocol) + " --list-only " + remote + cmd = str(self.real_protocol) + " --list-only --no-motd " + remote try: p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) list_rsync, err = p.communicate() self.test_stderr_rsync_message(err) self.test_stderr_rsync_error(err) err_code = p.returncode + if err_code != 0: + msg = 'Error while listing ' + remote + ' - ' + str(err_code) + self.logger.error(msg) + raise Exception(msg) except ExceptionRsync as e: - self.logger.error("RsyncError:" + str(e)) - if err_code != 0: - self.logger.error('Error while listing ' + str(err_code)) - return(rfiles, rdirs) + msg = 'Error while listing ' + remote + ' - ' + str(e) + self.logger.error(msg) + raise e list_rsync = str(list_rsync.decode('utf-8')) lines = list_rsync.rstrip().split("\n") for line in lines: diff -Nru biomaj3-download-3.1.0/biomaj_download/downloadclient.py biomaj3-download-3.2.4/biomaj_download/downloadclient.py --- biomaj3-download-3.1.0/biomaj_download/downloadclient.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/biomaj_download/downloadclient.py 2020-12-23 05:49:33.000000000 +0000 @@ -3,17 +3,13 @@ import logging import uuid import time -import sys +from queue import Queue + import pika from biomaj_download.download.downloadthreads import DownloadThread from biomaj_download.message import downmessage_pb2 -if sys.version_info[0] < 3: - from Queue import Queue -else: - from queue import Queue - class DownloadClient(DownloadService): diff -Nru biomaj3-download-3.1.0/biomaj_download/downloadservice.py biomaj3-download-3.2.4/biomaj_download/downloadservice.py --- biomaj3-download-3.1.0/biomaj_download/downloadservice.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/biomaj_download/downloadservice.py 2020-12-23 05:49:33.000000000 +0000 @@ -85,7 +85,7 @@ self.bank = None self.download_callback = None with open(config_file, 'r') as ymlfile: - self.config = yaml.load(ymlfile) + self.config = yaml.load(ymlfile, Loader=yaml.FullLoader) Utils.service_config_override(self.config) Zipkin.set_config(self.config) @@ -130,7 +130,7 @@ credentials=None, http_parse=None, http_method=None, param=None, proxy=None, proxy_auth='', save_as=None, timeout_download=None, offline_dir=None, - protocol_options={}): + options={}): protocol = downmessage_pb2.DownloadFile.Protocol.Value(protocol_name.upper()) downloader = None if protocol in [0, 1]: # FTP, SFTP @@ -190,9 +190,9 @@ # Set the 
name of the BioMAJ protocol to which we respond.
downloader.set_protocol(protocol_name)
- if protocol_options is not None:
- self.logger.debug("Received protocol options: " + str(protocol_options))
- downloader.set_options(protocol_options)
+ if options is not None:
+ self.logger.debug("Received options: " + str(options))
+ downloader.set_options(options)
downloader.logger = self.logger
downloader.set_files_to_download(remote_files)
@@ -243,7 +243,7 @@
save_as=biomaj_file_info.remote_file.save_as,
timeout_download=biomaj_file_info.timeout_download,
offline_dir=biomaj_file_info.local_dir,
- protocol_options=biomaj_file_info.protocol_options
+ options=biomaj_file_info.options
)
def clean(self, biomaj_file_info=None):
diff -Nru biomaj3-download-3.1.0/biomaj_download/message/downmessage_pb2.py biomaj3-download-3.2.4/biomaj_download/message/downmessage_pb2.py
--- biomaj3-download-3.1.0/biomaj_download/message/downmessage_pb2.py 2019-10-21 06:47:18.000000000 +0000
+++ biomaj3-download-3.2.4/biomaj_download/message/downmessage_pb2.py 2020-12-23 05:49:33.000000000 +0000
@@ -19,7 +19,7 @@
package='biomaj.download',
syntax='proto2',
serialized_options=None,
- serialized_pb=_b('\n\x11\x64ownmessage.proto\x12\x0f\x62iomaj.download\"\x9d\x02\n\x04\x46ile\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0c\n\x04root\x18\x02 \x01(\t\x12\x0f\n\x07save_as\x18\x03 \x01(\t\x12\x0b\n\x03url\x18\x04 \x01(\t\x12\x30\n\x08metadata\x18\x05 \x01(\x0b\x32\x1e.biomaj.download.File.MetaData\x1a\xa8\x01\n\x08MetaData\x12\x13\n\x0bpermissions\x18\x01 \x01(\t\x12\r\n\x05group\x18\x02 \x01(\t\x12\x0c\n\x04size\x18\x03 \x01(\x03\x12\x0c\n\x04hash\x18\x04 \x01(\t\x12\x0c\n\x04year\x18\x05 \x01(\x05\x12\r\n\x05month\x18\x06 \x01(\x05\x12\x0b\n\x03\x64\x61y\x18\x07 \x01(\x05\x12\x0e\n\x06\x66ormat\x18\x08 \x01(\t\x12\x0b\n\x03md5\x18\t \x01(\t\x12\x15\n\rdownload_time\x18\n \x01(\x03\"0\n\x08\x46ileList\x12$\n\x05\x66iles\x18\x01 \x03(\x0b\x32\x15.biomaj.download.File\"\xaa\x02\n\tOperation\x12\x32\n\x04type\x18\x01 \x02(\x0e\x32$.biomaj.download.Operation.OPERATION\x12/\n\x08\x64ownload\x18\x02 \x01(\x0b\x32\x1d.biomaj.download.DownloadFile\x12)\n\x07process\x18\x03 \x01(\x0b\x32\x18.biomaj.download.Process\x12/\n\x05trace\x18\x04 \x01(\x0b\x32 .biomaj.download.Operation.Trace\x1a*\n\x05Trace\x12\x10\n\x08trace_id\x18\x01 \x02(\t\x12\x0f\n\x07span_id\x18\x02 \x02(\t\"0\n\tOPERATION\x12\x08\n\x04LIST\x10\x00\x12\x0c\n\x08\x44OWNLOAD\x10\x01\x12\x0b\n\x07PROCESS\x10\x02\"\x17\n\x07Process\x12\x0c\n\x04\x65xec\x18\x01 \x02(\t\"\xad\x0b\n\x0c\x44ownloadFile\x12\x0c\n\x04\x62\x61nk\x18\x01 \x02(\t\x12\x0f\n\x07session\x18\x02 \x02(\t\x12\x11\n\tlocal_dir\x18\x03 \x02(\t\x12\x18\n\x10timeout_download\x18\x04 \x01(\x05\x12=\n\x0bremote_file\x18\x05 \x02(\x0b\x32(.biomaj.download.DownloadFile.RemoteFile\x12\x32\n\x05proxy\x18\x06 \x01(\x0b\x32#.biomaj.download.DownloadFile.Proxy\x12\x43\n\x0bhttp_method\x18\x08 \x01(\x0e\x32).biomaj.download.DownloadFile.HTTP_METHOD:\x03GET\x12L\n\x10protocol_options\x18\t \x03(\x0b\x32\x32.biomaj.download.DownloadFile.ProtocolOptionsEntry\x1a$\n\x05Param\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\r\n\x05value\x18\x02 \x02(\t\x1a\xcd\x03\n\tHttpParse\x12\x91\x01\n\x08\x64ir_line\x18\x01 \x02(\t:\x7f[\\s]*.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})\x12\xa5\x01\n\tfile_line\x18\x02 \x02(\t:\x91\x01[\\s][\\s]*.*([\\d]{2}-[\\w\\d]{2,5}-[\\d]{4}\\s[\\d]{2}:[\\d]{2})\x12\xa5\x01\n\tfile_line\x18\x02 \x02(\t:\x91\x01[\\s]
- map<string, string> protocol_options = 9;
+
+ map<string, string> options = 9;
}
biomaj3-download-3.1.0/CHANGES.txt biomaj3-download-3.2.4/CHANGES.txt
--- biomaj3-download-3.1.0/CHANGES.txt 2019-10-21 06:47:18.000000000 +0000
+++ biomaj3-download-3.2.4/CHANGES.txt 2020-12-23 05:49:33.000000000 +0000
@@ -1,34 +1,80 @@
+3.2.4:
+ #39 directhttp download issues
+ biomaj sends file name instead of file dict, work around this
+ #28 CurlDownload crashes if cURL doesn't support SFTP
+ Minor python doc and error message updates
+ Suppress yaml warnings
+
+3.2.3:
+ #30: raise errors when something in list() fails
+ DirectHTTP(s)/DirectFTP(s): do not raise error on list step as HEAD may not be supported
+ #35: allow redirections (closes #33)
+
+3.2.2:
+ #31 fix URL with multiple slashes
+ Update demo password for ftps web site tests
+ Remove python2 support
+
+3.2.1:
+ #26 Accept new keys for SFTP servers (Closes #25)
+ Strip extra slash characters in remote file list (due to regexp parsing)
+ #20 Add a configurable mechanism to retry download when it fails
+ #24 Speed up IRODSDownload
+ Introduce a method to perform configuration before network methods. Adapt implementation of generic methods and subclasses.
+ Resolve a bug where the parser also analysed the Message Of The Day when it only wanted the list of files. (#23)
+
+3.1.2:
+ #18 Add a protocol option to set CURLOPT_FTP_FILEMETHOD
+ #19 Rename protocol options to options
+ Fix copy of production files instead of download when files are in subdirectories
+
+3.1.1:
+ #17 Support MDTM command in directftp
+
3.1.0:
#16 Don't change name after download in DirectHTTPDownloader
PR #7 Refactor downloaders (*WARNING* breaks API)
+
3.0.27:
Fix previous release broken with a bug in direct protocols
+
3.0.26:
Change default download timeout to 1h
#12 Allow FTPS protocol
#14 Add mechanism for protocol specific options
+
3.0.25:
Allow to use hardlinks in LocalDownload
+
3.0.24:
Remove debug logs
+
3.0.23:
Support spaces in remote file names
+
3.0.22:
Fix **/* remote.files parsing
+
3.0.21:
Fix traefik labels
+
3.0.20:
Update pika dependency release
Add tags for traefik support
+
3.0.19:
Check archives after download
Fix python regexps syntax (deprecation)
+
3.0.18:
Rename protobuf and use specific package to avoid conflicts
+
3.0.17:
Regenerate protobuf message desc, failing on python3
+
3.0.16:
Add missing req in setup.py
+
3.0.15:
Fix progress download control where could have infinite loop
Add irods download
diff -Nru biomaj3-download-3.1.0/debian/changelog biomaj3-download-3.2.4/debian/changelog
--- biomaj3-download-3.1.0/debian/changelog 2019-11-12 10:18:15.000000000 +0000
+++ biomaj3-download-3.2.4/debian/changelog 2021-01-17 10:28:54.000000000 +0000
@@ -1,13 +1,41 @@
+biomaj3-download (3.2.4-1) unstable; urgency=medium
+
+ [ Michael R. Crusoe ]
+ * Team upload.
+ * New upstream version
+ * debhelper-compat 12
+ * Standards-Version: 4.4.1
+ * Respect DEB_BUILD_OPTIONS in override_dh_auto_test target
+ * Remove trailing whitespace in debian/changelog
+ * Remove empty debian/patches/series.
+ * Move the autodep8 autopkgtest to an explicit one, as
+ the module name (biomaj-download) doesn't match the package name
+ (python3-biomaj3-download).
+
+ [ Andreas Tille ]
+ * Standards-Version: 4.5.1 (routine-update)
+ * debhelper-compat 13 (routine-update)
+ * Testsuite: autopkgtest-pkg-python (routine-update)
+ * Add salsa-ci file (routine-update)
+ * Rules-Requires-Root: no (routine-update)
+ * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository,
+ Repository-Browse.
+ * Build-Depends: python3-tenacity, python3-simpleeval + * Bump versioned Depends from python3-biomaj3-core + * Lintian override for script-with-language-extension + + -- Andreas Tille Sun, 17 Jan 2021 11:28:54 +0100 + biomaj3-download (3.1.0-1) unstable; urgency=medium [ Olivier Sallou ] - * New upstream release + * New upstream release -- Olivier Sallou Tue, 12 Nov 2019 10:18:15 +0000 biomaj3-download (3.0.21-1) unstable; urgency=medium - * New upstream release + * New upstream release * d/rules: use DEB_BUILD_MAINT_OPTIONS * d/patches: remove irods protocol support as python lib is not available in Debian. diff -Nru biomaj3-download-3.1.0/debian/compat biomaj3-download-3.2.4/debian/compat --- biomaj3-download-3.1.0/debian/compat 2019-11-12 10:18:15.000000000 +0000 +++ biomaj3-download-3.2.4/debian/compat 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -12 diff -Nru biomaj3-download-3.1.0/debian/control biomaj3-download-3.2.4/debian/control --- biomaj3-download-3.1.0/debian/control 2019-11-12 10:18:15.000000000 +0000 +++ biomaj3-download-3.2.4/debian/control 2021-01-17 10:28:54.000000000 +0000 @@ -4,7 +4,7 @@ Section: python Testsuite: autopkgtest-pkg-python Priority: optional -Build-Depends: debhelper (>= 12~), +Build-Depends: debhelper-compat (= 13), dh-python, protobuf-compiler, python3-all, @@ -24,22 +24,23 @@ python3-requests, python3-setuptools, python3-yaml, - python3-biomaj3-core (>= 3.0.19), + python3-biomaj3-core (>= 3.0.26), python3-biomaj3-zipkin, python3-ftputil, + python3-tenacity , + python3-simpleeval , rsync -Standards-Version: 4.3.0 +Standards-Version: 4.5.1 Vcs-Browser: https://salsa.debian.org/med-team/biomaj3-download Vcs-Git: https://salsa.debian.org/med-team/biomaj3-download.git Homepage: https://github.com/genouest/biomaj-download +Rules-Requires-Root: no Package: python3-biomaj3-download Architecture: all Depends: ${misc:Depends}, ${python3:Depends} -Recommends: ${python3:Recommends} -Suggests: ${python3:Suggests}, - python3-gunicorn, +Suggests: python3-gunicorn, mongodb, redis-server Description: BioMAJ download management library @@ -53,4 +54,3 @@ . This package contains the library and microservice to manage downloads in BioMAJ3 -XB-Python-Egg-Name: biomaj-download diff -Nru biomaj3-download-3.1.0/debian/lintian-overrides biomaj3-download-3.2.4/debian/lintian-overrides --- biomaj3-download-3.1.0/debian/lintian-overrides 1970-01-01 00:00:00.000000000 +0000 +++ biomaj3-download-3.2.4/debian/lintian-overrides 2021-01-17 10:28:54.000000000 +0000 @@ -0,0 +1,2 @@ +# see https://lists.debian.org/debian-med/2018/06/msg00043.html +python3-biomaj3-download: script-with-language-extension usr/bin/*.* diff -Nru biomaj3-download-3.1.0/debian/patches/remove_irods.patch biomaj3-download-3.2.4/debian/patches/remove_irods.patch --- biomaj3-download-3.1.0/debian/patches/remove_irods.patch 2019-11-12 10:18:15.000000000 +0000 +++ biomaj3-download-3.2.4/debian/patches/remove_irods.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,135 +0,0 @@ -Subject: python irods not available, remove it from supported protocols -Description: biomaj supports irods as download protocol but irods is not - available in Debian. 
In the meanwhile remove support for this protocol -Author: Olivier Sallou -Last-Updated: 2019-03-09 -Forwarded: no ---- a/requirements.txt -+++ b/requirements.txt -@@ -14,4 +14,3 @@ - biomaj_zipkin - flake8 - humanfriendly --python-irodsclient ---- a/setup.py -+++ b/setup.py -@@ -54,8 +54,7 @@ - 'prometheus_client>=0.0.18', - 'protobuf', - 'requests', -- 'humanfriendly', -- 'python-irodsclient' -+ 'humanfriendly' - ], - 'tests_require': ['nose', 'mock'], - 'test_suite': 'nose.collector', ---- a/biomaj_download/download/protocolirods.py -+++ b/biomaj_download/download/protocolirods.py -@@ -5,8 +5,6 @@ - - from biomaj_core.utils import Utils - from biomaj_download.download.interface import DownloadInterface --from irods.session import iRODSSession --from irods.models import Collection, DataObject, User - - - class IRODSDownload(DownloadInterface): -@@ -31,27 +29,9 @@ - self.zone = str(param['zone']) - - def list(self, directory=''): -- session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone) - rfiles = [] - rdirs = [] -- rfile = {} -- date = None -- for result in session.query(Collection.name, DataObject.name, DataObject.size, DataObject.owner_name, DataObject.modify_time).filter(User.name == self.user).get_results(): -- # if the user is biomaj : he will have access to all the irods data (biomaj ressource) : drwxr-xr-x -- # Avoid duplication -- if rfile != {} and rfile['name'] == str(result[DataObject.name]) and date == str(result[DataObject.modify_time]).split(" ")[0].split('-'): -- continue -- rfile = {} -- date = str(result[DataObject.modify_time]).split(" ")[0].split('-') -- rfile['permissions'] = "-rwxr-xr-x" -- rfile['size'] = int(result[DataObject.size]) -- rfile['month'] = int(date[1]) -- rfile['day'] = int(date[2]) -- rfile['year'] = int(date[0]) -- rfile['name'] = str(result[DataObject.name]) -- rfile['download_path'] = str(result[Collection.name]) -- rfiles.append(rfile) -- session.cleanup() -+ raise Exception("IRODS:NotSupported") - return (rfiles, rdirs) - - def download(self, local_dir, keep_dirs=True): -@@ -65,67 +45,10 @@ - :return: list of downloaded files - ''' - logging.debug('IRODS:Download') -- try: -- os.chdir(local_dir) -- except TypeError: -- logging.error("IRODS:list:Could not find offline_dir") -- nb_files = len(self.files_to_download) -- cur_files = 1 -- # give a working directory to copy the file from irods -- remote_dir = self.remote_dir -- for rfile in self.files_to_download: -- if self.kill_received: -- raise Exception('Kill request received, exiting') -- file_dir = local_dir -- if 'save_as' not in rfile or rfile['save_as'] is None: -- rfile['save_as'] = rfile['name'] -- if keep_dirs: -- file_dir = local_dir + os.path.dirname(rfile['save_as']) -- file_path = file_dir + '/' + os.path.basename(rfile['save_as']) -- # For unit tests only, workflow will take in charge directory creation before to avoid thread multi access -- if not os.path.exists(file_dir): -- os.makedirs(file_dir) -- -- logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' downloading file ' + rfile['name']) -- logging.debug('IRODS:Download:Progress:' + str(cur_files) + '/' + str(nb_files) + ' save as ' + rfile['save_as']) -- cur_files += 1 -- start_time = datetime.now() -- start_time = time.mktime(start_time.timetuple()) -- self.remote_dir = rfile['root'] -- error = self.irods_download(file_dir, str(self.remote_dir), str(rfile['name'])) -- if error: -- rfile['download_time'] = 0 -- rfile['error'] = True -- raise 
Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name']) -- else: -- archive_status = Utils.archive_check(file_path) -- if not archive_status: -- self.logger.error('Archive is invalid or corrupted, deleting file') -- rfile['error'] = True -- if os.path.exists(file_path): -- os.remove(file_path) -- raise Exception("IRODS:Download:Error:" + rfile['root'] + '/' + rfile['name']) -- -- end_time = datetime.now() -- end_time = time.mktime(end_time.timetuple()) -- rfile['download_time'] = end_time - start_time -- self.set_permissions(file_path, rfile) -- self.remote_dir = remote_dir -- return(self.files_to_download) -+ raise Exception("IRODS:NotSupported") - - def irods_download(self, file_dir, file_path, file_to_download): -- error = False -- logging.debug('IRODS:IRODS DOWNLOAD') -- session = iRODSSession(host=self.server, port=self.port, user=self.user, password=self.password, zone=self.zone) -- try: -- file_to_get = str(file_path) + str(file_to_download) -- # Write the file to download in the wanted file_dir : with the python-irods iget -- obj = session.data_objects.get(file_to_get, file_dir) -- except ExceptionIRODS as e: -- logging.error("RsyncError:" + str(e)) -- logging.error("RsyncError: irods object" + str(obj)) -- session.cleanup() -- return(error) -+ return("irods not supported") - - - class ExceptionIRODS(Exception): diff -Nru biomaj3-download-3.1.0/debian/py3dist-overrides biomaj3-download-3.2.4/debian/py3dist-overrides --- biomaj3-download-3.1.0/debian/py3dist-overrides 1970-01-01 00:00:00.000000000 +0000 +++ biomaj3-download-3.2.4/debian/py3dist-overrides 2021-01-17 10:28:54.000000000 +0000 @@ -0,0 +1 @@ +py_bcrypt python3-bcrypt diff -Nru biomaj3-download-3.1.0/debian/rules biomaj3-download-3.2.4/debian/rules --- biomaj3-download-3.1.0/debian/rules 2019-11-12 10:18:15.000000000 +0000 +++ biomaj3-download-3.2.4/debian/rules 2021-01-17 10:28:54.000000000 +0000 @@ -16,4 +16,6 @@ sed -i '1s;^;#!/usr/bin/python3\n;' debian/python3-biomaj3-download/usr/bin/biomaj_download_consumer.py override_dh_auto_test: +ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) nosetests3 -a !network +endif diff -Nru biomaj3-download-3.1.0/debian/salsa-ci.yml biomaj3-download-3.2.4/debian/salsa-ci.yml --- biomaj3-download-3.1.0/debian/salsa-ci.yml 1970-01-01 00:00:00.000000000 +0000 +++ biomaj3-download-3.2.4/debian/salsa-ci.yml 2021-01-17 10:28:54.000000000 +0000 @@ -0,0 +1,4 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff -Nru biomaj3-download-3.1.0/debian/test/control biomaj3-download-3.2.4/debian/test/control --- biomaj3-download-3.1.0/debian/test/control 1970-01-01 00:00:00.000000000 +0000 +++ biomaj3-download-3.2.4/debian/test/control 2021-01-17 10:28:54.000000000 +0000 @@ -0,0 +1,3 @@ +Test-Command: set -e ; for py in $(py3versions -r 2>/dev/null) ; do cd "$AUTOPKGTEST_TMP" ; echo "Testing with $py:" ; $py -c "import biomaj-download; print(biomaj-download)" ; done +Depends: python3-all, python3-biomaj3-download +Restrictions: allow-stderr, superficial diff -Nru biomaj3-download-3.1.0/debian/upstream/metadata biomaj3-download-3.2.4/debian/upstream/metadata --- biomaj3-download-3.1.0/debian/upstream/metadata 1970-01-01 00:00:00.000000000 +0000 +++ biomaj3-download-3.2.4/debian/upstream/metadata 2021-01-17 10:28:54.000000000 +0000 @@ -0,0 +1,5 @@ +--- +Bug-Database: https://github.com/genouest/biomaj-download/issues +Bug-Submit: 
https://github.com/genouest/biomaj-download/issues/new +Repository: https://github.com/genouest/biomaj-download.git +Repository-Browse: https://github.com/genouest/biomaj-download
diff -Nru biomaj3-download-3.1.0/README.md biomaj3-download-3.2.4/README.md --- biomaj3-download-3.1.0/README.md 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/README.md 2020-12-23 05:49:33.000000000 +0000 @@ -7,6 +7,8 @@ A protobuf interface is available in biomaj_download/message/message_pb2.py to exchange messages between BioMAJ and the download service. Messages go through RabbitMQ (to be installed). +Python 3 support only; Python 2 support has been dropped. + # Protobuf To compile protobuf, in biomaj_download/message:
@@ -15,7 +17,7 @@ # Development - flake8 biomaj_download/\*.py biomaj_download/download + flake8 --ignore E501 biomaj_download/\*.py biomaj_download/download # Test
@@ -58,3 +60,155 @@ Web processes should be behind a proxy/load balancer, API base url /api/download Prometheus endpoint metrics are exposed via /metrics on web server + +# Retrying + +A common problem when downloading a large number of files is the handling of temporary failures (network issues, a server too busy to answer, etc.). +Since version 3.1.2, `biomaj-download` uses the [Tenacity library](https://github.com/jd/tenacity), which is designed to handle this. +This mechanism is configurable through two downloader-specific options (see [Download options](#download-options)): **stop_condition** and **wait_policy**. + +When working in Python code, you can pass instances of Tenacity's `stop_base` and `wait_base` respectively. +This includes classes defined in Tenacity or your own derived classes. + +For bank configuration, those options also accept strings read from the configuration file. +This parsing is based on the [Simple Eval library](https://github.com/danthedeckie/simpleeval). +The rules are straightforward: + + * All concrete stop and wait classes defined in Tenacity (i.e. classes inheriting from `stop_base` and `wait_base` respectively) can be used + by calling their constructor with the expected parameters. + For example, the string `"stop_after_attempt(5)"` will create the desired object. + Note that stop and wait classes that need no argument must be used as constants (i.e. use `"stop_never"` and not `"stop_never()"`). + Currently, this is the case for `"stop_never"` (as in Tenacity) and `"wait_none"` (this slightly differs from Tenacity where it is `"wait_none()"`). + * You can use classes that combine other stop conditions (namely `stop_all` and `stop_any`) or wait policies (namely `wait_combine`). + * Operator `+` can be used to add wait policies (similar to `wait_combine`). + * Operators `&` and `|` can be used to compose stop conditions (similar to `stop_all` and `stop_any` respectively). + +However, in this case, you can't use your own conditions. +The complete list of stop conditions is: + +* `stop_never` (although its use is discouraged) +* `stop_after_attempt` +* `stop_after_delay` +* `stop_when_event_set` +* `stop_all` +* `stop_any` + +The complete list of wait policies is: + +* `wait_none` +* `wait_fixed` +* `wait_random` +* `wait_incrementing` +* `wait_exponential` +* `wait_random_exponential` +* `wait_combine` +* `wait_chain` + +Please refer to the [Tenacity doc](https://tenacity.readthedocs.io/en/latest/) for their meaning and their parameters.
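To make these parsing rules concrete, the following is a minimal sketch of how such a string could be evaluated with Simple Eval against a whitelist of Tenacity objects. It illustrates the mechanism only; it is not the actual `biomaj-download` implementation, and the helper name `parse_stop_condition` and the exact whitelists are assumptions.

    # Minimal sketch (NOT the actual biomaj-download implementation) of how a
    # string such as "stop_never | stop_after_attempt(5)" can be turned into a
    # Tenacity stop object with Simple Eval.
    import ast
    import operator

    import simpleeval
    import tenacity

    # Whitelists: callables take parameters, constants are used bare.
    STOP_FUNCTIONS = {name: getattr(tenacity, name) for name in (
        "stop_after_attempt", "stop_after_delay", "stop_when_event_set",
        "stop_all", "stop_any")}
    STOP_NAMES = {"stop_never": tenacity.stop_never}

    def parse_stop_condition(expression):  # hypothetical helper name
        evaluator = simpleeval.SimpleEval(functions=STOP_FUNCTIONS, names=STOP_NAMES)
        # Tenacity stop objects implement __and__ and __or__, so enabling the
        # bitwise operators is enough to support '&' and '|' in expressions.
        evaluator.operators[ast.BitAnd] = operator.and_
        evaluator.operators[ast.BitOr] = operator.or_
        result = evaluator.eval(expression)
        if not isinstance(result, tenacity.stop.stop_base):
            raise ValueError("%s is not a valid stop condition" % expression)
        return result

    parse_stop_condition("stop_never | stop_after_attempt(5)")  # a stop_any object

A symmetric helper for wait policies would whitelist the `wait_*` classes and map `ast.Add` to `operator.add` instead.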
+ +Examples (inspired by the Tenacity doc): + + * `"wait_fixed(3) + wait_random(0, 2)"` and `"wait_combine(wait_fixed(3), wait_random(0, 2))"` are equivalent and will wait 3 seconds + up to 2 seconds of random delay + * `"wait_chain(*([wait_fixed(3) for i in range(3)] + [wait_fixed(7) for i in range(2)] + [wait_fixed(9)]))"` will wait 3s for the first 3 attempts, 7s for the next 2 attempts and 9s for all attempts thereafter (here `+` is the list concatenation). + * `"wait_none + wait_random(1,2)"` will wait between 1s and 2s (since `wait_none` doesn't wait). + * `"stop_never | stop_after_attempt(5)"` will stop after 5 attempts (since `stop_never` never stops). + +Note that some protocols (e.g. FTP) classify errors as temporary or permanent (for example, trying to download a non-existent file). +More generally, we could distinguish permanent errors based on error codes, etc., and not retry in those cases. +However, in our experience, so-called permanent errors may well be temporary. +Therefore, downloaders always retry whatever the error. +In some cases this wastes time, but it is generally worth it. + +# Host keys + +When using the `sftp` protocol, `biomaj-download` must check the host key. +Those keys are stored in a file (for instance `~/.ssh/known_hosts`). + +Two options are available to configure this: + + - **ssh_hosts_file** which sets the file to use + - **ssh_new_host** which sets what to do for a new host + +When the host and the key are found in the file, the connection is accepted. +If the host is found but the key doesn't match, the connection is rejected +(this usually indicates a problem or a change of configuration on the remote server). +When the host is not found, the decision depends on the value of **ssh_new_host**: + + - `reject` means that the connection is rejected + - `accept` means that the connection is accepted + - `add` means that the connection is accepted and the key is added to the file + +See the description of the options in [Download options](#download-options). + +# Download options + +Since version 3.0.26, you can use the `set_options` method to pass a dictionary of downloader-specific options. +The following list shows some options and their effect (the option to set is the key and the parameter is the associated value): + + * **stop_condition**: + * parameter: an instance of Tenacity `stop_base` or a string (see [Retrying](#retrying)). + * downloader(s): all (except `LocalDownload`). + * effect: sets the condition on which we should stop retrying to download a file. + * default: `stop_after_attempt(3)` (i.e. stop after 3 attempts). + * note: introduced in version 3.2.1. + * **wait_policy**: + * parameter: an instance of Tenacity `wait_base` or a string (see [Retrying](#retrying)). + * downloader(s): all (except `LocalDownload`). + * effect: sets the wait policy between download attempts. + * default: `wait_fixed(3)` (i.e. wait 3 seconds between attempts). + * note: introduced in version 3.2.1. + * **skip_check_uncompress**: + * parameter: bool. + * downloader(s): all (except `LocalDownload`). + * effect: if true, don't test the archives after download. + * default: false (i.e. test the archives). + * **ssl_verifyhost**: + * parameter: bool. + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`). + * effect: if false, don't check that the name of the remote server is the same as in the SSL certificate. + * default: true (i.e. check host name). + * note: it's generally a bad idea to disable this verification.
However, some servers are badly configured. See [here](https://curl.haxx.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html) for the corresponding cURL option. + * **ssl_verifypeer**: + * parameter: bool. + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`). + * effect: if false, don't check the authenticity of the peer's certificate. + * default: true (i.e. check authenticity). + * note: it's generally a bad idea to disable this verification. However, some servers are badly configured. See [here](https://curl.haxx.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html) for the corresponding cURL option. + * **ssl_server_cert**: + * parameter: path of the certificate file. + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`). + * effect: use the certificate(s) in this file to verify the peer. + * default: use OS certificates. + * note: see [here](https://curl.haxx.se/libcurl/c/CURLOPT_CAINFO.html) for the corresponding cURL option. + * **tcp_keepalive**: + * parameter: int. + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`). + * effect: sets the interval, in seconds, that the operating system will wait between sending keepalive probes. + * default: cURL default (60s at the time of this writing). + * note: see [here](https://curl.haxx.se/libcurl/c/CURLOPT_TCP_KEEPINTVL.html) for the corresponding cURL option. + * **ftp_method**: + * parameter: one of `default`, `multicwd`, `nocwd`, `singlecwd` (case insensitive). + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`) - only used for `FTP(S)`. + * effect: sets the method used to reach a file on a FTP(S) server (`nocwd` and `singlecwd` are usually faster but not always supported). + * default: `default` (which is `multicwd` at the time of this writing, as in cURL). + * note: see [here](https://curl.haxx.se/libcurl/c/CURLOPT_FTP_FILEMETHOD.html) for the corresponding cURL option; introduced in version 3.1.2. + * **ssh_hosts_file**: + * parameter: path of the known hosts file. + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`) - only used for `SFTP`. + * effect: sets the file used to read/store host keys for `SFTP`. + * default: `~/.ssh/known_hosts` (where `~` is the home directory of the current user). + * note: see [here](https://curl.haxx.se/libcurl/c/CURLOPT_SSH_KNOWNHOSTS.html) for the corresponding cURL option and the option below; introduced in version 3.2.1. + * **ssh_new_host**: + * parameter: one of `reject`, `accept`, `add`. + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`) - only used for `SFTP`. + * effect: sets the policy to use for an unknown host. + * default: `reject` (i.e. refuse new hosts; you must add them to the file beforehand, for instance with `ssh` or `sftp`). + * note: see [here](https://curl.haxx.se/libcurl/c/CURLOPT_SSH_KEYFUNCTION.html) for the corresponding cURL option and the option above; introduced in version 3.2.1. + * **allow_redirections**: + * parameter: bool. + * downloader(s): `CurlDownload` (and derived classes: `DirectFTPDownload`, `DirectHTTPDownload`) - only used for `HTTP(S)`. + * effect: sets the policy for `HTTP` redirections. + * default: `true` (i.e. follow redirections). + * note: see [here](https://curl.haxx.se/libcurl/c/CURLOPT_FOLLOWLOCATION.html) for the corresponding cURL option; introduced in version 3.2.3. + +Those options can be set in bank properties.
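In Python code, the same options are passed directly to `set_options`. The following is a minimal usage sketch; the server, credentials and paths are placeholders, and the calls mirror those exercised by the test suite (`set_credentials`, `set_options`, `list`, `match`, `download`).

    # Hypothetical usage sketch: server, credentials and paths are placeholders.
    from biomaj_download.download.curl import CurlDownload

    downloader = CurlDownload("sftp", "sftp.example.org", "/pub/")
    downloader.set_credentials("user:password")
    downloader.set_options({
        "stop_condition": "stop_after_attempt(5)",           # retry at most 5 times
        "wait_policy": "wait_fixed(3) + wait_random(0, 2)",  # 3s plus random jitter
        "ssh_hosts_file": "/etc/biomaj/known_hosts",         # where host keys are kept
        "ssh_new_host": "add",                               # accept and record new hosts
    })
    (file_list, dir_list) = downloader.list()
    downloader.match([r"^readme\.txt$"], file_list, dir_list)
    downloader.download("/tmp/biomaj-data")
    downloader.close()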
+See file `global.properties.example` in [biomaj module](https://github.com/genouest/biomaj). diff -Nru biomaj3-download-3.1.0/requirements.txt biomaj3-download-3.2.4/requirements.txt --- biomaj3-download-3.1.0/requirements.txt 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/requirements.txt 2020-12-23 05:49:33.000000000 +0000 @@ -15,3 +15,5 @@ flake8 humanfriendly python-irodsclient +simpleeval +tenacity diff -Nru biomaj3-download-3.1.0/setup.py biomaj3-download-3.2.4/setup.py --- biomaj3-download-3.1.0/setup.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/setup.py 2020-12-23 05:49:33.000000000 +0000 @@ -22,8 +22,8 @@ 'url': 'http://biomaj.genouest.org', 'download_url': 'http://biomaj.genouest.org', 'author_email': 'olivier.sallou@irisa.fr', - 'version': '3.1.0', - 'classifiers': [ + 'version': '3.2.4', + 'classifiers': [ # How mature is this project? Common values are # 3 - Alpha # 4 - Beta @@ -39,14 +39,17 @@ 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)', # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4' + 'Programming Language :: Python :: 3 :: Only', + 'Programming Language :: Python :: 3.6' ], + 'python_requires': '>=3.6, <4', 'install_requires': [ 'biomaj_core', 'biomaj_zipkin', 'pycurl', 'ftputil', + 'tenacity', + 'simpleeval', 'py-bcrypt', 'pika==0.13.0', 'redis', diff -Nru biomaj3-download-3.1.0/tests/biomaj_tests.py biomaj3-download-3.2.4/tests/biomaj_tests.py --- biomaj3-download-3.1.0/tests/biomaj_tests.py 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/tests/biomaj_tests.py 2020-12-23 05:49:33.000000000 +0000 @@ -1,5 +1,9 @@ """ Note that attributes 'network' and 'local_irods' are ignored for CI. + +To run 'local_irods' tests, you need an iRODS server on localhost (default port, +user 'rods', password 'rods') and a zone /tempZone/home/rods. See +UtilsForLocalIRODSTest. """ from nose.plugins.attrib import attr @@ -12,8 +16,11 @@ from mock import patch +from irods.session import iRODSSession + from biomaj_core.config import BiomajConfig from biomaj_core.utils import Utils +from biomaj_download.download.interface import DownloadInterface from biomaj_download.download.curl import CurlDownload, HTTPParse from biomaj_download.download.direct import DirectFTPDownload, DirectHTTPDownload from biomaj_download.download.localcopy import LocalDownload @@ -21,6 +28,8 @@ from biomaj_download.download.protocolirods import IRODSDownload import unittest +import tenacity + class UtilsForTest(): """ @@ -63,6 +72,11 @@ if self.bank_properties is None: self.__copy_test_bank_properties() + # Create an invalid archive file (empty file). This is deleted by clean(). + # See TestBiomajRSYNCDownload.test_rsync_download_skip_check_uncompress. 
+ self.invalid_archive = os.path.join(self.test_dir, 'invalid.gz') + open(self.invalid_archive, 'w').close() + def clean(self): """ Deletes temp directory @@ -72,13 +86,15 @@ def __copy_test_bank_properties(self): if self.bank_properties is not None: return - self.bank_properties = ['alu', 'local', 'testhttp','directhttp'] + # Copy bank configuration (those banks use external resources, so there is no tuning to do) + self.bank_properties = ['alu', 'testhttp', 'directhttp', 'multi'] curdir = os.path.dirname(os.path.realpath(__file__)) for b in self.bank_properties: from_file = os.path.join(curdir, b+'.properties') to_file = os.path.join(self.conf_dir, b+'.properties') shutil.copyfile(from_file, to_file) + # Copy bank process self.bank_process = ['test.sh'] curdir = os.path.dirname(os.path.realpath(__file__)) procdir = os.path.join(curdir, 'bank/process') for proc in self.bank_process: from_file = os.path.join(procdir, proc) to_file = os.path.join(self.process_dir, proc) shutil.copyfile(from_file, to_file) os.chmod(to_file, stat.S_IRWXU) - # Manage local bank test, use bank test subdir as remote - properties = ['multi.properties', 'computederror.properties', 'error.properties', 'local.properties', 'localprocess.properties', 'testhttp.properties', 'computed.properties', 'computed2.properties', 'sub1.properties', 'sub2.properties'] + # Copy and adapt bank configurations that use local resources: we use the "bank" dir in the current test directory as remote + properties = ['local', 'localprocess', 'computed', 'computed2', 'sub1', 'sub2', 'computederror', 'error'] for prop in properties: - from_file = os.path.join(curdir, prop) - to_file = os.path.join(self.conf_dir, prop) + from_file = os.path.join(curdir, prop+'.properties') + to_file = os.path.join(self.conf_dir, prop+'.properties') fout = open(to_file,'w') with open(from_file,'r') as fin: for line in fin: @@ -111,6 +127,7 @@ curdir = os.path.dirname(os.path.realpath(__file__)) global_template = os.path.join(curdir,'global.properties') fout = open(self.global_properties,'w') + # Adapt directories in global configuration to the current test directory with open(global_template,'r') as fin: for line in fin: if line.startswith('conf.dir'): @@ -128,44 +145,79 @@ fout.close() -class TestBiomajUtils(unittest.TestCase): +class UtilsForLocalIRODSTest(UtilsForTest): + """ + This class is used to prepare 'local_irods' tests. + """ + SERVER = "localhost" + PORT = 1247 + ZONE = "tempZone" + USER = "rods" + PASSWORD = "rods" + COLLECTION = os.path.join("/" + ZONE, "home/rods/") # Don't remove or add / - def setUp(self): - self.utils = UtilsForTest() + def __init__(self): + super(UtilsForLocalIRODSTest, self).__init__() + self._session = iRODSSession(host=self.SERVER, port=self.PORT, + user=self.USER, password=self.PASSWORD, + zone=self.ZONE) + self.curdir = os.path.dirname(os.path.realpath(__file__)) + # Copy some valid archives (bank/test.fasta.gz) + file_ = os.path.join(self.curdir, "bank/test.fasta.gz") + self._session.data_objects.put(file_, self.COLLECTION) + # Copy invalid.gz + self._session.data_objects.put(self.invalid_archive, self.COLLECTION) + + def clean(self): + super(UtilsForLocalIRODSTest, self).clean() + # Remove files on iRODS (use force, otherwise the files are put in the trash) + # Remove test.fasta.gz + self._session.data_objects.unlink(os.path.join(self.COLLECTION, "test.fasta.gz"), force=True) + # Remove invalid.gz + self._session.data_objects.unlink(os.path.join(self.COLLECTION, "invalid.gz"), force=True) - def tearDown(self): - self.utils.clean() +class TestDownloadInterface(unittest.TestCase): + """ + Test of the interface.
+ """ + + def test_retry_parsing(self): + """ + Test parsing of stop and wait conditions. + """ + downloader = DownloadInterface() + # Test some garbage + d = dict(stop_condition="stop_after_attempts") # no param + self.assertRaises(ValueError, downloader.set_options, d) + d = dict(stop_condition="1 & 1") # not a stop_condition + self.assertRaises(ValueError, downloader.set_options, d) + d = dict(stop_condition="stop_after_attempts(5) & 1") # not a stop_condition + self.assertRaises(ValueError, downloader.set_options, d) + # Test some garbage + d = dict(wait_policy="wait_random") # no param + self.assertRaises(ValueError, downloader.set_options, d) + d = dict(wait_policy="I love python") # not a wait_condition + self.assertRaises(ValueError, downloader.set_options, d) + d = dict(wait_policy="wait_random(5) + 3") # not a wait_condition + self.assertRaises(ValueError, downloader.set_options, d) + # Test operators + d = dict(stop_condition="stop_never | stop_after_attempt(5)", + wait_policy="wait_none + wait_random(1, 2)") + downloader.set_options(d) + # Test wait_combine, wait_chain + d = dict(wait_policy="wait_combine(wait_fixed(3), wait_random(1, 2))") + downloader.set_options(d) + d = dict(wait_policy="wait_chain(wait_fixed(3), wait_random(1, 2))") + downloader.set_options(d) + # Test stop_any and stop_all + stop_condition = "stop_any(stop_after_attempt(5), stop_after_delay(10))" + d = dict(stop_condition=stop_condition) + downloader.set_options(d) + stop_condition = "stop_all(stop_after_attempt(5), stop_after_delay(10))" + d = dict(stop_condition=stop_condition) + downloader.set_options(d) - def test_mimes(self): - fasta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),'bank/test2.fasta') - (mime, encoding) = Utils.detect_format(fasta_file) - self.assertTrue('application/fasta' == mime) - - @attr('compress') - def test_uncompress(self): - from_file = { 'root': os.path.dirname(os.path.realpath(__file__)), - 'name': 'bank/test.fasta.gz' - } - - to_dir = self.utils.data_dir - Utils.copy_files([from_file], to_dir) - Utils.uncompress(os.path.join(to_dir, from_file['name'])) - self.assertTrue(os.path.exists(to_dir+'/bank/test.fasta')) - - def test_copy_with_regexp(self): - from_dir = os.path.dirname(os.path.realpath(__file__)) - to_dir = self.utils.data_dir - Utils.copy_files_with_regexp(from_dir, to_dir, ['.*\.py']) - self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py')) - - def test_copy(self): - from_dir = os.path.dirname(os.path.realpath(__file__)) - local_file = 'biomaj_tests.py' - files_to_copy = [ {'root': from_dir, 'name': local_file}] - to_dir = self.utils.data_dir - Utils.copy_files(files_to_copy, to_dir) - self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py')) class TestBiomajLocalDownload(unittest.TestCase): """ @@ -190,6 +242,16 @@ locald.close() self.assertTrue(len(file_list) > 1) + def test_local_list_error(self): + locald = LocalDownload("/tmp/foo/") + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = locald.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + locald.close() + def test_local_download(self): locald = LocalDownload(self.examples) (file_list, dir_list) = locald.list() @@ -230,7 +292,8 @@ except Exception: msg = "In %s: copy worked but hardlinks were not used." 
% self.id() logging.info(msg) - + + @attr('network') @attr('http') class TestBiomajHTTPDownload(unittest.TestCase): @@ -240,14 +303,16 @@ def setUp(self): self.utils = UtilsForTest() BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False) + # Create an HTTPParse object used for most tests from the config file testhttp self.config = BiomajConfig('testhttp') - self.http_parse = HTTPParse(self.config.get('http.parse.dir.line'), + self.http_parse = HTTPParse( + self.config.get('http.parse.dir.line'), self.config.get('http.parse.file.line'), int(self.config.get('http.group.dir.name')), int(self.config.get('http.group.dir.date')), int(self.config.get('http.group.file.name')), int(self.config.get('http.group.file.date')), - self.config.get('http.group.file.date_format', None), + self.config.get('http.group.file.date_format'), int(self.config.get('http.group.file.size')) ) @@ -260,26 +325,37 @@ httpd.close() self.assertTrue(len(file_list) == 1) + def test_http_list_error(self): + """ + Test that errors in list are correctly caught. + """ + # Test access to non-existent directory + httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/foo/', self.http_parse) + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = httpd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + def test_http_list_dateregexp(self): - #self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M" - self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M" httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse) (file_list, dir_list) = httpd.list() httpd.close() self.assertTrue(len(file_list) == 1) def test_http_download_no_size(self): - self.http_parse = HTTPParse(self.config.get('http.parse.dir.line'), + # Create a custom http_parse without size + http_parse = HTTPParse(self.config.get('http.parse.dir.line'), self.config.get('http.parse.file.line'), int(self.config.get('http.group.dir.name')), int(self.config.get('http.group.dir.date')), int(self.config.get('http.group.file.name')), int(self.config.get('http.group.file.date')), - self.config.get('http.group.file.date_format', None), + self.config.get('http.group.file.date_format'), -1 ) - self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M" - httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse) + httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', http_parse) (file_list, dir_list) = httpd.list() httpd.match([r'^README$'], file_list, dir_list) httpd.download(self.utils.data_dir) @@ -287,16 +363,17 @@ self.assertTrue(len(httpd.files_to_download) == 1) def test_http_download_no_date(self): - self.http_parse = HTTPParse(self.config.get('http.parse.dir.line'), + # Create a custom http_parse without date + http_parse = HTTPParse(self.config.get('http.parse.dir.line'), self.config.get('http.parse.file.line'), int(self.config.get('http.group.dir.name')), int(self.config.get('http.group.dir.date')), int(self.config.get('http.group.file.name')), -1, - self.config.get('http.group.file.date_format', None), + None, int(self.config.get('http.group.file.size')) ) - httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse) + httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', http_parse) (file_list, dir_list) = httpd.list() 
httpd.match([r'^README$'], file_list, dir_list) httpd.download(self.utils.data_dir) @@ -304,7 +381,6 @@ self.assertTrue(len(httpd.files_to_download) == 1) def test_http_download(self): - self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M" httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/dists/', self.http_parse) (file_list, dir_list) = httpd.list() print(str(file_list)) @@ -314,7 +390,6 @@ self.assertTrue(len(httpd.files_to_download) == 1) def test_http_download_in_subdir(self): - self.http_parse.file_date_format = "%%Y-%%m-%%d %%H:%%M" httpd = CurlDownload('http', 'ftp2.fr.debian.org', '/debian/', self.http_parse) (file_list, dir_list) = httpd.list() httpd.match([r'^dists/README$'], file_list, dir_list) @@ -322,6 +397,41 @@ httpd.close() self.assertTrue(len(httpd.files_to_download) == 1) + def test_redirection(self): + """ + Test HTTP redirections + """ + # The site used in this test redirects to https (see #33). + http_parse = HTTPParse( + "([\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])", "([\w\-\.]+.tar.gz)<\/a>[\s]+([0-9]{2}-[A-Za-z]{3}-[0-9]{4}[\s][0-9]{2}:[0-9]{2})[\s]+([0-9]+[A-Za-z])", 1, 2, 1, 2, - None, + "%%d-%%b-%%Y %%H:%%M", 3 ) - self.http_parse.file_date_format = "%%d-%%b-%%Y %%H:%%M" - httpd = CurlDownload('https', 'mirrors.edge.kernel.org', '/pub/software/scm/git/debian/', self.http_parse) + httpd = CurlDownload('https', 'mirrors.edge.kernel.org', '/pub/software/scm/git/debian/', http_parse) (file_list, dir_list) = httpd.list() httpd.match([r'^git-core-0.99.6.tar.gz$'], file_list, dir_list) httpd.download(self.utils.data_dir) @@ -364,17 +472,48 @@ Test SFTP downloader """ - PROTOCOL = "ftps" - + PROTOCOL = "sftp" + def setUp(self): self.utils = UtilsForTest() + # Temporary host key file in test dir (so this is cleaned) + (_, self.khfile) = tempfile.mkstemp(dir=self.utils.test_dir) def tearDown(self): self.utils.clean() + def test_list_error(self): + """ + Test that errors in list are correctly caught.
+ """ + # Test access to non-existent directory + file_list = ['/toto/debian/doc/mailing-lists.txt'] + ftpd = DirectFTPDownload('ftp', 'ftp.fr.debian.org', '') + ftpd.set_files_to_download(file_list) + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = ftpd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + ftpd.close() + def test_download(self): file_list = ['/debian/doc/mailing-lists.txt'] ftpd = DirectFTPDownload('ftp', 'ftp.fr.debian.org', '') @@ -413,14 +568,13 @@ self.assertTrue(os.path.exists(os.path.join(self.utils.data_dir,'mailing-lists.txt'))) - @attr('directftps') @attr('network') class TestBiomajDirectFTPSDownload(unittest.TestCase): """ Test DirectFTP downloader with FTPS. """ - + def setUp(self): self.utils = UtilsForTest() @@ -473,6 +627,22 @@ self.assertTrue(file_list[0]['size']!=0) self.assertFalse(fyear == ftpd.files_to_download[0]['year'] and fmonth == ftpd.files_to_download[0]['month'] and fday == ftpd.files_to_download[0]['day']) + def test_http_list_error(self): + """ + Test that errors in list are correctly caught. + """ + # Test access to non-existent directory + file_list = ['/toto/debian/README.html'] + ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '') + ftpd.set_files_to_download(file_list) + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = ftpd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + ftpd.close() + def test_download(self): file_list = ['/debian/README.html'] ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '') @@ -497,7 +667,6 @@ my_json = json.loads(content) self.assertTrue(my_json['args']['key1'] == 'value1') - @attr('test') def test_download_save_as(self): file_list = ['/debian/README.html'] ftpd = DirectHTTPDownload('http', 'ftp2.fr.debian.org', '') @@ -524,6 +693,32 @@ content = content_file.read() my_json = json.loads(content) self.assertTrue(my_json['form']['key1'] == 'value1') + + def test_redirection(self): + """ + Test HTTP redirections + """ + # The site used in this test redirects to https (see #33). + # First test: allow redirections + httpd = DirectHTTPDownload('http', 'plasmodb.org', '/common/downloads/Current_Release/') + httpd.set_files_to_download(['Build_number']) + httpd.download(self.utils.data_dir) + # Check that we have been redirected to HTTPS by inspecting logs + with self.assertLogs(logger="biomaj", level="INFO") as cm: + httpd.download(self.utils.data_dir) + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Download was redirected to https://") + httpd.close() + self.assertTrue(len(httpd.files_to_download) == 1) + # Second test: block redirections + httpd = DirectHTTPDownload('http', 'plasmodb.org', '/common/downloads/Current_Release/') + httpd.set_files_to_download(['Build_number']) + httpd.set_options({ + "allow_redirections": False + }) + with self.assertRaises(Exception): + httpd.download(self.utils.data_dir) + httpd.close() @attr('ftp') @@ -545,7 +740,31 @@ ftpd.close() self.assertTrue(len(file_list) > 1) - @attr('test') + def test_ftp_list_error(self): + """ + Test that errors in list are correctly caught. 
+ """ + # Test access to non-existent directory + ftpd = CurlDownload("ftp", "test.rebex.net", "/toto") + ftpd.set_credentials("demo:password") + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = ftpd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + ftpd.close() + # Test with wrong password + ftpd = CurlDownload("ftp", "test.rebex.net", "/") + ftpd.set_credentials("demo:badpassword") + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = ftpd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + ftpd.close() + def test_download(self): ftpd = CurlDownload('ftp', 'speedtest.tele2.net', '/') (file_list, dir_list) = ftpd.list() @@ -604,6 +823,21 @@ self.assertTrue(len(ftpd.files_to_download)==2) self.assertTrue(len(ftpd.files_to_copy)==2) + @attr('test') + def test_download_or_copy_directhttp(self): + ftpd = DirectHTTPDownload('https', 'ftp.fr.debian.org', '/debian/') + ftpd.files_to_download = [ + {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}, + ] + available_files = [ + {'name':'/test1', 'year': '2020', 'month': '11', 'day': '10', 'size': 10}, + # {"root": "/", "permissions": "", "group": "", "user": "", "size": 23723408, "month": 6, "day": 19, "year": 2018, "name": "/common/downloads/release-38/Pfalciparum3D7/fasta/data/PlasmoDB-38_Pfalciparum3D7_Genome.fasta", "hash": "e58669a71eacff7a9dcceed04a8ecdd1", "save_as": "PlasmoDB-38_Pfalciparum3D7_Genome.fasta", "url": "https://plasmodb.org"} + ] + ftpd.download_or_copy(available_files, '/biomaj', False) + ftpd.close() + self.assertTrue(len(ftpd.files_to_download)==1) + self.assertTrue(len(ftpd.files_to_copy)==0) + def test_get_more_recent_file(self): files = [ {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10}, @@ -616,8 +850,55 @@ self.assertTrue(release['month']=='11') self.assertTrue(release['day']=='12') + def test_download_retry(self): + """ + Try to download fake files to test retry. 
+ """ + n_attempts = 5 + ftpd = CurlDownload("ftp", "speedtest.tele2.net", "/") + # Download a fake file + ftpd.set_files_to_download([ + {'name': 'TOTO.zip', 'year': '2016', 'month': '02', 'day': '19', + 'size': 1, 'save_as': 'TOTO1KB'} + ]) + ftpd.set_options(dict(stop_condition=tenacity.stop.stop_after_attempt(n_attempts), + wait_policy=tenacity.wait.wait_none())) + self.assertRaisesRegex( + Exception, "^CurlDownload:Download:Error:", + ftpd.download, self.utils.data_dir, + ) + logging.debug(ftpd.retryer.statistics) + self.assertTrue(len(ftpd.files_to_download) == 1) + self.assertTrue(ftpd.retryer.statistics["attempt_number"] == n_attempts) + # Try to download another file to ensure that it retries + ftpd.set_files_to_download([ + {'name': 'TITI.zip', 'year': '2016', 'month': '02', 'day': '19', + 'size': 1, 'save_as': 'TOTO1KB'} + ]) + self.assertRaisesRegex( + Exception, "^CurlDownload:Download:Error:", + ftpd.download, self.utils.data_dir, + ) + self.assertTrue(len(ftpd.files_to_download) == 1) + self.assertTrue(ftpd.retryer.statistics["attempt_number"] == n_attempts) + ftpd.close() + def test_ms_server(self): + ftpd = CurlDownload("ftp", "test.rebex.net", "/") + ftpd.set_credentials("demo:password") + (file_list, dir_list) = ftpd.list() + ftpd.match(["^readme.txt$"], file_list, dir_list) + ftpd.download(self.utils.data_dir) + ftpd.close() + self.assertTrue(len(ftpd.files_to_download) == 1) + + def test_download_tcp_keepalive(self): + """ + Test setting tcp_keepalive (it probably doesn't change anything here but + we test that there is no obvious mistake in the code). + """ ftpd = CurlDownload("ftp", "test.rebex.net", "/") + ftpd.set_options(dict(tcp_keepalive=10)) ftpd.set_credentials("demo:password") (file_list, dir_list) = ftpd.list() ftpd.match(["^readme.txt$"], file_list, dir_list) @@ -625,13 +906,13 @@ ftpd.close() self.assertTrue(len(ftpd.files_to_download) == 1) - def test_download_tcp_keepalive(self): + def test_download_ftp_method(self): """ - Test setting tcp_keepalive (it probably doesn't change anything here but - we test that there is no obvious mistake in the code). + Test setting ftp_method (it probably doesn't change anything here but we + test that there is no obvious mistake in the code). """ ftpd = CurlDownload("ftp", "test.rebex.net", "/") - ftpd.set_options(dict(tcp_keepalive=10)) + ftpd.set_options(dict(ftp_method="nocwd")) ftpd.set_credentials("demo:password") (file_list, dir_list) = ftpd.list() ftpd.match(["^readme.txt$"], file_list, dir_list) @@ -661,6 +942,31 @@ ftpd.close() self.assertTrue(len(file_list) == 1) + def test_ftps_list_error(self): + """ + Test that errors in list are correctly caught.
+ """ + # Test access to non-existent directory + ftpd = CurlDownload("ftps", "test.rebex.net", "/toto") + ftpd.set_credentials("demo:password") + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = ftpd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + ftpd.close() + # Test with wrong password + ftpd = CurlDownload("ftps", "test.rebex.net", "/") + ftpd.set_credentials("demo:badpassword") + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = ftpd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + ftpd.close() + def test_download(self): ftpd = CurlDownload(self.PROTOCOL, "test.rebex.net", "/") ftpd.set_credentials("demo:password") @@ -674,7 +980,7 @@ # This server is misconfigured hence we disable all SSL verification SERVER = "demo.wftpserver.com" DIRECTORY = "/download/" - CREDENTIALS = "demo-user:demo-user" + CREDENTIALS = "demo:demo" ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY) ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False")) ftpd.set_credentials(CREDENTIALS) @@ -686,7 +992,7 @@ # This server is misconfigured hence we disable all SSL verification SERVER = "demo.wftpserver.com" DIRECTORY = "/download/" - CREDENTIALS = "demo-user:demo-user" + CREDENTIALS = "demo:demo" ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY) ftpd.set_options(dict(ssl_verifyhost="False", ssl_verifypeer="False")) ftpd.set_credentials(CREDENTIALS) @@ -696,12 +1002,12 @@ ftpd.close() self.assertTrue(len(ftpd.files_to_download) == 1) - def test_download_ssl_certficate(self): + def test_download_ssl_certificate(self): # This server is misconfigured but we use its certificate # The hostname is wrong so we disable host verification SERVER = "demo.wftpserver.com" DIRECTORY = "/download/" - CREDENTIALS = "demo-user:demo-user" + CREDENTIALS = "demo:demo" ftpd = CurlDownload(self.PROTOCOL, SERVER, DIRECTORY) curdir = os.path.dirname(os.path.realpath(__file__)) cert_file = os.path.join(curdir, "caert.demo.wftpserver.com.pem") @@ -735,6 +1041,16 @@ (files_list, dir_list) = rsyncd.list() self.assertTrue(len(files_list) != 0) + def test_rsync_list_error(self): + # Access a non-existent directory + rsyncd = RSYNCDownload("/tmp/foo/", "") + # Check that we raise an exception and log a message + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = rsyncd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + def test_rsync_match(self): rsyncd = RSYNCDownload(self.examples, "") (files_list, dir_list) = rsyncd.list() @@ -772,6 +1088,51 @@ rsyncd.download(self.utils.data_dir) self.assertTrue(len(rsyncd.files_to_download) == 3) + def test_rsync_download_skip_check_uncompress(self): + """ + Download the fake archive file with RSYNC but skip check. 
+ """ + rsyncd = RSYNCDownload(self.utils.test_dir + '/', "") + rsyncd.set_options(dict(skip_check_uncompress=True)) + (file_list, dir_list) = rsyncd.list() + rsyncd.match([r'invalid.gz'], file_list, dir_list, prefix='') + rsyncd.download(self.utils.data_dir) + self.assertTrue(len(rsyncd.files_to_download) == 1) + + def test_rsync_download_retry(self): + """ + Try to download fake files to test retry. + """ + n_attempts = 5 + rsyncd = RSYNCDownload(self.utils.test_dir + '/', "") + rsyncd.set_options(dict(skip_check_uncompress=True)) + # Download a fake file + rsyncd.set_files_to_download([ + {'name': 'TOTO.zip', 'year': '2016', 'month': '02', 'day': '19', + 'size': 1, 'save_as': 'TOTO1KB'} + ]) + rsyncd.set_options(dict(stop_condition=tenacity.stop.stop_after_attempt(n_attempts), + wait_condition=tenacity.wait.wait_none())) + self.assertRaisesRegex( + Exception, "^RSYNCDownload:Download:Error:", + rsyncd.download, self.utils.data_dir, + ) + logging.debug(rsyncd.retryer.statistics) + self.assertTrue(len(rsyncd.files_to_download) == 1) + self.assertTrue(rsyncd.retryer.statistics["attempt_number"] == n_attempts) + # Try to download another file to ensure that it retryies + rsyncd.set_files_to_download([ + {'name': 'TITI.zip', 'year': '2016', 'month': '02', 'day': '19', + 'size': 1, 'save_as': 'TOTO1KB'} + ]) + self.assertRaisesRegex( + Exception, "^RSYNCDownload:Download:Error:", + rsyncd.download, self.utils.data_dir, + ) + self.assertTrue(len(rsyncd.files_to_download) == 1) + self.assertTrue(rsyncd.retryer.statistics["attempt_number"] == n_attempts) + rsyncd.close() + class iRodsResult(object): @@ -844,6 +1205,7 @@ my_test_file = open("tests/test.fasta.gz", "r+") return(my_test_file) + @attr('irods') @attr('roscoZone') @attr('network') @@ -872,18 +1234,107 @@ (files_list, dir_list) = irodsd.list() self.assertTrue(len(files_list) != 0) - @attr('local_irods') + +@attr('local_irods') +@attr('network') +class TestBiomajLocalIRODSDownload(unittest.TestCase): + """ + Test with a local iRODS server. + """ + + def setUp(self): + self.utils = UtilsForLocalIRODSTest() + self.curdir = os.path.dirname(os.path.realpath(__file__)) + self.examples = os.path.join(self.curdir,'bank') + '/' + BiomajConfig.load_config(self.utils.global_properties, allow_user_config=False) + + def tearDown(self): + self.utils.clean() + def test_irods_download(self): - # To run this test, you need an iRODS server on localhost (default - # port, user 'rods', password 'rods'), and populate a zone - # /tempZone/home/rods with a file that matches r'^test.*\.gz$' (for - # instance, by copying tests/bank/test/test.fasta.gz). - irodsd = IRODSDownload("localhost", "/tempZone/home/rods") + irodsd = IRODSDownload(self.utils.SERVER, self.utils.COLLECTION) irodsd.set_param(dict( - user='rods', - password='rods', + user=self.utils.USER, + password=self.utils.PASSWORD, )) (file_list, dir_list) = irodsd.list() irodsd.match([r'^test.*\.gz$'], file_list, dir_list, prefix='') irodsd.download(self.utils.data_dir) self.assertTrue(len(irodsd.files_to_download) == 1) + + def test_irods_download_skip_check_uncompress(self): + """ + Download the fake archive file with iRODS but skip check. 
+ """ + irodsd = IRODSDownload(self.utils.SERVER, self.utils.COLLECTION) + irodsd.set_options(dict(skip_check_uncompress=True)) + irodsd.set_param(dict( + user=self.utils.USER, + password=self.utils.PASSWORD, + )) + (file_list, dir_list) = irodsd.list() + irodsd.match([r'invalid.gz$'], file_list, dir_list, prefix='') + irodsd.download(self.utils.data_dir) + self.assertTrue(len(irodsd.files_to_download) == 1) + + def test_irods_download_retry(self): + """ + Try to download fake files to test retry. + """ + n_attempts = 5 + irodsd = IRODSDownload(self.utils.SERVER, self.utils.COLLECTION) + irodsd.set_options(dict(skip_check_uncompress=True)) + irodsd.set_param(dict( + user=self.utils.USER, + password=self.utils.PASSWORD, + )) + # Download a fake file + irodsd.set_files_to_download([ + {'name': 'TOTO.zip', 'year': '2016', 'month': '02', 'day': '19', + 'size': 1, 'save_as': 'TOTO1KB'} + ]) + irodsd.set_options(dict(stop_condition=tenacity.stop.stop_after_attempt(n_attempts), + wait_condition=tenacity.wait.wait_none())) + self.assertRaisesRegex( + Exception, "^IRODSDownload:Download:Error:", + irodsd.download, self.utils.data_dir, + ) + logging.debug(irodsd.retryer.statistics) + self.assertTrue(len(irodsd.files_to_download) == 1) + self.assertTrue(irodsd.retryer.statistics["attempt_number"] == n_attempts) + # Try to download another file to ensure that it retryies + irodsd.set_files_to_download([ + {'name': 'TITI.zip', 'year': '2016', 'month': '02', 'day': '19', + 'size': 1, 'save_as': 'TOTO1KB'} + ]) + self.assertRaisesRegex( + Exception, "^IRODSDownload:Download:Error:", + irodsd.download, self.utils.data_dir, + ) + self.assertTrue(len(irodsd.files_to_download) == 1) + self.assertTrue(irodsd.retryer.statistics["attempt_number"] == n_attempts) + irodsd.close() + + def test_irods_list_error(self): + # Non-existing collection + irodsd = IRODSDownload(self.utils.SERVER, "fake_collection") + irodsd.set_param(dict( + user=self.utils.USER, + password=self.utils.PASSWORD, + )) + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = irodsd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") + # Test with wrong password + irodsd = IRODSDownload(self.utils.SERVER, self.utils.COLLECTION) + irodsd.set_param(dict( + user=self.utils.USER, + password="badpassword", + )) + with self.assertLogs(logger="biomaj", level="ERROR") as cm: + with self.assertRaises(Exception): + (file_list, dir_list) = irodsd.list() + # Test log message format (we assume that there is only 1 message) + self.assertRegex(cm.output[0], "Error while listing") diff -Nru biomaj3-download-3.1.0/tests/testhttp.properties biomaj3-download-3.2.4/tests/testhttp.properties --- biomaj3-download-3.1.0/tests/testhttp.properties 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/tests/testhttp.properties 2020-12-23 05:49:33.000000000 +0000 @@ -38,7 +38,7 @@ http.parse.dir.line=.*([\d]{4}-[\w\d]{2,5}-[\d]{2}\s[\d]{2}:[\d]{2}) http.parse.file.line=<\/td>([\d\.]+[MKG]{0,1}) -http.group.file.date_format="%%Y-%%m-%%d %%H:%%M" +http.group.file.date_format=%%Y-%%m-%%d %%H:%%M ### Deployment ### keep.old.version=1 diff -Nru biomaj3-download-3.1.0/.travis.yml biomaj3-download-3.2.4/.travis.yml --- biomaj3-download-3.1.0/.travis.yml 2019-10-21 06:47:18.000000000 +0000 +++ biomaj3-download-3.2.4/.travis.yml 2020-12-23 05:49:33.000000000 +0000 @@ -1,10 +1,12 @@ +arch: +- amd64 +- ppc64le language: python 
sudo: false python: -- '2.7' -- '3.4' -- '3.5' - '3.6' +- '3.7' +- '3.8' services: - redis branches: