diff -Nru fsspec-0.6.1/.coveragerc fsspec-0.8.4/.coveragerc --- fsspec-0.6.1/.coveragerc 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/.coveragerc 2020-10-14 16:51:19.000000000 +0000 @@ -2,6 +2,9 @@ omit = */test_*.py fsspec/_version.py + fsspec/implementations/dvc.py + fsspec/implementations/github.py + fsspec/implementations/hdfs.py source = fsspec diff -Nru fsspec-0.6.1/debian/changelog fsspec-0.8.4/debian/changelog --- fsspec-0.6.1/debian/changelog 2019-12-04 12:48:56.000000000 +0000 +++ fsspec-0.8.4/debian/changelog 2020-10-16 16:41:33.000000000 +0000 @@ -1,3 +1,20 @@ +fsspec (0.8.4-1) unstable; urgency=medium + + [Emmanuel Arias] + * New upstream release. + * d/control: Bump debhelper-compat to 13. + * d/salsa-ci.yml: enable salsa-ci. + * d/rules: fix PYBUILD_NAME. + + [Debian Janitor ] + * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, + Repository-Browse. + * Update standards version to 4.5.0, no changes needed. + * Apply multi-arch hints. + + python-fsspec-doc: Add Multi-Arch: foreign. + + -- Emmanuel Arias Fri, 16 Oct 2020 13:41:33 -0300 + fsspec (0.6.1-0.1) unstable; urgency=medium * Non-maintainer upload. diff -Nru fsspec-0.6.1/debian/control fsspec-0.8.4/debian/control --- fsspec-0.6.1/debian/control 2019-11-18 23:04:45.000000000 +0000 +++ fsspec-0.8.4/debian/control 2020-10-15 19:32:14.000000000 +0000 @@ -1,19 +1,19 @@ Source: fsspec Section: python Priority: optional -Maintainer: Debian Python Modules Team +Maintainer: Debian Python Team Uploaders: Emmanuel Arias -Build-Depends: debhelper-compat (= 12), +Build-Depends: debhelper-compat (= 13), dh-python, python3-setuptools, python3-all, python3-numpydoc, python3-certifi, python3-sphinx, -Standards-Version: 4.4.1 +Standards-Version: 4.5.0 Homepage: https://github.com/intake/filesystem_spec -Vcs-Browser: https://salsa.debian.org/python-team/modules/python-fsspec -Vcs-Git: https://salsa.debian.org/python-team/modules/python-fsspec.git +Vcs-Browser: https://salsa.debian.org/python-team/packages/python-fsspec +Vcs-Git: https://salsa.debian.org/python-team/packages/python-fsspec.git Testsuite: autopkgtest-pkg-python Package: python3-fsspec @@ -30,6 +30,7 @@ Architecture: all Section: doc Depends: ${sphinxdoc:Depends}, ${misc:Depends} +Multi-Arch: foreign Description: specification that Python filesystems should adhere to (documentation) The package produce a template or specification for a file-system interface, that specific implementations should follow, so that applications making use diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/caching.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/caching.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/caching.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/caching.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,379 +0,0 @@ -import os -import io -import functools -import logging -import math - -logger = logging.getLogger("fsspec") - - -class BaseCache(object): - """Pass-though cache: doesn't keep anything, calls every time - - Acts as base class for other cachers - - Parameters - ---------- - blocksize: int - How far to read ahead in numbers of bytes - fetcher: func - Function of the form f(start, end) which gets bytes from remote as - specified - size: int - How big this file is - """ - - def __init__(self, blocksize, fetcher, size): - self.blocksize = blocksize - 
self.fetcher = fetcher - self.size = size - - def _fetch(self, start, end): - return self.fetcher(start, end) - - def __getitem__(self, item: slice): - if not isinstance(item, slice): - raise TypeError( - "Cache indices must be a contiguous slice. Got {} instead.".format( - type(item) - ) - ) - if item.step and item.step != 1: - raise ValueError( - "Cache indices must be a contiguous slice. 'item' has step={}".format( - item.step - ) - ) - - # handle endpoints - if item.start is None: - item = slice(0, item.stop) - elif item.start < 0: - item = slice(self.size + item.start, item.stop) - if item.stop is None: - item = slice(item.start, self.size) - elif item.stop < 0: - item = slice(item.start, self.size + item.stop) - - return self._fetch(item.start, item.stop) - - -class MMapCache(BaseCache): - """memory-mapped sparse file cache - - Opens temporary file, which is filled blocks-wise when data is requested. - Ensure there is enough disc space in the temporary location. - - This cache method might only work on posix - """ - - def __init__(self, blocksize, fetcher, size, location=None, blocks=None): - super().__init__(blocksize, fetcher, size) - self.blocks = set() if blocks is None else blocks - self.location = location - self.cache = self._makefile() - - def _makefile(self): - import tempfile - import mmap - - if self.size == 0: - return bytearray() - - # posix version - if self.location is None or not os.path.exists(self.location): - if self.location is None: - fd = tempfile.TemporaryFile() - self.blocks = set() - else: - fd = io.open(self.location, "wb+") - fd.seek(self.size - 1) - fd.write(b"1") - fd.flush() - else: - fd = io.open(self.location, "rb+") - - return mmap.mmap(fd.fileno(), self.size) - - def _fetch(self, start, end): - start_block = start // self.blocksize - end_block = end // self.blocksize - need = [i for i in range(start_block, end_block + 1) if i not in self.blocks] - while need: - # TODO: not a for loop so we can consolidate blocks later to - # make fewer fetch calls; this could be parallel - i = need.pop(0) - sstart = i * self.blocksize - send = min(sstart + self.blocksize, self.size) - self.cache[sstart:send] = self.fetcher(sstart, send) - self.blocks.add(i) - - return self.cache[start:end] - - def __getstate__(self): - state = self.__dict__.copy() - # Remove the unpicklable entries. - del state["cache"] - return state - - def __setstate__(self, state): - # Restore instance attributes - self.__dict__.update(state) - self.cache = self._makefile() - - -class ReadAheadCache(BaseCache): - """ Cache which reads only when we get beyond a block of data - - This is a much simpler version of BytesCache, and does not attempt to - fill holes in the cache or keep fragments alive. It is best suited to - many small reads in a sequential order (e.g., reading lines from a file). 
- """ - - def __init__(self, blocksize, fetcher, size): - super().__init__(blocksize, fetcher, size) - self.cache = b"" - self.start = 0 - self.end = 0 - - def _fetch(self, start, end): - end = min(self.size, end) - l = end - start - if start >= self.size: - return b"" - elif start >= self.start and end <= self.end: - # cache hit - return self.cache[start - self.start : end - self.start] - elif self.start <= start < self.end: - # partial hit - part = self.cache[start - self.start :] - l -= len(part) - start = self.end - else: - # miss - part = b"" - end = min(self.size, end + self.blocksize) - self.cache = self.fetcher(start, end) # new block replaces old - self.start = start - self.end = self.start + len(self.cache) - return part + self.cache[:l] - - -class BlockCache(BaseCache): - """ - Cache holding memory as a set of blocks. - - Requests are only ever made `blocksize` at a time, and are - stored in an LRU cache. The least recently accessed block is - discarded when more than `maxblocks` are stored. - - Parameters - ---------- - blocksize : int - The number of bytes to store in each block. - Requests are only ever made for `blocksize`, so this - should balance the overhead of making a request against - the granularity of the blocks. - fetcher : Callable - size : int - The total size of the file being cached. - maxblocks : int - The maximum number of blocks to cache for. The maximum memory - use for this cache is then ``blocksize * maxblocks``. - """ - - def __init__(self, blocksize, fetcher, size, maxblocks=32): - super().__init__(blocksize, fetcher, size) - self.nblocks = math.ceil(size / blocksize) - self.maxblocks = maxblocks - self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block) - - def __repr__(self): - return "".format( - self.blocksize, self.size, self.nblocks - ) - - def cache_info(self): - """ - The statistics on the block cache. - - Returns - ---------- - NamedTuple - Returned directly from the LRU Cache used internally. - """ - return self._fetch_block_cached.cache_info() - - def __getstate__(self): - state = self.__dict__ - del state["_fetch_block_cached"] - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self._fetch_block_cached = functools.lru_cache(state["maxblocks"])( - self._fetch_block - ) - - def _fetch(self, start, end): - if end < start: - raise ValueError( - "'end' ({}) is smaller than 'start' ({}).".format(end, start) - ) - - if end > self.size: - raise ValueError("'end={}' larger than size ('{}')".format(end, self.size)) - - # byte position -> block numbers - start_block_number = start // self.blocksize - end_block_number = end // self.blocksize - - # these are cached, so safe to do multiple calls for the same start and end. - for block_number in range(start_block_number, end_block_number + 1): - self._fetch_block(block_number) - - return self._read_cache( - start, - end, - start_block_number=start_block_number, - end_block_number=end_block_number, - ) - - def _fetch_block(self, block_number): - """ - Fetch the block of data for `block_number`. 
- """ - if block_number > self.nblocks: - raise ValueError( - "'block_number={}' is greater than the number of blocks ({})".format( - block_number, self.nblocks - ) - ) - - start = block_number * self.blocksize - end = start + self.blocksize - logger.info("BlockCache fetching block %d", block_number) - block_contents = super()._fetch(start, end) - return block_contents - - def _read_cache(self, start, end, start_block_number, end_block_number): - """ - Read from our block cache. - - Parameters - ---------- - start, end : int - The start and end byte positions. - start_block_number, end_block_number : int - The start and end block numbers. - """ - start_pos = start % self.blocksize - end_pos = end % self.blocksize - - if start_block_number == end_block_number: - block = self._fetch_block_cached(start_block_number) - return block[start_pos:end_pos] - - else: - # read from the initial - out = [] - out.append(self._fetch_block_cached(start_block_number)[start_pos:]) - - # intermediate blocks - # Note: it'd be nice to combine these into one big request. However - # that doesn't play nicely with our LRU cache. - for block_number in range(start_block_number + 1, end_block_number): - out.append(self._fetch_block_cached(block_number)) - - # final block - out.append(self._fetch_block_cached(end_block_number)[:end_pos]) - - return b"".join(out) - - -class BytesCache(BaseCache): - """Cache which holds data in a in-memory bytes object - - Implements read-ahead by the block size, for semi-random reads progressing - through the file. - - Parameters - ---------- - trim: bool - As we read more data, whether to discard the start of the buffer when - we are more than a blocksize ahead of it. - """ - - def __init__(self, blocksize, fetcher, size, trim=True): - super().__init__(blocksize, fetcher, size) - self.cache = b"" - self.start = None - self.end = None - self.trim = trim - - def _fetch(self, start, end): - # TODO: only set start/end after fetch, in case it fails? - # is this where retry logic might go? 
- if ( - self.start is not None - and start >= self.start - and self.end is not None - and end < self.end - ): - # cache hit: we have all the required data - offset = start - self.start - return self.cache[offset : offset + end - start] - - if self.blocksize: - bend = min(self.size, end + self.blocksize) - else: - bend = end - - if bend == start or start > self.size: - return b"" - - if (self.start is None or start < self.start) and ( - self.end is None or end > self.end - ): - # First read, or extending both before and after - self.cache = self.fetcher(start, bend) - self.start = start - elif start < self.start: - if self.end - end > self.blocksize: - self.cache = self.fetcher(start, bend) - self.start = start - else: - new = self.fetcher(start, self.start) - self.start = start - self.cache = new + self.cache - elif bend > self.end: - if self.end > self.size: - pass - elif end - self.end > self.blocksize: - self.cache = self.fetcher(start, bend) - self.start = start - else: - new = self.fetcher(self.end, bend) - self.cache = self.cache + new - - self.end = self.start + len(self.cache) - offset = start - self.start - out = self.cache[offset : offset + end - start] - if self.trim: - num = (self.end - self.start) // (self.blocksize + 1) - if num > 1: - self.start += self.blocksize * num - self.cache = self.cache[self.blocksize * num :] - return out - - def __len__(self): - return len(self.cache) - - -caches = { - "none": BaseCache, - "mmap": MMapCache, - "bytes": BytesCache, - "readahead": ReadAheadCache, - "block": BlockCache, -} diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/compression.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/compression.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/compression.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/compression.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,152 +0,0 @@ -"""Helper functions for a standard streaming compression API""" -from bz2 import BZ2File -from gzip import GzipFile -from zipfile import ZipFile - -import fsspec.utils -from fsspec.spec import AbstractBufferedFile - - -def noop_file(file, mode, **kwargs): - return file - - -# should be functions of the form func(infile, mode=, **kwargs) -> file-like -compr = {None: noop_file} - - -def register_compression(name, callback, extensions, force=False): - """Register an "inferable" file compression type. - - Registers transparent file compression type for use with fsspec.open. - Compression can be specified by name in open, or "infer"-ed for any files - ending with the given extensions. - - Args: - name: (str) The compression type name. Eg. "gzip". - callback: A callable of form (infile, mode, **kwargs) -> file-like. - Accepts an input file-like object, the target mode and kwargs. - Returns a wrapped file-like object. - extensions: (str, Iterable[str]) A file extension, or list of file - extensions for which to infer this compression scheme. Eg. "gz". - force: (bool) Force re-registration of compression type or extensions. - - Raises: - ValueError: If name or extensions already registered, and not force. 
- - """ - if isinstance(extensions, str): - extensions = [extensions] - - # Validate registration - if name in compr and not force: - raise ValueError("Duplicate compression registration: %s" % name) - - for ext in extensions: - if ext in fsspec.utils.compressions and not force: - raise ValueError( - "Duplicate compression file extension: %s (%s)" % (ext, name) - ) - - compr[name] = callback - - for ext in extensions: - fsspec.utils.compressions[ext] = name - - -def unzip(infile, mode="rb", filename=None, **kwargs): - if "r" not in mode: - filename = filename or "file" - z = ZipFile(infile, mode="w", **kwargs) - fo = z.open(filename, mode="w") - fo.close = lambda closer=fo.close: closer() or z.close() - return fo - z = ZipFile(infile) - if filename is None: - filename = z.namelist()[0] - return z.open(filename, mode="r", **kwargs) - - -register_compression("zip", unzip, "zip") -register_compression("bz2", BZ2File, "bz2") -register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz") - -try: - import lzma - - register_compression("lzma", lzma.LZMAFile, "xz") - register_compression("xz", lzma.LZMAFile, "xz", force=True) -except ImportError: - pass - -try: - import lzmaffi - - register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True) - register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) -except ImportError: - pass - - -class SnappyFile(AbstractBufferedFile): - def __init__(self, infile, mode, **kwargs): - import snappy - - self.details = {"size": 999999999} # not true, but OK if we don't seek - super().__init__(fs=None, path="snappy", mode=mode.strip("b") + "b", **kwargs) - self.infile = infile - if "r" in mode: - self.codec = snappy.StreamDecompressor() - else: - self.codec = snappy.StreamCompressor() - - def _upload_chunk(self, final=False): - self.buffer.seek(0) - out = self.codec.add_chunk(self.buffer.read()) - self.infile.write(out) - return True - - def seek(self, loc, whence=0): - raise NotImplementedError("SnappyFile is not seekable") - - def seekable(self): - return False - - def _fetch_range(self, start, end): - """Get the specified set of bytes from remote""" - data = self.infile.read(end - start) - return self.codec.decompress(data) - - -try: - import snappy - - snappy.compress - # Snappy may use the .sz file extension, but this is not part of the - # standard implementation. 
- register_compression("snappy", SnappyFile, []) - -except (ImportError, NameError): - pass - -try: - import lz4.frame - - register_compression("lz4", lz4.frame.open, "lz4") -except ImportError: - pass - -try: - import zstandard as zstd - - def zstandard_file(infile, mode="rb"): - if "r" in mode: - cctx = zstd.ZstdDecompressor() - return cctx.stream_reader(infile) - else: - cctx = zstd.ZstdCompressor(level=10) - return cctx.stream_writer(infile) - - register_compression("zstd", zstandard_file, "zst") -except ImportError: - pass diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/conftest.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/conftest.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/conftest.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/conftest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -import os -import shutil -import subprocess -import sys -import time - -import pytest - -import fsspec -from fsspec.implementations.cached import CachingFileSystem - - -@pytest.fixture() -def m(): - """ - Fixture providing a memory filesystem. - """ - m = fsspec.filesystem("memory") - m.store.clear() - try: - yield m - finally: - m.store.clear() - - -@pytest.fixture -def ftp_writable(tmpdir): - """ - Fixture providing a writable FTP filesystem. - """ - pytest.importorskip("pyftpdlib") - from fsspec.implementations.ftp import FTPFileSystem - - FTPFileSystem.clear_instance_cache() # remove lingering connections - CachingFileSystem.clear_instance_cache() - d = str(tmpdir) - with open(os.path.join(d, "out"), "wb") as f: - f.write(b"hello" * 10000) - P = subprocess.Popen( - [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] - ) - try: - time.sleep(1) - yield "localhost", 2121, "user", "pass" - finally: - P.terminate() - P.wait() - try: - shutil.rmtree(tmpdir) - except Exception: - pass diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/core.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/core.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/core.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/core.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,440 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import io -import os -import logging -from .compression import compr -from .utils import ( - infer_compression, - build_name_function, - update_storage_options, - stringify_path, -) -from .registry import get_filesystem_class - -# for backwards compat, we export cache things from here too -from .caching import ( # noqa: F401 - BaseCache, - MMapCache, - ReadAheadCache, - BytesCache, - BlockCache, - caches, -) - -logger = logging.getLogger("fsspec") - - -class OpenFile(object): - """ - File-like object to be used in a context - - Can layer (buffered) text-mode and compression over any file-system, which - are typically binary-only. - - These instances are safe to serialize, as the low-level file object - is not created until invoked using `with`. - - Parameters - ---------- - fs: FileSystem - The file system to use for opening the file. Should match the interface - of ``dask.bytes.local.LocalFileSystem``. 
- path: str - Location to open - mode: str like 'rb', optional - Mode of the opened file - compression: str or None, optional - Compression to apply - encoding: str or None, optional - The encoding to use if opened in text mode. - errors: str or None, optional - How to handle encoding errors if opened in text mode. - newline: None or str - Passed to TextIOWrapper in text mode, how to handle line endings. - """ - - def __init__( - self, - fs, - path, - mode="rb", - compression=None, - encoding=None, - errors=None, - newline=None, - ): - self.fs = fs - self.path = path - self.mode = mode - self.compression = get_compression(path, compression) - self.encoding = encoding - self.errors = errors - self.newline = newline - self.fobjects = [] - - def __reduce__(self): - return ( - OpenFile, - ( - self.fs, - self.path, - self.mode, - self.compression, - self.encoding, - self.errors, - ), - ) - - def __repr__(self): - return "".format(self.path) - - def __fspath__(self): - return self.path - - def __enter__(self): - mode = self.mode.replace("t", "").replace("b", "") + "b" - - f = self.fs.open(self.path, mode=mode) - - self.fobjects = [f] - - if self.compression is not None: - compress = compr[self.compression] - f = compress(f, mode=mode[0]) - self.fobjects.append(f) - - if "b" not in self.mode: - # assume, for example, that 'r' is equivalent to 'rt' as in builtin - f = io.TextIOWrapper( - f, encoding=self.encoding, errors=self.errors, newline=self.newline - ) - self.fobjects.append(f) - - return self.fobjects[-1] - - def __exit__(self, *args): - self.close() - - def __del__(self): - self.close() - - def open(self): - """Materialise this as a real open file without context - - The file should be explicitly closed to avoid enclosed open file - instances persisting - """ - return self.__enter__() - - def close(self): - """Close all encapsulated file objects""" - for f in reversed(self.fobjects): - if "r" not in self.mode and not f.closed: - f.flush() - f.close() - self.fobjects = [] - - -def open_files( - urlpath, - mode="rb", - compression=None, - encoding="utf8", - errors=None, - name_function=None, - num=1, - protocol=None, - newline=None, - **kwargs -): - """ Given a path or paths, return a list of ``OpenFile`` objects. - - For writing, a str path must contain the "*" character, which will be filled - in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2. - - For either reading or writing, can instead provide explicit list of paths. - - Parameters - ---------- - urlpath: string or list - Absolute or relative filepath(s). Prefix with a protocol like ``s3://`` - to read from alternative filesystems. To read from multiple files you - can pass a globstring or a list of paths, with the caveat that they - must all have the same protocol. - mode: 'rb', 'wt', etc. - compression: string - Compression to use. See ``dask.bytes.compression.files`` for options. - encoding: str - For text mode only - errors: None or str - Passed to TextIOWrapper in text mode - name_function: function or None - if opening a set of files for writing, those files do not yet exist, - so we need to generate their names by formatting the urlpath for - each sequence number - num: int [1] - if writing mode, number of files we expect to create (passed to - name+function) - protocol: str or None - If given, overrides the protocol found in the URL. - newline: bytes or None - Used for line terminator in text mode. If None, uses system default; - if blank, uses no translation. 
- **kwargs: dict - Extra options that make sense to a particular storage connection, e.g. - host, port, username, password, etc. - - Examples - -------- - >>> files = open_files('2015-*-*.csv') # doctest: +SKIP - >>> files = open_files( - ... 's3://bucket/2015-*-*.csv.gz', compression='gzip' - ... ) # doctest: +SKIP - - Returns - ------- - List of ``OpenFile`` objects. - """ - fs, fs_token, paths = get_fs_token_paths( - urlpath, - mode, - num=num, - name_function=name_function, - storage_options=kwargs, - protocol=protocol, - ) - return [ - OpenFile( - fs, - path, - mode=mode, - compression=compression, - encoding=encoding, - errors=errors, - newline=newline, - ) - for path in paths - ] - - -def open( - urlpath, - mode="rb", - compression=None, - encoding="utf8", - errors=None, - protocol=None, - newline=None, - **kwargs -): - """ Given a path or paths, return one ``OpenFile`` object. - - Parameters - ---------- - urlpath: string or list - Absolute or relative filepath. Prefix with a protocol like ``s3://`` - to read from alternative filesystems. Should not include glob - character(s). - mode: 'rb', 'wt', etc. - compression: string - Compression to use. See ``dask.bytes.compression.files`` for options. - encoding: str - For text mode only - errors: None or str - Passed to TextIOWrapper in text mode - protocol: str or None - If given, overrides the protocol found in the URL. - newline: bytes or None - Used for line terminator in text mode. If None, uses system default; - if blank, uses no translation. - **kwargs: dict - Extra options that make sense to a particular storage connection, e.g. - host, port, username, password, etc. - - Examples - -------- - >>> openfile = open('2015-01-01.csv') # doctest: +SKIP - >>> openfile = open( - ... 's3://bucket/2015-01-01.csv.gz', - ... compression='gzip' - ... ) # doctest: +SKIP - >>> with openfile as f: - ... df = pd.read_csv(f) # doctest: +SKIP - - Returns - ------- - ``OpenFile`` object. - """ - return open_files( - [urlpath], - mode, - compression, - encoding, - errors, - protocol, - newline=newline, - **kwargs - )[0] - - -def get_compression(urlpath, compression): - if compression == "infer": - compression = infer_compression(urlpath) - if compression is not None and compression not in compr: - raise ValueError("Compression type %s not supported" % compression) - return compression - - -def split_protocol(urlpath): - """Return protocol, path pair""" - urlpath = stringify_path(urlpath) - if "://" in urlpath: - protocol, path = urlpath.split("://", 1) - if len(protocol) > 1: - # excludes Windows paths - return protocol, path - return None, urlpath - - -def strip_protocol(urlpath): - """Return only path part of full URL, according to appropriate backend""" - protocol, _ = split_protocol(urlpath) - cls = get_filesystem_class(protocol) - return cls._strip_protocol(urlpath) - - -def expand_paths_if_needed(paths, mode, num, fs, name_function): - """Expand paths if they have a ``*`` in them. - - :param paths: list of paths - mode: str - Mode in which to open files. - num: int - If opening in writing mode, number of files we expect to create. - fs: filesystem object - name_function: callable - If opening in writing mode, this callable is used to generate path - names. Names are generated for each partition by - ``urlpath.replace('*', name_function(partition_index))``. 
- :return: list of paths - """ - expanded_paths = [] - paths = list(paths) - if "w" in mode and sum([1 for p in paths if "*" in p]) > 1: - raise ValueError("When writing data, only one filename mask can be specified.") - elif "w" in mode: - num = max(num, len(paths)) - for curr_path in paths: - if "*" in curr_path: - if "w" in mode: - # expand using name_function - expanded_paths.extend(_expand_paths(curr_path, name_function, num)) - else: - # expand using glob - expanded_paths.extend(fs.glob(curr_path)) - else: - expanded_paths.append(curr_path) - # if we generated more paths that asked for, trim the list - if "w" in mode and len(expanded_paths) > num: - expanded_paths = expanded_paths[:num] - return expanded_paths - - -def get_fs_token_paths( - urlpath, mode="rb", num=1, name_function=None, storage_options=None, protocol=None -): - """Filesystem, deterministic token, and paths from a urlpath and options. - - Parameters - ---------- - urlpath: string or iterable - Absolute or relative filepath, URL (may include protocols like - ``s3://``), or globstring pointing to data. - mode: str, optional - Mode in which to open files. - num: int, optional - If opening in writing mode, number of files we expect to create. - name_function: callable, optional - If opening in writing mode, this callable is used to generate path - names. Names are generated for each partition by - ``urlpath.replace('*', name_function(partition_index))``. - storage_options: dict, optional - Additional keywords to pass to the filesystem class. - protocol: str or None - To override the protocol specifier in the URL - """ - if isinstance(urlpath, (list, tuple)): - if not urlpath: - raise ValueError("empty urlpath sequence") - protocols, paths = zip(*map(split_protocol, urlpath)) - protocol = protocol or protocols[0] - if not all(p == protocol for p in protocols): - raise ValueError( - "When specifying a list of paths, all paths must " - "share the same protocol" - ) - cls = get_filesystem_class(protocol) - optionss = list(map(cls._get_kwargs_from_urls, urlpath)) - paths = [cls._strip_protocol(u) for u in urlpath] - options = optionss[0] - if not all(o == options for o in optionss): - raise ValueError( - "When specifying a list of paths, all paths must " - "share the same file-system options" - ) - update_storage_options(options, storage_options) - fs = cls(**options) - paths = expand_paths_if_needed(paths, mode, num, fs, name_function) - - elif isinstance(urlpath, str) or hasattr(urlpath, "name"): - protocols, path = split_protocol(urlpath) - protocol = protocol or protocols - cls = get_filesystem_class(protocol) - - options = cls._get_kwargs_from_urls(urlpath) - path = cls._strip_protocol(urlpath) - update_storage_options(options, storage_options) - fs = cls(**options) - - if "w" in mode: - paths = _expand_paths(path, name_function, num) - elif "*" in path: - paths = sorted(fs.glob(path)) - else: - paths = [path] - - else: - raise TypeError("url type not understood: %s" % urlpath) - - return fs, fs._fs_token, paths - - -def _expand_paths(path, name_function, num): - if isinstance(path, str): - if path.count("*") > 1: - raise ValueError("Output path spec must contain exactly one '*'.") - elif "*" not in path: - path = os.path.join(path, "*.part") - - if name_function is None: - name_function = build_name_function(num - 1) - - paths = [path.replace("*", name_function(i)) for i in range(num)] - if paths != sorted(paths): - logger.warning( - "In order to preserve order between partitions" - " paths created with ``name_function`` 
should " - "sort to partition order" - ) - elif isinstance(path, (tuple, list)): - assert len(path) == num - paths = list(path) - else: - raise ValueError( - "Path should be either\n" - "1. A list of paths: ['foo.json', 'bar.json', ...]\n" - "2. A directory: 'foo/\n" - "3. A path with a '*' in it: 'foo.*.json'" - ) - return paths diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/fuse.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/fuse.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/fuse.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/fuse.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,157 +0,0 @@ -from __future__ import print_function -import os -import stat -from errno import ENOENT, EIO -from fuse import Operations, FuseOSError -import threading -import time -from fuse import FUSE - - -class FUSEr(Operations): - def __init__(self, fs, path): - self.fs = fs - self.cache = {} - self.root = path.rstrip("/") + "/" - self.counter = 0 - - def getattr(self, path, fh=None): - path = "".join([self.root, path.lstrip("/")]).rstrip("/") - try: - info = self.fs.info(path) - except FileNotFoundError: - raise FuseOSError(ENOENT) - data = {"st_uid": 1000, "st_gid": 1000} - perm = 0o777 - - if info["type"] != "file": - data["st_mode"] = stat.S_IFDIR | perm - data["st_size"] = 0 - data["st_blksize"] = 0 - else: - data["st_mode"] = stat.S_IFREG | perm - data["st_size"] = info["size"] - data["st_blksize"] = 5 * 2 ** 20 - data["st_nlink"] = 1 - data["st_atime"] = time.time() - data["st_ctime"] = time.time() - data["st_mtime"] = time.time() - return data - - def readdir(self, path, fh): - path = "".join([self.root, path.lstrip("/")]) - files = self.fs.ls(path, False) - files = [os.path.basename(f.rstrip("/")) for f in files] - return [".", ".."] + files - - def mkdir(self, path, mode): - path = "".join([self.root, path.lstrip("/")]) - self.fs.mkdir(path) - return 0 - - def rmdir(self, path): - path = "".join([self.root, path.lstrip("/")]) - self.fs.rmdir(path) - return 0 - - def read(self, path, size, offset, fh): - f = self.cache[fh] - f.seek(offset) - out = f.read(size) - return out - - def write(self, path, data, offset, fh): - f = self.cache[fh] - f.write(data) - return len(data) - - def create(self, path, flags, fi=None): - fn = "".join([self.root, path.lstrip("/")]) - f = self.fs.open(fn, "wb") - self.cache[self.counter] = f - self.counter += 1 - return self.counter - 1 - - def open(self, path, flags): - fn = "".join([self.root, path.lstrip("/")]) - if flags % 2 == 0: - # read - mode = "rb" - else: - # write/create - mode = "wb" - self.cache[self.counter] = self.fs.open(fn, mode) - self.counter += 1 - return self.counter - 1 - - def truncate(self, path, length, fh=None): - fn = "".join([self.root, path.lstrip("/")]) - if length != 0: - raise NotImplementedError - # maybe should be no-op since open with write sets size to zero anyway - self.fs.touch(fn) - - def unlink(self, path): - fn = "".join([self.root, path.lstrip("/")]) - try: - self.fs.rm(fn, False) - except (IOError, FileNotFoundError): - raise FuseOSError(EIO) - - def release(self, path, fh): - try: - if fh in self.cache: - f = self.cache[fh] - f.close() - self.cache.pop(fh) - except Exception as e: - print(e) - return 0 - - def chmod(self, path, mode): - raise NotImplementedError - - -def run(fs, path, mount_point, foreground=True, threads=False): - """ 
Mount stuff in a local directory - - This uses fusepy to make it appear as if a given path on an fsspec - instance is in fact resident within the local file-system. - - This requires that fusepy by installed, and that FUSE be available on - the system (typically requiring a package to be installed with - apt, yum, brew, etc.). - - Parameters - ---------- - fs: file-system instance - From one of the compatible implementations - path: str - Location on that file-system to regard as the root directory to - mount. Note that you typically should include the terminating "/" - character. - mount_point: str - An empty directory on the local file-system where the contents of - the remote path will appear - foreground: bool - Whether or not calling this function will block. Operation will - typically be more stable if True. - threads: bool - Whether or not to create threads when responding to file operations - within the mounter directory. Operation will typically be more - stable if False. - - """ - func = lambda: FUSE( - FUSEr(fs, path), mount_point, nothreads=not threads, foreground=True - ) - if foreground is False: - th = threading.Thread(target=func) - th.daemon = True - th.start() - return th - else: # pragma: no cover - try: - func() - except KeyboardInterrupt: - pass diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/cached.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/cached.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/cached.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/cached.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,358 +0,0 @@ -import time -import pickle -import logging -import os -import hashlib -import tempfile -import inspect -from fsspec import AbstractFileSystem, filesystem -from fsspec.spec import AbstractBufferedFile -from fsspec.core import MMapCache, BaseCache - -logger = logging.getLogger("fsspec") - - -class CachingFileSystem(AbstractFileSystem): - """Locally caching filesystem, layer over any other FS - - This class implements chunk-wise local storage of remote files, for quick - access after the initial download. The files are stored in a given - directory with random hashes for the filenames. If no directory is given, - a temporary one is used, which should be cleaned up by the OS after the - process ends. The files themselves as sparse (as implemented in - MMapCache), so only the data which is accessed takes up space. - - Restrictions: - - - the block-size must be the same for each access of a given file, unless - all blocks of the file have already been read - - caching can only be applied to file-systems which produce files - derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also - allowed, for testing - """ - - protocol = ("blockcache", "cached") - - def __init__( - self, - target_protocol=None, - cache_storage="TMP", - cache_check=10, - check_files=False, - expiry_time=604800, - target_options=None, - **kwargs - ): - """ - - Parameters - ---------- - target_protocol: str - Target fielsystem protocol - cache_storage: str or list(str) - Location to store files. If "TMP", this is a temporary directory, - and will be cleaned up by the OS when this process ends (or later). - If a list, each location will be tried in the order given, but - only the last will be considered writable. 
- cache_check: int - Number of seconds between reload of cache metadata - check_files: bool - Whether to explicitly see if the UID of the remote file matches - the stored one before using. Warning: some file systems such as - HTTP cannot reliably give a unique hash of the contents of some - path, so be sure to set this option to False. - expiry_time: int - The time in seconds after which a local copy is considered useless. - Set to falsy to prevent expiry. The default is equivalent to one - week. - target_options: dict or None - Passed to the instantiation of the FS, if fs is None. - """ - if self._cached: - return - super().__init__(**kwargs) - if cache_storage == "TMP": - storage = [tempfile.mkdtemp()] - else: - if isinstance(cache_storage, str): - storage = [cache_storage] - else: - storage = cache_storage - os.makedirs(storage[-1], exist_ok=True) - self.storage = storage - self.kwargs = target_options or {} - self.cache_check = cache_check - self.check_files = check_files - self.expiry = expiry_time - self.load_cache() - if isinstance(target_protocol, AbstractFileSystem): - self.fs = target_protocol - self.protocol = self.fs.protocol - else: - self.protocol = target_protocol - self.fs = filesystem(target_protocol, **self.kwargs) - - def __reduce_ex__(self, *_): - return ( - self.__class__, - ( - self.protocol, - self.storage, - self.cache_check, - self.check_files, - self.expiry, - self.kwargs or None, - ), - ) - - def load_cache(self): - """Read set of stored blocks from file""" - cached_files = [] - for storage in self.storage: - fn = os.path.join(storage, "cache") - if os.path.exists(fn): - with open(fn, "rb") as f: - # TODO: consolidate blocks here - cached_files.append(pickle.load(f)) - else: - os.makedirs(storage, exist_ok=True) - cached_files.append({}) - self.cached_files = cached_files or [{}] - self.last_cache = time.time() - - def save_cache(self): - """Save set of stored blocks from file""" - fn = os.path.join(self.storage[-1], "cache") - # TODO: a file lock could be used to ensure file does not change - # between re-read and write; but occasional duplicated reads ok. 
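# --------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the upstream diff. The
# class above is normally reached through fsspec.filesystem() under the
# "blockcache" (or "filecache") protocol; the HTTP URL and cache directory are
# placeholders, and the "http" target assumes fsspec's optional HTTP support
# is installed.
import fsspec

fs = fsspec.filesystem(
    "blockcache",
    target_protocol="http",                  # any FS whose files derive from AbstractBufferedFile
    target_options={},                       # passed through to the wrapped filesystem
    cache_storage="/tmp/fsspec-demo-cache",  # "TMP" (the default) would use a fresh temp dir
)
with fs.open("http://example.com/some/file.bin", "rb") as f:
    header = f.read(1024)                    # only the blocks actually read are cached locally
# --------------------------------------------------------------------------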
- cache = self.cached_files[-1] - if os.path.exists(fn): - with open(fn, "rb") as f: - cached_files = pickle.load(f) - for k, c in cached_files.items(): - if c["blocks"] is not True: - if cache[k]["blocks"] is True: - c["blocks"] = True - else: - c["blocks"] = set(c["blocks"]).union(cache[k]["blocks"]) - else: - cached_files = cache - cache = {k: v.copy() for k, v in cached_files.items()} - for c in cache.values(): - if isinstance(c["blocks"], set): - c["blocks"] = list(c["blocks"]) - with open(fn + ".temp", "wb") as f: - pickle.dump(cache, f) - if os.path.exists(fn): - os.remove(fn) - os.rename(fn + ".temp", fn) - - def _check_cache(self): - """Reload caches if time elapsed or any disappeared""" - if not self.cache_check: - # explicitly told not to bother checking - return - timecond = time.time() - self.last_cache > self.cache_check - existcond = all(os.path.exists(storage) for storage in self.storage) - if timecond or not existcond: - self.load_cache() - - def _check_file(self, path): - """Is path in cache and still valid""" - self._check_cache() - for storage, cache in zip(self.storage, self.cached_files): - if path not in cache: - continue - detail = cache[path].copy() - if self.check_files: - if detail["uid"] != self.fs.ukey(path): - continue - if self.expiry: - if detail["time"] - time.time() > self.expiry: - continue - fn = os.path.join(storage, detail["fn"]) - if os.path.exists(fn): - return detail, fn - return False, None - - def _open(self, path, mode="rb", **kwargs): - """Wrap the target _open - - If the whole file exists in the cache, just open it locally and - return that. - - Otherwise, open the file on the target FS, and make it have a mmap - cache pointing to the location which we determine, in our cache. - The ``blocks`` instance is shared, so as the mmap cache instance - updates, so does the entry in our ``cached_files`` attribute. - We monkey-patch this file, so that when it closes, we call - ``close_and_update`` to save the state of the blocks. 
- """ - path = self._strip_protocol(path) - if not path.startswith(self.protocol): - path = self.protocol + "://" + path - if mode != "rb": - return self.fs._open(path, mode=mode, **kwargs) - detail, fn = self._check_file(path) - if detail: - # file is in cache - hash, blocks = detail["fn"], detail["blocks"] - if blocks is True: - # stored file is complete - logger.debug("Opening local copy of %s" % path) - return open(fn, "rb") - # TODO: action where partial file exists in read-only cache - logger.debug("Opening partially cached copy of %s" % path) - else: - hash = hashlib.sha256(path.encode()).hexdigest() - fn = os.path.join(self.storage[-1], hash) - blocks = set() - detail = { - "fn": hash, - "blocks": blocks, - "time": time.time(), - "uid": self.fs.ukey(path), - } - self.cached_files[-1][path] = detail - logger.debug("Creating local sparse file for %s" % path) - kwargs["cache_type"] = "none" - kwargs["mode"] = mode - - # call target filesystems open - f = self.fs._open(path, **kwargs) - if "blocksize" in detail: - if detail["blocksize"] != f.blocksize: - raise ValueError( - "Cached file must be reopened with same block" - "size as original (old: %i, new %i)" - "" % (detail["blocksize"], f.blocksize) - ) - else: - detail["blocksize"] = f.blocksize - f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks) - close = f.close - f.close = lambda: self.close_and_update(f, close) - return f - - def close_and_update(self, f, close): - """Called when a file is closing, so store the set of blocks""" - if f.path.startswith(self.protocol): - path = f.path - else: - path = self.protocol + "://" + f.path - c = self.cached_files[-1][path] - if c["blocks"] is not True and len(["blocks"]) * f.blocksize >= f.size: - c["blocks"] = True - self.save_cache() - close() - - def __getattribute__(self, item): - if item in [ - "load_cache", - "_open", - "save_cache", - "close_and_update", - "__init__", - "__getattribute__", - "__reduce_ex__", - "open", - "cat", - "get", - "read_block", - "tail", - "head", - "_check_file", - "_check_cache", - ]: - # all the methods defined in this class. Note `open` here, since - # it calls `_open`, but is actually in superclass - return lambda *args, **kw: getattr(type(self), item)(self, *args, **kw) - if item == "__class__": - return type(self) - d = object.__getattribute__(self, "__dict__") - fs = d.get("fs", None) # fs is not immediately defined - if item in d: - return d[item] - elif fs is not None: - if item in fs.__dict__: - # attribute of instance - return fs.__dict__[item] - # attributed belonging to the target filesystem - cls = type(fs) - m = getattr(cls, item) - if inspect.isfunction(m) and ( - not hasattr(m, "__self__") or m.__self__ is None - ): - # instance method - return m.__get__(fs, cls) - return m # class method or attribute - else: - # attributes of the superclass, while target is being set up - return super().__getattribute__(item) - - -class WholeFileCacheFileSystem(CachingFileSystem): - """Caches whole remote files on first access - - This class is intended as a layer over any other file system, and - will make a local copy of each file accessed, so that all subsequent - reads are local. This is similar to ``CachingFileSystem``, but without - the block-wise functionality and so can work even when sparse files - are not allowed. See its docstring for definition of the init - arguments. - - The class still needs access to the remote store for listing files, - and may refresh cached files. 
- """ - - protocol = "filecache" - - def _open(self, path, mode="rb", **kwargs): - path = self._strip_protocol(path) - if not path.startswith(self.protocol): - path = self.protocol + "://" + path - if mode != "rb": - return self.fs._open(path, mode=mode, **kwargs) - detail, fn = self._check_file(path) - if detail: - hash, blocks = detail["fn"], detail["blocks"] - if blocks is True: - logger.debug("Opening local copy of %s" % path) - return open(fn, "rb") - else: - raise ValueError( - "Attempt to open partially cached file %s" - "as a wholly cached file" % path - ) - else: - hash = hashlib.sha256(path.encode()).hexdigest() - fn = os.path.join(self.storage[-1], hash) - blocks = True - detail = { - "fn": hash, - "blocks": blocks, - "time": time.time(), - "uid": self.fs.ukey(path), - } - self.cached_files[-1][path] = detail - logger.debug("Copying %s to local cache" % path) - kwargs["mode"] = mode - - # call target filesystems open - # TODO: why not just use fs.get ?? - f = self.fs._open(path, **kwargs) - with open(fn, "wb") as f2: - if isinstance(f, AbstractBufferedFile): - # want no type of caching if just downloading whole thing - f.cache = BaseCache(0, f.cache.fetcher, f.size) - if getattr(f, "blocksize", 0) and f.size: - # opportunity to parallelise here - data = True - while data: - data = f.read(f.blocksize) - f2.write(data) - else: - # this only applies to HTTP, should instead use streaming - f2.write(f.read()) - self.save_cache() - return self._open(path, mode) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/dask.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/dask.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/dask.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/dask.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,120 +0,0 @@ -from distributed.worker import get_worker -from distributed.client import _get_global_client -import dask -from fsspec.spec import AbstractFileSystem, AbstractBufferedFile -from fsspec import filesystem - - -def make_instance(cls, args, kwargs): - inst = cls(*args, **kwargs) - inst._determine_worker() - return inst - - -class DaskWorkerFileSystem(AbstractFileSystem): - """View files accessible to a worker as any other remote file-system - - When instances are run on the worker, uses the real filesystem. When - run on the client, they call the worker to provide information or data. - - **Warning** this implementation is experimental, and read-only for now. 
- """ - - def __init__(self, remote_protocol, remote_options=None, **kwargs): - super().__init__(**kwargs) - self.protocol = remote_protocol - self.remote_options = remote_options - self.worker = None - self.client = None - self.fs = None - self._determine_worker() - - def _determine_worker(self): - try: - get_worker() - self.worker = True - self.fs = filesystem(self.protocol, **(self.remote_options or {})) - except ValueError: - self.worker = False - self.client = _get_global_client() - self.rfs = dask.delayed(self) - - def __reduce__(self): - return make_instance, (type(self), self.storage_args, self.storage_options) - - def mkdir(self, *args, **kwargs): - if self.worker: - self.fs.mkdir(*args, **kwargs) - else: - self.rfs.mkdir(*args, **kwargs).compute() - - def rm(self, *args, **kwargs): - if self.worker: - self.fs.rm(*args, **kwargs) - else: - self.rfs.rm(*args, **kwargs).compute() - - def copy(self, *args, **kwargs): - if self.worker: - self.fs.copy(*args, **kwargs) - else: - self.rfs.copy(*args, **kwargs).compute() - - def mv(self, *args, **kwargs): - if self.worker: - self.fs.mv(*args, **kwargs) - else: - self.rfs.mv(*args, **kwargs).compute() - - def ls(self, *args, **kwargs): - if self.worker: - return self.fs.ls(*args, **kwargs) - else: - return self.rfs.ls(*args, **kwargs).compute() - - def _open(self, path, mode="rb", **kwargs): - if self.worker: - return self.fs._open(path, mode=mode) - else: - return DaskFile(self, path, mode, **kwargs) - - def fetch_range(self, path, mode, start, end): - if self.worker: - with self._open(path, mode) as f: - f.seek(start) - return f.read(end - start) - else: - return self.rfs.fetch_range(path, mode, start, end).compute() - - -class DaskFile(AbstractBufferedFile): - def __init__( - self, - fs, - path, - mode="rb", - block_size="default", - autocommit=True, - cache_type="bytes", - **kwargs - ): - super().__init__( - fs, - path, - mode=mode, - block_size=block_size, - autocommit=autocommit, - cache_type=cache_type, - **kwargs - ) - - def _upload_chunk(self, final=False): - pass - - def _initiate_upload(self): - """ Create remote file/upload """ - pass - - def _fetch_range(self, start, end): - """Get the specified set of bytes from remote""" - return self.fs.fetch_range(self.path, self.mode, start, end) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/ftp.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/ftp.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/ftp.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/ftp.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,265 +0,0 @@ -from ftplib import FTP, Error, error_perm -from socket import timeout -import uuid -from ..spec import AbstractBufferedFile, AbstractFileSystem -from ..utils import infer_storage_options - - -class FTPFileSystem(AbstractFileSystem): - """A filesystem over classic """ - - root_marker = "/" - cachable = False - - def __init__( - self, - host, - port=21, - username=None, - password=None, - acct=None, - block_size=None, - tempdir="/tmp", - timeout=30, - **kwargs - ): - """ - You can use _get_kwargs_from_urls to get some kwargs from - a reasonable FTP url. - - Authentication will be anonymous if username/password are not - given. 
- - Parameters - ---------- - host: str - The remote server name/ip to connect to - port: int - Port to connect with - username: str or None - If authenticating, the user's identifier - password: str of None - User's password on the server, if using - acct: str or None - Some servers also need an "account" string for auth - block_size: int or None - If given, the read-ahead or write buffer size. - tempdir: str - Directory on remote to put temporary files when in a transaction - """ - super(FTPFileSystem, self).__init__(**kwargs) - self.host = host - self.port = port - self.tempdir = tempdir - self.cred = username, password, acct - self.timeout = timeout - if block_size is not None: - self.blocksize = block_size - else: - self.blocksize = 2 ** 16 - self._connect() - - def _connect(self): - self.ftp = FTP(timeout=self.timeout) - self.ftp.connect(self.host, self.port) - self.ftp.login(*self.cred) - - @classmethod - def _strip_protocol(cls, path): - return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/") - - @staticmethod - def _get_kwargs_from_urls(urlpath): - out = infer_storage_options(urlpath) - out.pop("path", None) - out.pop("protocol", None) - return out - - def invalidate_cache(self, path=None): - if path is not None: - self.dircache.pop(path, None) - else: - self.dircache.clear() - - def ls(self, path, detail=True): - path = self._strip_protocol(path) - out = [] - if path not in self.dircache: - try: - try: - out = [ - (fn, details) - for (fn, details) in self.ftp.mlsd(path) - if fn not in [".", ".."] - and details["type"] not in ["pdir", "cdir"] - ] - except error_perm: - out = _mlsd2(self.ftp, path) # Not platform independent - for fn, details in out: - if path == "/": - path = "" # just for forming the names, below - details["name"] = "/".join([path, fn.lstrip("/")]) - if details["type"] == "file": - details["size"] = int(details["size"]) - else: - details["size"] = 0 - self.dircache[path] = out - except Error: - try: - info = self.info(path) - if info["type"] == "file": - out = [(path, info)] - except (Error, IndexError): - raise FileNotFoundError - files = self.dircache.get(path, out) - if not detail: - return sorted([fn for fn, details in files]) - return [details for fn, details in files] - - def info(self, path, **kwargs): - # implement with direct method - path = self._strip_protocol(path) - files = self.ls(self._parent(path).lstrip("/"), True) - try: - out = [f for f in files if f["name"] == path][0] - except IndexError: - raise FileNotFoundError(path) - return out - - def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs): - path = self._strip_protocol(path) - block_size = block_size or self.blocksize - return FTPFile( - self, - path, - mode=mode, - block_size=block_size, - tempdir=self.tempdir, - autocommit=autocommit, - ) - - def _rm(self, path): - path = self._strip_protocol(path) - self.ftp.delete(path) - self.invalidate_cache(path.rsplit("/", 1)[0]) - - def mkdir(self, path, **kwargs): - path = self._strip_protocol(path) - self.ftp.mkd(path) - - def rmdir(self, path): - path = self._strip_protocol(path) - self.ftp.rmd(path) - - def mv(self, path1, path2, **kwargs): - path1 = self._strip_protocol(path1) - path2 = self._strip_protocol(path2) - self.ftp.rename(path1, path2) - self.invalidate_cache(self._parent(path1)) - self.invalidate_cache(self._parent(path2)) - - def __del__(self): - self.ftp.close() - - -class TransferDone(Exception): - """Internal exception to break out of transfer""" - - pass - - -class 
FTPFile(AbstractBufferedFile): - """Interact with a remote FTP file with read/write buffering""" - - def __init__(self, fs, path, **kwargs): - super().__init__(fs, path, **kwargs) - if kwargs.get("autocommit", False) is False: - self.target = self.path - self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())]) - - def commit(self): - self.fs.mv(self.path, self.target) - - def discard(self): - self.fs.rm(self.path) - - def _fetch_range(self, start, end): - """Get bytes between given byte limits - - Implemented by raising an exception in the fetch callback when the - number of bytes received reaches the requested amount. - - Will fail if the server does not respect the REST command on - retrieve requests. - """ - out = [] - total = [0] - - def callback(x): - total[0] += len(x) - if total[0] > end - start: - out.append(x[: (end - start) - total[0]]) - raise TransferDone - else: - out.append(x) - - if total[0] == end - start: - raise TransferDone - - try: - self.fs.ftp.retrbinary( - "RETR %s" % self.path, - blocksize=self.blocksize, - rest=start, - callback=callback, - ) - except TransferDone: - try: - self.fs.ftp.abort() - self.fs.ftp.voidresp() - except timeout: - self.fs._connect() - return b"".join(out) - - def _upload_chunk(self, final=False): - self.buffer.seek(0) - self.fs.ftp.storbinary( - "STOR " + self.path, self.buffer, blocksize=self.blocksize, rest=self.offset - ) - return True - - -def _mlsd2(ftp, path="."): - """ - Fall back to using `dir` instead of `mlsd` if not supported. - - This parses a Linux style `ls -l` response to `dir`, but the response may - be platform dependent. - - Parameters - ---------- - ftp: ftplib.FTP - path: str - Expects to be given path, but defaults to ".". - """ - lines = [] - minfo = [] - ftp.dir(path, lines.append) - for line in lines: - line = line.split() - this = ( - line[-1], - { - "modify": " ".join(line[5:8]), - "unix.owner": line[2], - "unix.group": line[3], - "unix.mode": line[0], - "size": line[4], - }, - ) - if "d" == this[1]["unix.mode"][0]: - this[1]["type"] = "dir" - else: - this[1]["type"] = "file" - minfo.append(this) - return minfo diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/github.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/github.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/github.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/github.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -import io -import requests -from ..spec import AbstractFileSystem - - -class GithubFileSystem(AbstractFileSystem): - """[Experimental] interface to files in github - - An instance of this class provides the files residing within a remote github - repository. You may specify a point in the repos history, by SHA, branch - or tag (default is current master). - - Given that code files tend to be small, and that github does not support - retrieving partial content, we always fetch whole files. 
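# --------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the upstream diff. It
# shows GithubFileSystem (whose docstring ends above) used directly; the
# org/repo pair is the project's own repository (see the Homepage field in
# debian/control), and both the constructor and ls() need network access to
# the GitHub API.
from fsspec.implementations.github import GithubFileSystem

fs = GithubFileSystem(org="intake", repo="filesystem_spec")  # sha defaults to "master"
print(fs.ls(""))                      # names at the top level of the repository tree
with fs.open("README.md", "rb") as f:
    readme = f.read().decode()        # whole-file fetch, as the docstring above explains
# --------------------------------------------------------------------------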
- """ - - url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" - rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" - protocol = "github" - - def __init__(self, org, repo, sha="master", **kwargs): - super().__init__(**kwargs) - self.org = org - self.repo = repo - self.root = sha - self.ls("") - - def ls(self, path, detail=False, sha=None, **kwargs): - if path == "": - sha = self.root - if sha is None: - parts = path.rstrip("/").split("/") - so_far = "" - sha = self.root - for part in parts: - out = self.ls(so_far, True, sha=sha) - so_far += "/" + part if so_far else part - out = [o for o in out if o["name"] == so_far][0] - if out["type"] == "file": - if detail: - return [out] - else: - return path - sha = out["sha"] - if path not in self.dircache: - r = requests.get(self.url.format(org=self.org, repo=self.repo, sha=sha)) - self.dircache[path] = [ - { - "name": path + "/" + f["path"] if path else f["path"], - "mode": f["mode"], - "type": {"blob": "file", "tree": "directory"}[f["type"]], - "size": f.get("size", 0), - "sha": f["sha"], - } - for f in r.json()["tree"] - ] - if detail: - return self.dircache[path] - else: - return sorted([f["name"] for f in self.dircache[path]]) - - def _open(self, path, mode="rb", **kwargs): - if mode != "rb": - raise NotImplementedError - url = self.rurl.format(org=self.org, repo=self.repo, path=path, sha=self.root) - r = requests.get(url) - return io.BytesIO(r.content) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/hdfs.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/hdfs.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/hdfs.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/hdfs.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,192 +0,0 @@ -from ..spec import AbstractFileSystem -from ..utils import infer_storage_options -from pyarrow.hdfs import HadoopFileSystem - - -class PyArrowHDFS(AbstractFileSystem): - """Adapted version of Arrow's HadoopFileSystem - - This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which - passes on all calls to the underlying class. 
- """ - - def __init__( - self, - host="default", - port=0, - user=None, - kerb_ticket=None, - driver="libhdfs", - extra_conf=None, - **kwargs - ): - """ - - Parameters - ---------- - host: str - Hostname, IP or "default" to try to read from Hadoop config - port: int - Port to connect on, or default from Hadoop config if 0 - user: str or None - If given, connect as this username - kerb_ticket: str or None - If given, use this ticket for authentication - driver: 'libhdfs' or 'libhdfs3' - Binary driver; libhdfs if the JNI library and default - extra_conf: None or dict - Passed on to HadoopFileSystem - """ - if self._cached: - return - AbstractFileSystem.__init__(self, **kwargs) - self.pars = (host, port, user, kerb_ticket, driver, extra_conf) - self.pahdfs = HadoopFileSystem( - host=host, - port=port, - user=user, - kerb_ticket=kerb_ticket, - driver=driver, - extra_conf=extra_conf, - ) - - def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs): - """ - - Parameters - ---------- - path: str - Location of file; should start with '/' - mode: str - block_size: int - Hadoop block size, e.g., 2**26 - autocommit: True - Transactions are not yet implemented for HDFS; errors if not True - kwargs: dict or None - Hadoop config parameters - - Returns - ------- - HDFSFile file-like instance - """ - if not autocommit: - raise NotImplementedError - return HDFSFile(self, path, mode, block_size, **kwargs) - - def __reduce_ex__(self, protocol): - return PyArrowHDFS, self.pars - - def ls(self, path, detail=True): - out = self.pahdfs.ls(path, detail) - if detail: - for p in out: - p["type"] = p["kind"] - p["name"] = self._strip_protocol(p["name"]) - else: - out = [self._strip_protocol(p) for p in out] - return out - - @staticmethod - def _get_kwargs_from_urls(paths): - ops = infer_storage_options(paths) - out = {} - if ops.get("host", None): - out["host"] = ops["host"] - if ops.get("username", None): - out["user"] = ops["username"] - if ops.get("port", None): - out["port"] = ops["port"] - return out - - @classmethod - def _strip_protocol(cls, path): - ops = infer_storage_options(path) - return ops["path"] - - def __getattribute__(self, item): - if item in [ - "_open", - "__init__", - "__getattribute__", - "__reduce_ex__", - "open", - "ls", - "makedirs", - ]: - # all the methods defined in this class. 
Note `open` here, since - # it calls `_open`, but is actually in superclass - return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw) - if item == "__class__": - return PyArrowHDFS - d = object.__getattribute__(self, "__dict__") - pahdfs = d.get("pahdfs", None) # fs is not immediately defined - if pahdfs is not None and item in [ - "chmod", - "chown", - "user", - "df", - "disk_usage", - "download", - "driver", - "exists", - "extra_conf", - "get_capacity", - "get_space_used", - "host", - "is_open", - "kerb_ticket", - "strip_protocol", - "mkdir", - "mv", - "port", - "get_capacity", - "get_space_used", - "df", - "chmod", - "chown", - "disk_usage", - "download", - "upload", - "_get_kwargs_from_urls", - "read_parquet", - "rm", - "stat", - "upload", - ]: - return getattr(pahdfs, item) - else: - # attributes of the superclass, while target is being set up - return super().__getattribute__(item) - - -class HDFSFile(object): - """Wrapper around arrow's HdfsFile - - Allows seek beyond EOF and (eventually) commit/discard - """ - - def __init__(self, fs, path, mode, block_size, **kwargs): - self.fs = fs - self.path = path - self.mode = mode - self.block_size = block_size - self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs) - if self.fh.readable(): - self.seek_size = self.size() - - def seek(self, loc, whence=0): - if whence == 0 and self.readable(): - loc = min(loc, self.seek_size) - return self.fh.seek(loc, whence) - - def __getattr__(self, item): - return getattr(self.fh, item) - - def __reduce_ex__(self, protocol): - return HDFSFile, (self.fs, self.path, self.mode, self.block_size) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/http.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/http.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/http.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/http.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,358 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import re -import requests -from urllib.parse import urlparse -from fsspec import AbstractFileSystem -from fsspec.spec import AbstractBufferedFile -from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE - -# https://stackoverflow.com/a/15926317/3821154 -ex = re.compile(r"""]*?\s+)?href=(["'])(.*?)\1""") -ex2 = re.compile(r"""(http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""") - - -class HTTPFileSystem(AbstractFileSystem): - """ - Simple File-System for fetching data via HTTP(S) - - ``ls()`` is implemented by loading the parent page and doing a regex - match on the result. If simple_link=True, anything of the form - "http(s)://server.com/stuff?thing=other"; otherwise only links within - HTML href tags will be used. - """ - - sep = "/" - - def __init__( - self, - simple_links=True, - block_size=None, - same_scheme=True, - size_policy=None, - **storage_options - ): - """ - Parameters - ---------- - block_size: int - Blocks to read bytes; if 0, will default to raw requests file-like - objects instead of HTTPFile instances - simple_links: bool - If True, will consider both HTML tags and anything that looks - like a URL; if False, will consider only the former. 
- same_scheme: True - When doing ls/glob, if this is True, only consider paths that have - http/https matching the input URLs. - size_policy: this argument is deprecated - storage_options: key-value - May be credentials, e.g., `{'auth': ('username', 'pword')}` or any - other parameters passed on to requests - """ - AbstractFileSystem.__init__(self) - self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE - self.simple_links = simple_links - self.same_schema = same_scheme - self.kwargs = storage_options - self.session = requests.Session() - - @classmethod - def _strip_protocol(cls, path): - """ For HTTP, we always want to keep the full URL - """ - return path - - # TODO: override get - - def ls(self, url, detail=True): - # ignoring URL-encoded arguments - r = self.session.get(url, **self.kwargs) - if self.simple_links: - links = ex2.findall(r.text) + ex.findall(r.text) - else: - links = ex.findall(r.text) - out = set() - parts = urlparse(url) - for l in links: - if isinstance(l, tuple): - l = l[1] - if l.startswith("http"): - if self.same_schema: - if l.split(":", 1)[0] == url.split(":", 1)[0]: - out.add(l) - elif l.replace("https", "http").startswith( - url.replace("https", "http") - ): - # allowed to cross http <-> https - out.add(l) - elif l.startswith("/") and len(l) > 1: - out.add(parts.scheme + "://" + parts.netloc + l) - else: - if l not in ["..", "../"]: - # Ignore FTP-like "parent" - out.add("/".join([url.rstrip("/"), l.lstrip("/")])) - if not out and url.endswith("/"): - return self.ls(url.rstrip("/"), detail=True) - if detail: - return [ - { - "name": u, - "size": None, - "type": "directory" if u.endswith("/") else "file", - } - for u in out - ] - else: - return list(sorted(out)) - - def cat(self, url): - r = requests.get(url, **self.kwargs) - r.raise_for_status() - return r.content - - def mkdirs(self, url): - """Make any intermediate directories to make path writable""" - raise NotImplementedError - - def exists(self, path): - kwargs = self.kwargs.copy() - kwargs["stream"] = True - try: - r = self.session.get(path, **kwargs) - r.close() - return r.ok - except requests.HTTPError: - return False - - def _open(self, url, mode="rb", block_size=None, cache_options=None, **kwargs): - """Make a file-like object - - Parameters - ---------- - url: str - Full URL with protocol - mode: string - must be "rb" - block_size: int or None - Bytes to download in one request; use instance value if None. If - zero, will return a streaming Requests file-like instance. - kwargs: key-value - Any other parameters, passed to requests calls - """ - if mode != "rb": - raise NotImplementedError - block_size = block_size if block_size is not None else self.block_size - kw = self.kwargs.copy() - kw.update(kwargs) - kw.pop("autocommit", None) - if block_size: - return HTTPFile( - self, url, self.session, block_size, cache_options=cache_options, **kw - ) - else: - kw["stream"] = True - r = self.session.get(url, **kw) - r.raise_for_status() - r.raw.decode_content = True - return r.raw - - def ukey(self, url): - """Unique identifier; assume HTTP files are static, unchanging""" - return tokenize(url, self.kwargs, self.protocol) - - def info(self, url, **kwargs): - """Get info of URL - - Tries to access location via HEAD, and then GET methods, but does - not fetch the data. - - It is possible that the server does not supply any size information, in - which case size will be given as None (and certain operations on the - corresponding file will not work). 
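As an aside, the HEAD/GET size probe described above can be sketched in a few lines of plain requests; probe_size below is an illustrative helper name, not part of fsspec, and simply mirrors the file_size() function removed further down in this hunk:

import requests

def probe_size(url, session=None):
    # Ask for headers only; 'identity' disables compression so that
    # Content-Length reflects the true byte size of the target file.
    session = session or requests.Session()
    r = session.head(url, allow_redirects=True,
                     headers={"Accept-Encoding": "identity"})
    if "Content-Length" in r.headers:
        return int(r.headers["Content-Length"])
    return None  # no size supplied; partial reads will then not be possible
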
- """ - size = False - for policy in ["head", "get"]: - try: - size = file_size(url, self.session, policy, **self.kwargs) - if size: - break - except Exception: - pass - else: - # get failed, so conclude URL does not exist - if size is False: - raise FileNotFoundError(url) - return {"name": url, "size": size or None, "type": "file"} - - -class HTTPFile(AbstractBufferedFile): - """ - A file-like object pointing to a remove HTTP(S) resource - - Supports only reading, with read-ahead of a predermined block-size. - - In the case that the server does not supply the filesize, only reading of - the complete file in one go is supported. - - Parameters - ---------- - url: str - Full URL of the remote resource, including the protocol - session: requests.Session or None - All calls will be made within this session, to avoid restarting - connections where the server allows this - block_size: int or None - The amount of read-ahead to do, in bytes. Default is 5MB, or the value - configured for the FileSystem creating this file - size: None or int - If given, this is the size of the file in bytes, and we don't attempt - to call the server to find the value. - kwargs: all other key-values are passed to requests calls. - """ - - def __init__( - self, - fs, - url, - session=None, - block_size=None, - mode="rb", - cache_type="bytes", - cache_options=None, - size=None, - **kwargs - ): - if mode != "rb": - raise NotImplementedError("File mode not supported") - self.url = url - self.session = session if session is not None else requests.Session() - if size is not None: - self.details = {"name": url, "size": size, "type": "file"} - super().__init__( - fs=fs, - path=url, - mode=mode, - block_size=block_size, - cache_type=cache_type, - cache_options=cache_options, - **kwargs - ) - self.cache.size = self.size or self.blocksize - - def read(self, length=-1): - """Read bytes from file - - Parameters - ---------- - length: int - Read up to this many bytes. If negative, read all content to end of - file. If the server has not supplied the filesize, attempting to - read only part of the data will raise a ValueError. - """ - if ( - (length < 0 and self.loc == 0) - or (length > (self.size or length)) # explicit read all - or ( # read more than there is - self.size and self.size < self.blocksize - ) # all fits in one block anyway - ): - self._fetch_all() - if self.size is None: - if length < 0: - self._fetch_all() - else: - length = min(self.size - self.loc, length) - return super().read(length) - - def _fetch_all(self): - """Read whole file in one shot, without caching - - This is only called when position is still at zero, - and read() is called without a byte-count. - """ - if not isinstance(self.cache, AllBytes): - r = self.session.get(self.url, **self.kwargs) - r.raise_for_status() - out = r.content - self.cache = AllBytes(out) - self.size = len(out) - - def _fetch_range(self, start, end): - """Download a block of data - - The expectation is that the server returns only the requested bytes, - with HTTP code 206. If this is not the case, we first check the headers, - and then stream the output - if the data size is bigger than we - requested, an exception is raised. 
- """ - kwargs = self.kwargs.copy() - headers = kwargs.pop("headers", {}) - headers["Range"] = "bytes=%i-%i" % (start, end - 1) - r = self.session.get(self.url, headers=headers, stream=True, **kwargs) - if r.status_code == 416: - # range request outside file - return b"" - r.raise_for_status() - if r.status_code == 206: - # partial content, as expected - out = r.content - elif "Content-Length" in r.headers: - cl = int(r.headers["Content-Length"]) - if cl <= end - start: - # data size OK - out = r.content - else: - raise ValueError( - "Got more bytes (%i) than requested (%i)" % (cl, end - start) - ) - else: - cl = 0 - out = [] - for chunk in r.iter_content(chunk_size=2 ** 20): - # data size unknown, let's see if it goes too big - if chunk: - out.append(chunk) - cl += len(chunk) - if cl > end - start: - raise ValueError( - "Got more bytes so far (>%i) than requested (%i)" - % (cl, end - start) - ) - else: - break - out = b"".join(out) - return out - - -def file_size(url, session=None, size_policy="head", **kwargs): - """Call HEAD on the server to get file size - - Default operation is to explicitly allow redirects and use encoding - 'identity' (no compression) to get the true size of the target. - """ - kwargs = kwargs.copy() - ar = kwargs.pop("allow_redirects", True) - head = kwargs.get("headers", {}).copy() - head["Accept-Encoding"] = "identity" - session = session or requests.Session() - if size_policy == "head": - r = session.head(url, allow_redirects=ar, **kwargs) - elif size_policy == "get": - kwargs["stream"] = True - r = session.get(url, allow_redirects=ar, **kwargs) - else: - raise TypeError('size_policy must be "head" or "get", got %s' "" % size_policy) - if "Content-Length" in r.headers: - return int(r.headers["Content-Length"]) - elif "Content-Range" in r.headers: - return int(r.headers["Content-Range"].split("/")[1]) - - -class AllBytes(object): - """Cache entire contents of a remote URL""" - - def __init__(self, data): - self.data = data - - def _fetch(self, start, end): - return self.data[start:end] diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/local.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/local.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/local.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/local.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,233 +0,0 @@ -import io -import os -import shutil -import posixpath -import re -import tempfile -from fsspec import AbstractFileSystem -from fsspec.utils import stringify_path - - -class LocalFileSystem(AbstractFileSystem): - """Interface to files on local storage - - Parameters - ---------- - auto_mkdirs: bool - Whether, when opening a file, the directory containing it should - be created (if it doesn't already exist). This is assumed by pyarrow - code. 
- """ - - root_marker = "/" - - def __init__(self, auto_mkdir=True, **kwargs): - super().__init__(**kwargs) - self.auto_mkdir = auto_mkdir - - def mkdir(self, path, create_parents=True, **kwargs): - path = self._strip_protocol(path) - if create_parents: - self.makedirs(path, exist_ok=True) - else: - os.mkdir(path, **kwargs) - - def makedirs(self, path, exist_ok=False): - path = self._strip_protocol(path) - os.makedirs(path, exist_ok=exist_ok) - - def rmdir(self, path): - os.rmdir(path) - - def ls(self, path, detail=False): - path = self._strip_protocol(path) - paths = [posixpath.join(path, f) for f in os.listdir(path)] - if detail: - return [self.info(f) for f in paths] - else: - return paths - - def glob(self, path, **kargs): - path = self._strip_protocol(path) - return super().glob(path) - - def info(self, path, **kwargs): - path = self._strip_protocol(path) - out = os.stat(path, follow_symlinks=False) - dest = False - if os.path.islink(path): - t = "link" - dest = os.readlink(path) - elif os.path.isdir(path): - t = "directory" - elif os.path.isfile(path): - t = "file" - else: - t = "other" - result = {"name": path, "size": out.st_size, "type": t, "created": out.st_ctime} - for field in ["mode", "uid", "gid", "mtime"]: - result[field] = getattr(out, "st_" + field) - if dest: - result["destination"] = dest - try: - out2 = os.stat(path, follow_symlinks=True) - result["size"] = out2.st_size - except IOError: - result["size"] = 0 - return result - - def copy(self, path1, path2, **kwargs): - shutil.copyfile(path1, path2) - - def get(self, path1, path2, **kwargs): - if kwargs.get("recursive"): - return super(LocalFileSystem, self).get(path1, path2, **kwargs) - else: - return self.copy(path1, path2, **kwargs) - - def put(self, path1, path2, **kwargs): - if kwargs.get("recursive"): - return super(LocalFileSystem, self).put(path1, path2, **kwargs) - else: - return self.copy(path1, path2, **kwargs) - - def mv(self, path1, path2, **kwargs): - os.rename(path1, path2) - - def rm(self, path, recursive=False, maxdepth=None): - if recursive and self.isdir(path): - shutil.rmtree(path) - else: - os.remove(path) - - def _open(self, path, mode="rb", block_size=None, **kwargs): - path = self._strip_protocol(path) - if self.auto_mkdir: - self.makedirs(self._parent(path), exist_ok=True) - return LocalFileOpener(path, mode, fs=self, **kwargs) - - def touch(self, path, **kwargs): - path = self._strip_protocol(path) - if self.exists(path): - os.utime(path, None) - else: - open(path, "a").close() - - @classmethod - def _parent(cls, path): - path = cls._strip_protocol(path).rstrip("/") - if "/" in path: - return path.rsplit("/", 1)[0] - else: - return cls.root_marker - - @classmethod - def _strip_protocol(cls, path): - path = stringify_path(path) - if path.startswith("file://"): - path = path[7:] - return make_path_posix(path) - - -def make_path_posix(path, sep=os.sep): - """ Make path generic """ - if re.match("/[A-Za-z]:", path): - # for windows file URI like "file:///C:/folder/file" - # or "file:///C:\\dir\\file" - path = path[1:] - if path.startswith("\\\\"): - # special case for windows UNC/DFS-style paths, do nothing, - # jsut flip the slashes around (case below does not work!) 
- return path.replace("\\", "/") - if path.startswith("\\") or re.match("[\\\\]*[A-Za-z]:", path): - # windows full path "\\server\\path" or "C:\\local\\path" - return path.lstrip("\\").replace("\\", "/").replace("//", "/") - if ( - sep not in path - and "/" not in path - or (sep == "/" and not path.startswith("/")) - or (sep == "\\" and ":" not in path) - ): - # relative path like "path" or "rel\\path" (win) or rel/path" - path = os.path.abspath(path) - if os.sep == "\\": - # abspath made some more '\\' separators - return make_path_posix(path, sep) - return path - - -class LocalFileOpener(object): - def __init__(self, path, mode, autocommit=True, fs=None, **kwargs): - self.path = path - self.mode = mode - self.fs = fs - self.f = None - self.autocommit = autocommit - self.blocksize = io.DEFAULT_BUFFER_SIZE - self._open() - - def _open(self): - if self.f is None or self.f.closed: - if self.autocommit or "w" not in self.mode: - self.f = open(self.path, mode=self.mode) - else: - # TODO: check if path is writable? - i, name = tempfile.mkstemp() - self.temp = name - self.f = open(name, mode=self.mode) - if "w" not in self.mode: - self.details = self.fs.info(self.path) - self.size = self.details["size"] - self.f.size = self.size - - def _fetch_range(self, start, end): - # probably only used by cached FS - if "r" not in self.mode: - raise ValueError - self._open() - self.f.seek(start) - return self.f.read(end - start) - - def __setstate__(self, state): - if "r" in state["mode"]: - loc = self.state.pop("loc") - self._open() - self.f.seek(loc) - else: - self.f = None - self.__dict__.update(state) - - def __getstate__(self): - d = self.__dict__.copy() - d.pop("f") - if "r" in self.mode: - d["loc"] = self.f.tell() - else: - if not self.f.closed: - raise ValueError("Cannot serialise open write-mode local file") - return d - - def commit(self): - if self.autocommit: - raise RuntimeError("Can only commit if not already set to autocommit") - os.rename(self.temp, self.path) - - def discard(self): - if self.autocommit: - raise RuntimeError("Cannot discard if set to autocommit") - os.remove(self.temp) - - def __fspath__(self): - # uniquely for fsspec implementations, this is a real path - return self.path - - def __getattr__(self, item): - return getattr(self.f, item) - - def __enter__(self): - self._incontext = True - return self.f.__enter__() - - def __exit__(self, exc_type, exc_value, traceback): - self._incontext = False - self.f.__exit__(exc_type, exc_value, traceback) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/memory.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/memory.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/memory.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/memory.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,169 +0,0 @@ -from __future__ import print_function, division, absolute_import - -from io import BytesIO -from fsspec import AbstractFileSystem -import logging - -logger = logging.Logger("fsspec.memoryfs") - - -class MemoryFileSystem(AbstractFileSystem): - """A filesystem based on a dict of BytesIO objects""" - - store = {} # global - pseudo_dirs = [] - protocol = "memory" - root_marker = "" - - def ls(self, path, detail=False): - if path in self.store: - # there is a key with this exact name, but could also be directory - 
out = [ - { - "name": path, - "size": self.store[path].getbuffer().nbytes, - "type": "file", - } - ] - else: - out = [] - path = path.strip("/").lstrip("/") - paths = set() - for p2 in self.store: - has_slash = "/" if p2.startswith("/") else "" - p = p2.lstrip("/") - if "/" in p: - root = p.rsplit("/", 1)[0] - else: - root = "" - if root == path: - out.append( - { - "name": has_slash + p, - "size": self.store[p2].getbuffer().nbytes, - "type": "file", - } - ) - elif path and all( - (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) - ): - # implicit directory - ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) - if ppath not in paths: - out.append( - { - "name": has_slash + ppath + "/", - "size": 0, - "type": "directory", - } - ) - paths.add(ppath) - elif all( - (a == b) - for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) - ): - # root directory entry - ppath = p.rstrip("/").split("/", 1)[0] - if ppath not in paths: - out.append( - { - "name": has_slash + ppath + "/", - "size": 0, - "type": "directory", - } - ) - paths.add(ppath) - for p2 in self.pseudo_dirs: - if self._parent(p2).strip("/").rstrip("/") == path: - out.append({"name": p2 + "/", "size": 0, "type": "directory"}) - if detail: - return out - return sorted([f["name"] for f in out]) - - def mkdir(self, path): - path = path.rstrip("/") - if path not in self.pseudo_dirs: - self.pseudo_dirs.append(path) - - def rmdir(self, path): - path = path.rstrip("/") - if path in self.pseudo_dirs: - if self.ls(path) == []: - self.pseudo_dirs.remove(path) - else: - raise OSError("Directory %s not empty" % path) - else: - raise FileNotFoundError(path) - - def exists(self, path): - return path in self.store - - def _open(self, path, mode="rb", **kwargs): - """Make a file-like object - - Parameters - ---------- - path: str - identifier - mode: str - normally "rb", "wb" or "ab" - """ - if mode in ["rb", "ab", "rb+"]: - if path in self.store: - f = self.store[path] - if mode == "rb": - f.seek(0) - else: - f.seek(0, 2) - return f - else: - raise FileNotFoundError(path) - if mode == "wb": - m = MemoryFile(self, path) - if not self._intrans: - m.commit() - return m - - def copy(self, path1, path2, **kwargs): - self.store[path2] = MemoryFile(self, path2, self.store[path1].getbuffer()) - - def cat(self, path): - return self.store[path].getvalue() - - def _rm(self, path): - del self.store[path] - - def size(self, path): - """Size in bytes of the file at path""" - if path not in self.store: - raise FileNotFoundError(path) - return self.store[path].getbuffer().nbytes - - -class MemoryFile(BytesIO): - """A BytesIO which can't close and works as a context manager - - Can initialise with data - - No need to provide fs, path if auto-committing (default) - """ - - def __init__(self, fs, path, data=None): - self.fs = fs - self.path = path - if data: - self.write(data) - self.size = len(data) - self.seek(0) - - def __enter__(self): - return self - - def close(self): - self.size = self.seek(0, 2) - - def discard(self): - pass - - def commit(self): - self.fs.store[self.path] = self diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/sftp.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/sftp.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/sftp.py 2019-11-13 16:37:40.000000000 +0000 +++ 
fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/sftp.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,139 +0,0 @@ -import paramiko -from stat import S_ISDIR, S_ISLNK -import types -import uuid -from .. import AbstractFileSystem -from ..utils import infer_storage_options - - -class SFTPFileSystem(AbstractFileSystem): - """Files over SFTP/SSH - - Peer-to-peer filesystem over SSH using paramiko. - """ - - protocol = "sftp", "ssh" - - def __init__(self, host, **ssh_kwargs): - """ - - Parameters - ---------- - host: str - Hostname or IP as a string - temppath: str - Location on the server to put files, when within a transaction - ssh_kwargs: dict - Parameters passed on to connection. See details in - http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect - May include port, username, password... - """ - if self._cached: - return - super(SFTPFileSystem, self).__init__(**ssh_kwargs) - self.temppath = ssh_kwargs.pop("temppath", "/tmp") - self.host = host - self.ssh_kwargs = ssh_kwargs - self._connect() - - def _connect(self): - self.client = paramiko.SSHClient() - self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - self.client.connect(self.host, **self.ssh_kwargs) - self.ftp = self.client.open_sftp() - - @classmethod - def _strip_protocol(cls, path): - return infer_storage_options(path)["path"] - - @staticmethod - def _get_kwargs_from_urls(urlpath): - out = infer_storage_options(urlpath) - out.pop("path", None) - out.pop("protocol", None) - return out - - def mkdir(self, path, mode=511): - self.ftp.mkdir(path, mode) - - def makedirs(self, path, exist_ok=False, mode=511): - if self.exists(path) and not exist_ok: - raise FileExistsError("File exists: {}".format(path)) - - parts = path.split("/") - path = "" - - for part in parts: - path += "/" + part - if not self.exists(path): - self.mkdir(path, mode) - - def rmdir(self, path): - self.ftp.rmdir(path) - - def info(self, path): - s = self.ftp.stat(path) - if S_ISDIR(s.st_mode): - t = "directory" - elif S_ISLNK(s.st_mode): - t = "link" - else: - t = "file" - return { - "name": path + "/" if t == "directory" else path, - "size": s.st_size, - "type": t, - "uid": s.st_uid, - "gui": s.st_gid, - "time": s.st_atime, - "mtime": s.st_mtime, - } - - def ls(self, path, detail=False): - out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)] - out = [self.info(o) for o in out] - if detail: - return out - return sorted([p["name"] for p in out]) - - def put(self, lpath, rpath): - self.ftp.put(lpath, rpath) - - def get(self, rpath, lpath): - self.ftp.get(rpath, lpath) - - def _open(self, path, mode="rb", block_size=None, **kwargs): - """ - block_size: int or None - If 0, no buffering, if 1, line buffering, if >1, buffer that many - bytes, if None use default from paramiko. 
- """ - if kwargs.get("autocommit", True) is False: - # writes to temporary file, move on commit - path2 = "{}/{}".format(self.temppath, uuid.uuid4()) - f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1) - f.temppath = path2 - f.targetpath = path - f.fs = self - f.commit = types.MethodType(commit_a_file, f) - f.discard = types.MethodType(discard_a_file, f) - else: - f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1) - return f - - def _rm(self, path): - if self.isdir(path): - self.ftp.rmdir(path) - else: - self.ftp.remove(path) - - def mv(self, old, new): - self.ftp.posix_rename(old, new) - - -def commit_a_file(self): - self.fs.mv(self.temppath, self.targetpath) - - -def discard_a_file(self): - self.fs._rm(self.temppath) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/webhdfs.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/webhdfs.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/webhdfs.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/webhdfs.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,384 +0,0 @@ -# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html - -import requests -from urllib.parse import quote -import uuid -from ..spec import AbstractFileSystem, AbstractBufferedFile -from ..utils import infer_storage_options -import logging - -logger = logging.getLogger("webhdfs") - - -class WebHDFS(AbstractFileSystem): - """ - Interface to HDFS over HTTP - - Three auth mechanisms are supported: - - insecure: no auth is done, and the user is assumed to be whoever they - say they are (parameter `user`), or a predefined value such as - "dr.who" if not given - spnego: when kerberos authentication is enabled, auth is negotiated by - requests_kerberos https://github.com/requests/requests-kerberos . - This establishes a session based on existing kinit login and/or - specified principal/password; paraneters are passed with ``kerb_kwargs`` - token: uses an existing Hadoop delegation token from another secured - service. Indeed, this client can also generate such tokens when - not insecure. Note that tokens expire, but can be renewed (by a - previously specified user) and may allow for proxying. - - """ - - tempdir = "/tmp" - protocol = "webhdfs", "webHDFS" - - def __init__( - self, - host, - port=50070, - kerberos=False, - token=None, - user=None, - proxy_to=None, - kerb_kwargs=None, - data_proxy=None, - **kwargs - ): - """ - Parameters - ---------- - host: str - Name-node address - port: int - Port for webHDFS - kerberos: bool - Whether to authenticate with kerberos for this connection - token: str or None - If given, use this token on every call to authenticate. A user - and user-proxy may be encoded in the token and should not be also - given - user: str or None - If given, assert the user name to connect with - proxy_to: str or None - If given, the user has the authority to proxy, and this value is - the user in who's name actions are taken - kerb_kwargs: dict - Any extra arguments for HTTPKerberosAuth, see - https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py - data_proxy: dict, callable or None - If given, map data-node addresses. 
This can be necessary if the - HDFS cluster is behind a proxy, running on Docker or otherwise has - a mismatch between the host-names given by the name-node and the - address by which to refer to them from the client. If a dict, - maps host names `host->data_proxy[host]`; if a callable, full - URLs are passed, and function must conform to - `url->data_proxy(url)`. - kwargs - """ - if self._cached: - return - super().__init__(**kwargs) - self.url = "http://{host}:{port}/webhdfs/v1".format(host=host, port=port) - self.kerb = kerberos - self.kerb_kwargs = kerb_kwargs or {} - self.pars = {} - self.proxy = data_proxy or {} - if token is not None: - if user is not None or proxy_to is not None: - raise ValueError( - "If passing a delegation token, must not set " - "user or proxy_to, as these are encoded in the" - " token" - ) - self.pars["delegation"] = token - if user is not None: - self.pars["user.name"] = user - if proxy_to is not None: - self.pars["doas"] = proxy_to - if kerberos and user is not None: - raise ValueError( - "If using Kerberos auth, do not specify the " - "user, this is handled by kinit." - ) - self._connect() - - def _connect(self): - self.session = requests.Session() - if self.kerb: - from requests_kerberos import HTTPKerberosAuth - - self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs) - - def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs): - url = self.url + quote(path or "") - args = kwargs.copy() - args.update(self.pars) - args["op"] = op.upper() - logger.debug(url, method, args) - out = self.session.request( - method=method.upper(), - url=url, - params=args, - data=data, - allow_redirects=redirect, - ) - if out.status_code == 404: - raise FileNotFoundError(path) - if out.status_code == 403: - raise PermissionError(path or "") - if out.status_code == 401: - raise PermissionError # not specific to path - out.raise_for_status() - return out - - def _open( - self, - path, - mode="rb", - block_size=None, - autocommit=True, - replication=None, - permissions=None, - **kwargs - ): - """ - - Parameters - ---------- - path: str - File location - mode: str - 'rb', 'wb', etc. 
- block_size: int - Client buffer size for read-ahead or write buffer - autocommit: bool - If False, writes to temporary file that only gets put in final - location upon commit - replication: int - Number of copies of file on the cluster, write mode only - permissions: str or int - posix permissions, write mode only - kwargs - - Returns - ------- - WebHDFile instance - """ - block_size = block_size or self.blocksize - return WebHDFile( - self, - path, - mode=mode, - block_size=block_size, - tempdir=self.tempdir, - autocommit=autocommit, - replication=replication, - permissions=permissions, - ) - - @staticmethod - def _process_info(info): - info["type"] = info["type"].lower() - info["size"] = info["length"] - return info - - @classmethod - def _strip_protocol(cls, path): - return infer_storage_options(path)["path"] - - @staticmethod - def _get_kwargs_from_urls(urlpath): - out = infer_storage_options(urlpath) - out.pop("path", None) - out.pop("protocol", None) - if "username" in out: - out["user"] = out.pop("username") - return out - - def info(self, path): - out = self._call("GETFILESTATUS", path=path) - info = out.json()["FileStatus"] - info["name"] = path - return self._process_info(info) - - def ls(self, path, detail=False): - out = self._call("LISTSTATUS", path=path) - infos = out.json()["FileStatuses"]["FileStatus"] - for info in infos: - self._process_info(info) - info["name"] = path.rstrip("/") + "/" + info["pathSuffix"] - if detail: - return sorted(infos, key=lambda i: i["name"]) - else: - return sorted(info["name"] for info in infos) - - def content_summary(self, path): - """Total numbers of files, directories and bytes under path""" - out = self._call("GETCONTENTSUMMARY", path=path) - return out.json()["ContentSummary"] - - def ukey(self, path): - """Checksum info of file, giving method and result""" - out = self._call("GETFILECHECKSUM", path=path, redirect=False) - location = self._apply_proxy(out.headers["Location"]) - out2 = self.session.get(location) - out2.raise_for_status() - return out2.json()["FileChecksum"] - - def home_directory(self): - """Get user's home directory""" - out = self._call("GETHOMEDIRECTORY") - return out.json()["Path"] - - def get_delegation_token(self, renewer=None): - """Retrieve token which can give the same authority to other uses - - Parameters - ---------- - renewer: str or None - User who may use this token; if None, will be current user - """ - if renewer: - out = self._call("GETDELEGATIONTOKEN", renewer=renewer) - else: - out = self._call("GETDELEGATIONTOKEN") - t = out.json()["Token"] - if t is None: - raise ValueError("No token available for this user/security context") - return t["urlString"] - - def renew_delegation_token(self, token): - """Make token live longer. 
Returns new expiry time""" - out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token) - return out.json()["long"] - - def cancel_delegation_token(self, token): - """Stop the token from being useful""" - self._call("CANCELDELEGATIONTOKEN", method="put", token=token) - - def chmod(self, path, mod): - """Set the permission at path - - Parameters - ---------- - path: str - location to set (file or directory) - mod: str or int - posix epresentation or permission, give as oct string, e.g, '777' - or 0o777 - """ - self._call("SETPERMISSION", method="put", path=path, permission=mod) - - def chown(self, path, owner=None, group=None): - """Change owning user and/or group""" - kwargs = {} - if owner is not None: - kwargs["owner"] = owner - if group is not None: - kwargs["group"] = group - self._call("SETOWNER", method="put", path=path, **kwargs) - - def set_replication(self, path, replication): - """ - Set file replication factor - - Parameters - ---------- - path: str - File location (not for directories) - replication: int - Number of copies of file on the cluster. Should be smaller than - number of data nodes; normally 3 on most systems. - """ - self._call("SETREPLICATION", path=path, method="put", replication=replication) - - def mkdir(self, path, **kwargs): - self._call("MKDIRS", method="put", path=path) - - def makedirs(self, path, exist_ok=False): - if exist_ok is False and self.exists(path): - raise FileExistsError(path) - self.mkdir(path) - - def mv(self, path1, path2, **kwargs): - self._call("RENAME", method="put", path=path1, destination=path2) - - def rm(self, path, recursive=False, **kwargs): - self._call( - "DELETE", - method="delete", - path=path, - recursive="true" if recursive else "false", - ) - - def _apply_proxy(self, location): - if self.proxy and callable(self.proxy): - location = self.proxy(location) - elif self.proxy: - # as a dict - for k, v in self.proxy.items(): - location = location.replace(k, v, 1) - return location - - -class WebHDFile(AbstractBufferedFile): - """A file living in HDFS over webHDFS""" - - def __init__(self, fs, path, **kwargs): - super().__init__(fs, path, **kwargs) - kwargs = kwargs.copy() - if kwargs.get("permissions", None) is None: - kwargs.pop("permissions", None) - if kwargs.get("replication", None) is None: - kwargs.pop("replication", None) - self.permissions = kwargs.pop("permissions", 511) - tempdir = kwargs.pop("tempdir") - if kwargs.pop("autocommit", False) is False: - self.target = self.path - self.path = "/".join([tempdir, str(uuid.uuid4())]) - - def _upload_chunk(self, final=False): - """ Write one part of a multi-block file upload - - Parameters - ========== - final: bool - This is the last block, so should complete file, if - self.autocommit is True. 
- """ - out = self.fs.session.post(self.location, data=self.buffer.getvalue()) - out.raise_for_status() - return True - - def _initiate_upload(self): - """ Create remote file/upload """ - if "a" in self.mode: - op, method = "APPEND", "POST" - else: - op, method = "CREATE", "PUT" - if self.fs.exists(self.path): - # no "truncate" or "create empty" - self.fs.rm(self.path) - out = self.fs._call(op, method, self.path, redirect=False, **self.kwargs) - location = self.fs._apply_proxy(out.headers["Location"]) - if "w" in self.mode: - # create empty file to append to - out2 = self.fs.session.put(location) - out2.raise_for_status() - self.location = location.replace("CREATE", "APPEND") - - def _fetch_range(self, start, end): - out = self.fs._call( - "OPEN", path=self.path, offset=start, length=end - start, redirect=False - ) - out.raise_for_status() - location = out.headers["Location"] - out2 = self.fs.session.get(self.fs._apply_proxy(location)) - return out2.content - - def commit(self): - self.fs.mv(self.path, self.target) - - def discard(self): - self.fs.rm(self.path) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/zip.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/zip.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/zip.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/implementations/zip.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import zipfile -from fsspec import AbstractFileSystem, open_files -from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE - - -class ZipFileSystem(AbstractFileSystem): - """Read contents of ZIP archive as a file-system - - Keeps file object open while instance lives. - - This class is pickleable, but not necessarily thread-safe - """ - - root_marker = "" - - def __init__(self, fo="", mode="r", **storage_options): - """ - Parameters - ---------- - fo: str or file-like - Contains ZIP, and must exist. If a str, will fetch file using - `open_files()`, which must return one file exactly. 
- mode: str - Currently, only 'r' accepted - storage_options: key-value - May be credentials, e.g., `{'auth': ('username', 'pword')}` or any - other parameters for requests - """ - if self._cached: - return - AbstractFileSystem.__init__(self) - if mode != "r": - raise ValueError("Only read from zip files accepted") - self.in_fo = fo - if isinstance(fo, str): - files = open_files(fo) - if len(files) != 1: - raise ValueError( - 'Path "{}" did not resolve to exactly' - 'one file: "{}"'.format(fo, files) - ) - fo = files[0] - self.fo = fo.__enter__() # the whole instance is a context - self.zip = zipfile.ZipFile(self.fo) - self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE) - self.dir_cache = None - - @classmethod - def _strip_protocol(cls, path): - # zip file paths are always relative to the archive root - return super()._strip_protocol(path).lstrip("/") - - def _get_dirs(self): - if self.dir_cache is None: - files = self.zip.infolist() - self.dir_cache = {} - for z in files: - f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} - f.update( - { - "name": z.filename, - "size": z.file_size, - "type": ("directory" if z.is_dir() else "file"), - } - ) - self.dir_cache[f["name"]] = f - - def ls(self, path, detail=False): - self._get_dirs() - paths = {} - for p, f in self.dir_cache.items(): - p = p.rstrip("/") - if "/" in p: - root = p.rsplit("/", 1)[0] - else: - root = "" - if root == path.rstrip("/"): - paths[p] = f - elif path and all( - (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) - ): - # implicit directory - ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) - if ppath not in paths: - out = {"name": ppath + "/", "size": 0, "type": "directory"} - paths[ppath] = out - - elif all( - (a == b) - for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) - ): - # root directory entry - ppath = p.rstrip("/").split("/", 1)[0] - if ppath not in paths: - out = {"name": ppath + "/", "size": 0, "type": "directory"} - paths[ppath] = out - out = list(paths.values()) - if detail: - return out - else: - return list(sorted(f["name"] for f in out)) - - def cat(self, path): - return self.zip.read(path) - - def _open(self, path, mode="rb", **kwargs): - path = self._strip_protocol(path) - if mode != "rb": - raise NotImplementedError - info = self.info(path) - out = self.zip.open(path, "r") - out.size = info["size"] - out.name = info["name"] - return out - - def ukey(self, path): - return tokenize(path, self.in_fo, self.protocol) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/__init__.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/__init__.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/__init__.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -from ._version import get_versions - -from .spec import AbstractFileSystem -from .registry import get_filesystem_class, registry, filesystem -from .mapping import FSMap, get_mapper -from .core import open_files, get_fs_token_paths, open -from . 
import caching - -__version__ = get_versions()["version"] -del get_versions - - -__all__ = [ - "AbstractFileSystem", - "FSMap", - "filesystem", - "get_filesystem_class", - "get_fs_token_paths", - "get_mapper", - "open", - "open_files", - "registry", - "caching", -] diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/mapping.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/mapping.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/mapping.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/mapping.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,152 +0,0 @@ -from collections.abc import MutableMapping -from .registry import get_filesystem_class -from .core import split_protocol - - -class FSMap(MutableMapping): - """Wrap a FileSystem instance as a mutable wrapping. - - The keys of the mapping become files under the given root, and the - values (which must be bytes) the contents of those files. - - Parameters - ---------- - root: string - prefix for all the files - fs: FileSystem instance - check: bool (=True) - performs a touch at the location, to check for write access. - - Examples - -------- - >>> fs = FileSystem(**parameters) # doctest: +SKIP - >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP - or, more likely - >>> d = fs.get_mapper('my-data/path/') - - >>> d['loc1'] = b'Hello World' # doctest: +SKIP - >>> list(d.keys()) # doctest: +SKIP - ['loc1'] - >>> d['loc1'] # doctest: +SKIP - b'Hello World' - """ - - def __init__(self, root, fs, check=False, create=False): - self.fs = fs - self.root = fs._strip_protocol(root).rstrip( - "/" - ) # we join on '/' in _key_to_str - if create: - if not self.fs.exists(root): - self.fs.mkdir(root) - if check: - if not self.fs.exists(root): - raise ValueError( - "Path %s does not exist. 
Create " - " with the ``create=True`` keyword" % root - ) - self.fs.touch(root + "/a") - self.fs.rm(root + "/a") - - def clear(self): - """Remove all keys below root - empties out mapping - """ - try: - self.fs.rm(self.root, True) - self.fs.mkdir(self.root) - except: # noqa: E722 - pass - - def _key_to_str(self, key): - """Generate full path for the key""" - if isinstance(key, (tuple, list)): - key = str(tuple(key)) - else: - key = str(key) - return "/".join([self.root, key]) if self.root else key - - def _str_to_key(self, s): - """Strip path of to leave key name""" - return s[len(self.root) :].lstrip("/") - - def __getitem__(self, key, default=None): - """Retrieve data""" - key = self._key_to_str(key) - try: - result = self.fs.cat(key) - except: # noqa: E722 - if default is not None: - return default - raise KeyError(key) - return result - - def pop(self, key, default=None): - result = self.__getitem__(key, default) - try: - del self[key] - except KeyError: - pass - return result - - def __setitem__(self, key, value): - """Store value in key""" - key = self._key_to_str(key) - self.fs.mkdirs(self.fs._parent(key), exist_ok=True) - with self.fs.open(key, "wb") as f: - f.write(value) - - def __iter__(self): - return (self._str_to_key(x) for x in self.fs.find(self.root)) - - def __len__(self): - return len(self.fs.find(self.root)) - - def __delitem__(self, key): - """Remove key""" - try: - self.fs.rm(self._key_to_str(key)) - except: # noqa: E722 - raise KeyError - - def __contains__(self, key): - """Does key exist in mapping?""" - return self.fs.exists(self._key_to_str(key)) - - def __getstate__(self): - """Mapping should be pickleable""" - # TODO: replace with reduce to reinstantiate? - return self.fs, self.root - - def __setstate__(self, state): - fs, root = state - self.fs = fs - self.root = root - - -def get_mapper(url, check=False, create=False, **kwargs): - """Create key-value interface for given URL and options - - The URL will be of the form "protocol://location" and point to the root - of the mapper required. All keys will be file-names below this location, - and their values the contents of each key. - - Parameters - ---------- - url: str - Root URL of mapping - check: bool - Whether to attempt to read from the location before instantiation, to - check that the mapping does exist - create: bool - Whether to make the directory corresponding to the root before - instantiating - - Returns - ------- - ``FSMap`` instance, the dict-like key-value store. - """ - protocol, path = split_protocol(url) - cls = get_filesystem_class(protocol) - fs = cls(**kwargs) - # Removing protocol here - could defer to each open() on the backend - return FSMap(url, fs, check, create) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/registry.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/registry.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/registry.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/registry.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,116 +0,0 @@ -import importlib -from distutils.version import LooseVersion - -__all__ = ["registry", "get_filesystem_class", "default"] - -# mapping protocol: implementation class object -registry = {} -default = "file" - -# protocols mapped to the class which implements them. This dict can -# be dynamically updated. 
-known_implementations = { - "file": {"class": "fsspec.implementations.local.LocalFileSystem"}, - "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"}, - "http": { - "class": "fsspec.implementations.http.HTTPFileSystem", - "err": 'HTTPFileSystem requires "requests" to be installed', - }, - "https": { - "class": "fsspec.implementations.http.HTTPFileSystem", - "err": 'HTTPFileSystem requires "requests" to be installed', - }, - "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"}, - "gcs": { - "class": "gcsfs.GCSFileSystem", - "err": "Please install gcsfs to access Google Storage", - }, - "gs": { - "class": "gcsfs.GCSFileSystem", - "err": "Please install gcsfs to access Google Storage", - }, - "sftp": { - "class": "fsspec.implementations.sftp.SFTPFileSystem", - "err": 'SFTPFileSystem requires "paramiko" to be installed', - }, - "ssh": { - "class": "fsspec.implementations.sftp.SFTPFileSystem", - "err": 'SFTPFileSystem requires "paramiko" to be installed', - }, - "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"}, - "hdfs": { - "class": "fsspec.implementations.hdfs.PyArrowHDFS", - "err": "pyarrow and local java libraries required for HDFS", - }, - "webhdfs": { - "class": "fsspec.implementations.webhdfs.WebHDFS", - "err": 'webHDFS access requires "requests" to be installed', - }, - "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, - "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"}, - "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"}, - "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"}, - "dask": { - "class": "fsspec.implementations.dask.DaskWorkerFileSystem", - "err": "Install dask distributed to access worker file system", - }, -} - -minversions = {"s3fs": LooseVersion("0.3.0"), "gcsfs": LooseVersion("0.3.0")} - - -def get_filesystem_class(protocol): - """Fetch named protocol implementation from the registry - - The dict ``known_implementations`` maps protocol names to the locations - of classes implementing the corresponding file-system. When used for the - first time, appropriate imports will happen and the class will be placed in - the registry. All subsequent calls will fetch directly from the registry. - - Some protocol implementations require additional dependencies, and so the - import may fail. In this case, the string in the "err" field of the - ``known_implementations`` will be given as the error message. 
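As a usage illustration of the registry mechanism described above (a sketch, assuming only that the bundled in-memory backend is importable), the first request for a protocol triggers the import and caches the class, and fsspec.filesystem() wraps that lookup:

import fsspec

# First use of "memory" imports MemoryFileSystem and places it in `registry`;
# later calls for the same protocol are served straight from the registry.
fs = fsspec.filesystem("memory")
with fs.open("/demo/hello.txt", "wb") as f:
    f.write(b"hello fsspec")
print(fs.cat("/demo/hello.txt"))  # b'hello fsspec'
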
- """ - if protocol is None: - protocol = default - - if protocol not in registry: - if protocol not in known_implementations: - raise ValueError("Protocol not known: %s" % protocol) - bit = known_implementations[protocol] - mod, name = bit["class"].rsplit(".", 1) - minversion = minversions.get(mod, None) - err = None - try: - mod = importlib.import_module(mod) - except ImportError: - err = ImportError(bit["err"]) - - except Exception as e: - err = e - if err is not None: - raise RuntimeError(str(err)) - - if minversion: - version = getattr(mod, "__version__", None) - if version and LooseVersion(version) < minversion: - raise RuntimeError( - "'{}={}' is installed, but version '{}' or " - "higher is required".format(mod.__name__, version, minversion) - ) - registry[protocol] = getattr(mod, name) - cls = registry[protocol] - if getattr(cls, "protocol", None) in ("abstract", None): - cls.protocol = protocol - - return cls - - -def filesystem(protocol, **storage_options): - """Instantiate filesystems for given protocol and arguments - - ``storage_options`` are specific to the protocol being chosen, and are - passed directly to the class. - """ - cls = get_filesystem_class(protocol) - return cls(**storage_options) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/spec.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/spec.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/spec.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/spec.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1246 +0,0 @@ -import warnings -from hashlib import md5 -import io -import os -import logging - -from .transaction import Transaction -from .utils import read_block, tokenize, stringify_path - -logger = logging.getLogger("fsspec") - - -def make_instance(cls, args, kwargs): - return cls(*args, **kwargs) - - -class _Cached(type): - """ - Metaclass for caching file system instances. - - Notes - ----- - Instances are cached according to - - * The values of the class attributes listed in `_extra_tokenize_attributes` - * The arguments passed to ``__init__``. - - This creates an additional reference to the filesystem, which prevents the - filesystem from being garbage collected when all *user* references go away. - A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* - be made for a filesystem instance to be garbage collected. - """ - - cachable = True - _extra_tokenize_attributes = () - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Note: we intentionally create a reference here, to avoid garbage - # collecting instances when all other references are gone. To really - # delete a FileSystem, the cache must be cleared. - self._cache = {} - - def __call__(self, *args, **kwargs): - cls = type(self) - extra_tokens = tuple( - getattr(self, attr, None) for attr in self._extra_tokenize_attributes - ) - token = tokenize(cls, *args, *extra_tokens, **kwargs) - if self.cachable and token in self._cache: - return self._cache[token] - else: - obj = super().__call__(*args, **kwargs) - # Setting _fs_token here causes some static linters to complain. 
- obj._fs_token_ = token - self.storage_args = args - self.storage_options = kwargs - - if self.cachable: - self._cache[token] = obj - return obj - - -try: # optionally derive from pyarrow's FileSystem, if available - import pyarrow as pa - - up = pa.filesystem.DaskFileSystem -except ImportError: - up = object - - -class AbstractFileSystem(up, metaclass=_Cached): - """ - An abstract super-class for pythonic file-systems - - Implementations are expected to be compatible with or, better, subclass - from here. - """ - - cachable = True # this class can be cached, instances reused - _cached = False - blocksize = 2 ** 22 - sep = "/" - protocol = "abstract" - root_marker = "" # For some FSs, may require leading '/' or other character - - #: Extra *class attributes* that should be considered when hashing. - _extra_tokenize_attributes = () - - def __init__(self, *args, **storage_options): - """Create and configure file-system instance - - Instances may be cachable, so if similar enough arguments are seen - a new instance is not required. The token attribute exists to allow - implementations to cache instances if they wish. - - A reasonable default should be provided if there are no arguments. - - Subclasses should call this method. - - Magic kwargs that affect functionality here: - add_docs: if True, will append docstrings from this spec to the - specific implementation - """ - if self._cached: - # reusing instance, don't change - return - self._cached = True - self._intrans = False - self._transaction = None - self.dircache = {} - - if storage_options.pop("add_docs", None): - warnings.warn("add_docs is no longer supported.", FutureWarning) - - if storage_options.pop("add_aliases", None): - warnings.warn("add_aliases has been removed.", FutureWarning) - # This is set in _Cached - self._fs_token_ = None - - @property - def _fs_token(self): - return self._fs_token_ - - def __dask_tokenize__(self): - return self._fs_token - - def __hash__(self): - return int(self._fs_token, 16) - - def __eq__(self, other): - return isinstance(other, type(self)) and self._fs_token == other._fs_token - - @classmethod - def _strip_protocol(cls, path): - """ Turn path from fully-qualified to file-system-specific - - May require FS-specific handling, e.g., for relative paths or links. - """ - path = stringify_path(path) - protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol - for protocol in protos: - path = path.rstrip("/") - if path.startswith(protocol + "://"): - path = path[len(protocol) + 3 :] - elif path.startswith(protocol + ":"): - path = path[len(protocol) + 1 :] - # use of root_marker to make minimum required path, e.g., "/" - return path or cls.root_marker - - @staticmethod - def _get_kwargs_from_urls(paths): - """If kwargs can be encoded in the paths, extract them here - - This should happen before instantiation of the class; incoming paths - then should be amended to strip the options in methods. - - Examples may look like an sftp path "sftp://user@host:/my/path", where - the user and host should become kwargs and later get stripped. 
- """ - # by default, nothing happens - return {} - - @classmethod - def current(cls): - """ Return the most recently created FileSystem - - If no instance has been created, then create one with defaults - """ - if not len(cls._cache): - return cls() - else: - return list(cls._cache.values())[-1] - - @property - def transaction(self): - """A context within which files are committed together upon exit - - Requires the file class to implement `.commit()` and `.discard()` - for the normal and exception cases. - """ - if self._transaction is None: - self._transaction = Transaction(self) - return self._transaction - - def start_transaction(self): - """Begin write transaction for deferring files, non-context version""" - self._intrans = True - self._transaction = Transaction(self) - return self.transaction - - def end_transaction(self): - """Finish write transaction, non-context version""" - self.transaction.complete() - self._transaction = None - - def invalidate_cache(self, path=None): - """ - Discard any cached directory information - - Parameters - ---------- - path: string or None - If None, clear all listings cached else listings at or under given - path. - """ - pass # not necessary to implement, may have no cache - - def mkdir(self, path, create_parents=True, **kwargs): - """ - Create directory entry at path - - For systems that don't have true directories, may create an for - this instance only and not touch the real filesystem - - Parameters - ---------- - path: str - location - create_parents: bool - if True, this is equivalent to ``makedirs`` - kwargs: - may be permissions, etc. - """ - pass # not necessary to implement, may not have directories - - def makedirs(self, path, exist_ok=False): - """Recursively make directories - - Creates directory at path and any intervening required directories. - Raises exception if, for instance, the path already exists but is a - file. - - Parameters - ---------- - path: str - leaf directory name - exist_ok: bool (False) - If True, will error if the target already exists - """ - pass # not necessary to implement, may not have directories - - def rmdir(self, path): - """Remove a directory, if empty""" - pass # not necessary to implement, may not have directories - - def ls(self, path, detail=True, **kwargs): - """List objects at path. - - This should include subdirectories and files at that location. The - difference between a file and a directory must be clear when details - are requested. - - The specific keys, or perhaps a FileInfo class, or similar, is TBD, - but must be consistent across implementations. - Must include: - - full path to the entry (without protocol) - - size of the entry, in bytes. If the value cannot be determined, will - be ``None``. - - type of entry, "file", "directory" or other - - Additional information - may be present, aproriate to the file-system, e.g., generation, - checksum, etc. - - May use refresh=True|False to allow use of self._ls_from_cache to - check for a saved listing and avoid calling the backend. This would be - common where listing may be expensive. - - Parameters - ---------- - path: str - detail: bool - if True, gives a list of dictionaries, where each is the same as - the result of ``info(path)``. If False, gives a list of paths - (str). - kwargs: may have additional backend-specific options, such as version - information - - Returns - ------- - List of strings if detail is False, or list of directory information - dicts if detail is True. 
- """ - raise NotImplementedError - - def _ls_from_cache(self, path): - """Check cache for listing - - Returns listing, if found (may me empty list for a directly that exists - but contains nothing), None if not in cache. - """ - parent = self._parent(path) - if path in self.dircache: - return self.dircache[path] - elif parent in self.dircache: - files = [f for f in self.dircache[parent] if f["name"] == path] - if len(files) == 0: - # parent dir was listed but did not contain this file - raise FileNotFoundError(path) - return files - - def walk(self, path, maxdepth=None, **kwargs): - """ Return all files belows path - - List all files, recursing into subdirectories; output is iterator-style, - like ``os.walk()``. For a simple list of files, ``find()`` is available. - - Note that the "files" outputted will include anything that is not - a directory, such as links. - - Parameters - ---------- - path: str - Root to recurse into - maxdepth: int - Maximum recursion depth. None means limitless, but not recommended - on link-based file-systems. - kwargs: passed to ``ls`` - """ - path = self._strip_protocol(path) - full_dirs = [] - dirs = [] - files = [] - - try: - listing = self.ls(path, detail=True, **kwargs) - except (FileNotFoundError, IOError): - return [], [], [] - - for info in listing: - # each info name must be at least [path]/part , but here - # we check also for names like [path]/part/ - name = info["name"].rstrip("/") - if info["type"] == "directory" and name != path: - # do not include "self" path - full_dirs.append(name) - dirs.append(name.rsplit("/", 1)[-1]) - elif name == path: - # file-like with same name as give path - files.append("") - else: - files.append(name.rsplit("/", 1)[-1]) - yield path, dirs, files - - for d in full_dirs: - if maxdepth is None or maxdepth > 1: - for res in self.walk( - d, - maxdepth=(maxdepth - 1) if maxdepth is not None else None, - **kwargs - ): - yield res - - def find(self, path, maxdepth=None, withdirs=False, **kwargs): - """List all files below path. - - Like posix ``find`` command without conditions - - Parameters - ---------- - path : str - maxdepth: int or None - If not None, the maximum number of levels to descend - withdirs: bool - Whether to include directory paths in the output. This is True - when used by glob, but users usually only want files. - kwargs are passed to ``ls``. - """ - # TODO: allow equivalent of -name parameter - out = set() - for path, dirs, files in self.walk(path, maxdepth, **kwargs): - if withdirs: - files += dirs - for name in files: - if name and name not in out: - out.add("/".join([path.rstrip("/"), name]) if path else name) - if self.isfile(path) and path not in out: - # walk works on directories, but find should also return [path] - # when path happens to be a file - out.add(path) - return sorted(out) - - def du(self, path, total=True, maxdepth=None, **kwargs): - """Space used by files within a path - - Parameters - ---------- - path: str - total: bool - whether to sum all the file sizes - maxdepth: int or None - maximum number of directory levels to descend, None for unlimited. - kwargs: passed to ``ls`` - - Returns - ------- - Dict of {fn: size} if total=False, or int otherwise, where numbers - refer to bytes used. - """ - sizes = {} - for f in self.find(path, maxdepth=maxdepth, **kwargs): - info = self.info(f) - sizes[info["name"]] = info["size"] - if total: - return sum(sizes.values()) - else: - return sizes - - def glob(self, path, **kwargs): - """ - Find files by glob-matching. 
- - If the path ends with '/' and does not contain "*", it is essentially - the same as ``ls(path)``, returning only files. - - We support ``"**"``, - ``"?"`` and ``"[..]"``. - - kwargs are passed to ``ls``. - """ - import re - from glob import has_magic - - ends = path.endswith("/") - path = self._strip_protocol(path) - indstar = path.find("*") if path.find("*") >= 0 else len(path) - indques = path.find("?") if path.find("?") >= 0 else len(path) - indbrace = path.find("[") if path.find("[") >= 0 else len(path) - - ind = min(indstar, indques, indbrace) - - if not has_magic(path): - root = path - depth = 1 - if ends: - path += "/*" - elif self.exists(path): - return [path] - else: - return [] # glob of non-existent returns empty - elif "/" in path[:ind]: - ind2 = path[:ind].rindex("/") - root = path[: ind2 + 1] - depth = 20 if "**" in path else path[ind2 + 1 :].count("/") + 1 - else: - root = "" - depth = 20 if "**" in path else 1 - allpaths = self.find(root, maxdepth=depth, withdirs=True, **kwargs) - pattern = ( - "^" - + ( - path.replace("\\", r"\\") - .replace(".", r"\.") - .replace("+", r"\+") - .replace("//", "/") - .replace("(", r"\(") - .replace(")", r"\)") - .replace("|", r"\|") - .rstrip("/") - .replace("?", ".") - ) - + "$" - ) - pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern) - pattern = re.sub("[*]", "[^/]*", pattern) - pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*")) - out = {p for p in allpaths if pattern.match(p.replace("//", "/").rstrip("/"))} - return list(sorted(out)) - - def exists(self, path): - """Is there a file at the given path""" - try: - self.info(path) - return True - except: # noqa: E722 - # any exception allowed bar FileNotFoundError? - return False - - def info(self, path, **kwargs): - """Give details of entry at path - - Returns a single dictionary, with exactly the same information as ``ls`` - would with ``detail=True``. - - The default implementation should calls ls and could be overridden by a - shortcut. kwargs are passed on to ```ls()``. - - Some file systems might not be able to measure the file's size, in - which case, the returned dict will include ``'size': None``. - - Returns - ------- - dict with keys: name (full path in the FS), size (in bytes), type (file, - directory, or something else) and other FS-specific keys. - """ - path = self._strip_protocol(path) - out = self.ls(self._parent(path), detail=True, **kwargs) - out = [o for o in out if o["name"].rstrip("/") == path] - if out: - return out[0] - out = self.ls(path, detail=True, **kwargs) - path = path.rstrip("/") - out1 = [o for o in out if o["name"].rstrip("/") == path] - if len(out1) == 1: - if "size" not in out1[0]: - out1[0]["size"] = None - return out1[0] - elif len(out1) > 1 or out: - return {"name": path, "size": 0, "type": "directory"} - else: - raise FileNotFoundError(path) - - def checksum(self, path): - """Unique value for current version of file - - If the checksum is the same from one moment to another, the contents - are guaranteed to be the same. If the checksum changes, the contents - *might* have changed. 
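A short, illustrative session exercising ``find``, ``glob`` and ``walk`` on the local implementation; the temporary directory layout is invented for the example, and exact listing formats may differ slightly between fsspec releases:

```python
import os
import tempfile

import fsspec

fs = fsspec.filesystem("file")
root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "sub"))
for name in ("a.csv", os.path.join("sub", "b.csv")):
    with open(os.path.join(root, name), "wb") as f:
        f.write(b"x,y\n1,2\n")

print(fs.find(root))                           # every file below root, sorted
print(fs.glob(os.path.join(root, "*.csv")))    # only a.csv: '*' does not cross '/'
for dirpath, dirnames, filenames in fs.walk(root):
    print(dirpath, dirnames, filenames)        # os.walk-style triples
```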
- - This should normally be overridden; default will probably capture - creation/modification timestamp (which would be good) or maybe - access timestamp (which would be bad) - """ - return int(tokenize(self.info(path)), 16) - - def size(self, path): - """Size in bytes of file""" - return self.info(path).get("size", None) - - def isdir(self, path): - """Is this entry directory-like?""" - try: - return self.info(path)["type"] == "directory" - except FileNotFoundError: - return False - - def isfile(self, path): - """Is this entry file-like?""" - try: - return self.info(path)["type"] == "file" - except: # noqa: E722 - return False - - def cat(self, path): - """ Get the content of a file """ - return self.open(path, "rb").read() - - def get(self, rpath, lpath, recursive=False, **kwargs): - """Copy file to local. - - Possible extension: maybe should be able to copy to any file-system - (streaming through local). - """ - rpath = self._strip_protocol(rpath) - if recursive: - rpaths = self.find(rpath) - lpaths = [ - os.path.join(lpath, path[len(rpath) :].lstrip("/")) for path in rpaths - ] - for lpath in lpaths: - dirname = os.path.dirname(lpath) - if not os.path.isdir(dirname): - os.makedirs(dirname) - else: - rpaths = [rpath] - lpaths = [lpath] - for lpath, rpath in zip(lpaths, rpaths): - with self.open(rpath, "rb", **kwargs) as f1: - with open(lpath, "wb") as f2: - data = True - while data: - data = f1.read(self.blocksize) - f2.write(data) - - def put(self, lpath, rpath, recursive=False, **kwargs): - """ Upload file from local """ - if recursive: - lpaths = [] - for dirname, subdirlist, filelist in os.walk(lpath): - lpaths += [os.path.join(dirname, filename) for filename in filelist] - rootdir = os.path.basename(lpath.rstrip("/")) - if self.exists(rpath): - # copy lpath inside rpath directory - rpath2 = os.path.join(rpath, rootdir) - else: - # copy lpath as rpath directory - rpath2 = rpath - rpaths = [ - os.path.join(rpath2, path[len(lpath) :].lstrip("/")) for path in lpaths - ] - else: - lpaths = [lpath] - rpaths = [rpath] - for lpath, rpath in zip(lpaths, rpaths): - with open(lpath, "rb") as f1: - with self.open(rpath, "wb", **kwargs) as f2: - data = True - while data: - data = f1.read(self.blocksize) - f2.write(data) - - def head(self, path, size=1024): - """ Get the first ``size`` bytes from file """ - with self.open(path, "rb") as f: - return f.read(size) - - def tail(self, path, size=1024): - """ Get the last ``size`` bytes from file """ - with self.open(path, "rb") as f: - f.seek(max(-size, -f.size), 2) - return f.read() - - def copy(self, path1, path2, **kwargs): - """ Copy within two locations in the filesystem""" - raise NotImplementedError - - def mv(self, path1, path2, **kwargs): - """ Move file from one location to another """ - self.copy(path1, path2, **kwargs) - self.rm(path1, recursive=False) - - def _rm(self, path): - """Delete a file""" - raise NotImplementedError - - def rm(self, path, recursive=False, maxdepth=None): - """Delete files. - - Parameters - ---------- - path: str or list of str - File(s) to delete. - recursive: bool - If file(s) are directories, recursively delete contents and then - also remove the directory - maxdepth: int or None - Depth to pass to walk for finding files to delete, if recursive. - If None, there will be no limit and infinite recursion may be - possible. 
- """ - # prefer some bulk method, if possible - if not isinstance(path, list): - path = [path] - for p in path: - if recursive: - out = self.walk(p, maxdepth=maxdepth) - for pa_, _, files in reversed(list(out)): - for name in files: - fn = "/".join([pa_, name]) if pa_ else name - self.rm(fn) - self.rmdir(pa_) - else: - self._rm(p) - - @classmethod - def _parent(cls, path): - path = cls._strip_protocol(path.rstrip("/")) - if "/" in path: - return cls.root_marker + path.rsplit("/", 1)[0] - else: - return cls.root_marker - - def _open( - self, - path, - mode="rb", - block_size=None, - autocommit=True, - cache_options=None, - **kwargs - ): - """Return raw bytes-mode file-like from the file-system""" - return AbstractBufferedFile( - self, - path, - mode, - block_size, - autocommit, - cache_options=cache_options, - **kwargs - ) - - def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs): - """ - Return a file-like object from the filesystem - - The resultant instance must function correctly in a context ``with`` - block. - - Parameters - ---------- - path: str - Target file - mode: str like 'rb', 'w' - See builtin ``open()`` - block_size: int - Some indication of buffering - this is a value in bytes - cache_options : dict, optional - Extra arguments to pass through to the cache. - encoding, errors, newline: passed on to TextIOWrapper for text mode - """ - import io - - path = self._strip_protocol(path) - if "b" not in mode: - mode = mode.replace("t", "") + "b" - - text_kwargs = { - k: kwargs.pop(k) - for k in ["encoding", "errors", "newline"] - if k in kwargs - } - return io.TextIOWrapper( - self.open(path, mode, block_size, **kwargs), **text_kwargs - ) - else: - ac = kwargs.pop("autocommit", not self._intrans) - f = self._open( - path, - mode=mode, - block_size=block_size, - autocommit=ac, - cache_options=cache_options, - **kwargs - ) - if not ac: - self.transaction.files.append(f) - return f - - def touch(self, path, truncate=True, **kwargs): - """ Create empty file, or update timestamp - - Parameters - ---------- - path: str - file location - truncate: bool - If True, always set file size to 0; if False, update timestamp and - leave file unchanged, if backend allows this - """ - if truncate or not self.exists(path): - with self.open(path, "wb", **kwargs): - pass - else: - raise NotImplementedError # update timestamp, if possible - - def ukey(self, path): - """Hash of file properties, to tell if it has changed""" - return md5(str(self.info(path)).encode()).hexdigest() - - def read_block(self, fn, offset, length, delimiter=None): - """ Read a block of bytes from - - Starting at ``offset`` of the file, read ``length`` bytes. If - ``delimiter`` is set then we ensure that the read starts and stops at - delimiter boundaries that follow the locations ``offset`` and ``offset - + length``. If ``offset`` is zero then we start at zero. The - bytestring returned WILL include the end delimiter string. - - If offset+length is beyond the eof, reads to eof. - - Parameters - ---------- - fn: string - Path to filename - offset: int - Byte offset to start read - length: int - Number of bytes to read - delimiter: bytes (optional) - Ensure reading starts and stops at delimiter bytestring - - Examples - -------- - >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP - b'Alice, 100\\nBo' - >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP - b'Alice, 100\\nBob, 200\\n' - - Use ``length=None`` to read to the end of the file. 
- >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP - b'Alice, 100\\nBob, 200\\nCharlie, 300' - - See Also - -------- - utils.read_block - """ - with self.open(fn, "rb") as f: - size = f.size - if length is None: - length = size - if size is not None and offset + length > size: - length = size - offset - return read_block(f, offset, length, delimiter) - - def __reduce__(self): - return make_instance, (type(self), self.storage_args, self.storage_options) - - def _get_pyarrow_filesystem(self): - """ - Make a version of the FS instance which will be acceptable to pyarrow - """ - # all instances already also derive from pyarrow - return self - - def get_mapper(self, root, check=False, create=False): - """Create key/value store based on this file-system - - Makes a MutibleMapping interface to the FS at the given root path. - See ``fsspec.mapping.FSMap`` for further details. - """ - from .mapping import FSMap - - return FSMap(root, self, check, create) - - @classmethod - def clear_instance_cache(cls): - """ - Clear the cache of filesystem instances. - - Notes - ----- - Unless overridden by setting the ``cachable`` class attribute to False, - the filesystem class stores a reference to newly created instances. This - prevents Python's normal rules around garbage collection from working, - since the instances refcount will not drop to zero until - ``clear_instance_cache`` is called. - """ - cls._cache.clear() - - # ------------------------------------------------------------------------ - # Aliases - - def makedir(self, path, create_parents=True, **kwargs): - """Alias of :ref:`FilesystemSpec.mkdir`.""" - return self.mkdir(path, create_parents=create_parents, **kwargs) - - def mkdirs(self, path, exist_ok=False): - """Alias of :ref:`FilesystemSpec.makedirs`.""" - return self.makedirs(path, exist_ok=exist_ok) - - def listdir(self, path, detail=True, **kwargs): - """Alias of :ref:`FilesystemSpec.ls`.""" - return self.ls(path, detail=detail, **kwargs) - - def cp(self, path1, path2, **kwargs): - """Alias of :ref:`FilesystemSpec.copy`.""" - return self.copy(path1, path2, **kwargs) - - def move(self, path1, path2, **kwargs): - """Alias of :ref:`FilesystemSpec.mv`.""" - return self.mv(path1, path2, **kwargs) - - def stat(self, path, **kwargs): - """Alias of :ref:`FilesystemSpec.info`.""" - return self.info(path, **kwargs) - - def disk_usage(self, path, total=True, maxdepth=None, **kwargs): - """Alias of :ref:`FilesystemSpec.du`.""" - return self.du(path, total=total, maxdepth=maxdepth, **kwargs) - - def rename(self, path1, path2, **kwargs): - """Alias of :ref:`FilesystemSpec.mv`.""" - return self.mv(path1, path2, **kwargs) - - def delete(self, path, recursive=False, maxdepth=None): - """Alias of :ref:`FilesystemSpec.rm`.""" - return self.rm(path, recursive=recursive, maxdepth=maxdepth) - - def upload(self, lpath, rpath, recursive=False, **kwargs): - """Alias of :ref:`FilesystemSpec.put`.""" - return self.put(lpath, rpath, recursive=recursive, **kwargs) - - def download(self, rpath, lpath, recursive=False, **kwargs): - """Alias of :ref:`FilesystemSpec.get`.""" - return self.get(rpath, lpath, recursive=recursive, **kwargs) - - -class AbstractBufferedFile(io.IOBase): - """Convenient class to derive from to provide buffering - - In the case that the backend does not provide a pythonic file-like object - already, this class contains much of the logic to build one. The only - methods that need to be overridden are ``_upload_chunk``, - ``_initate_upload`` and ``_fetch_range``. 
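A compact, hypothetical subclass showing the hooks named above; only the read-side ``_fetch_range`` is needed for ``"rb"`` mode, and the class name and payload are invented for illustration (the ``SnappyFile`` wrapper later in this diff follows the same pattern):

```python
from fsspec.spec import AbstractBufferedFile


class BytesBackedFile(AbstractBufferedFile):
    """Hypothetical read-only file serving a fixed in-memory payload."""

    def __init__(self, data, **kwargs):
        self.data = data
        # Pre-setting `details` short-circuits the fs.info() lookup in __init__.
        self.details = {"name": "memory", "size": len(data), "type": "file"}
        super().__init__(fs=None, path="memory", mode="rb", **kwargs)

    def _fetch_range(self, start, end):
        # The cache layer calls this on demand with absolute byte offsets.
        return self.data[start:end]


f = BytesBackedFile(b"hello world\n" * 3)
print(f.readline())   # -> b'hello world\n'
print(f.read(5))      # -> b'hello'
```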
- """ - - DEFAULT_BLOCK_SIZE = 5 * 2 ** 20 - - def __init__( - self, - fs, - path, - mode="rb", - block_size="default", - autocommit=True, - cache_type="readahead", - cache_options=None, - **kwargs - ): - """ - Template for files with buffered reading and writing - - Parameters - ---------- - fs: instance of FileSystem - path: str - location in file-system - mode: str - Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file - systems may be read-only, and some may not support append. - block_size: int - Buffer size for reading or writing, 'default' for class default - autocommit: bool - Whether to write to final destination; may only impact what - happens when file is being closed. - cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead" - Caching policy in read mode. See the definitions in ``core``. - cache_options : dict - Additional options passed to the constructor for the cache specified - by `cache_type`. - kwargs: - Gets stored as self.kwargs - """ - from .core import caches - - self.path = path - self.fs = fs - self.mode = mode - self.blocksize = ( - self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size - ) - self.loc = 0 - self.autocommit = autocommit - self.end = None - self.start = None - self.closed = False - - if cache_options is None: - cache_options = {} - - if "trim" in kwargs: - warnings.warn( - "Passing 'trim' to control the cache behavior has been deprecated. " - "Specify it within the 'cache_options' argument instead.", - FutureWarning, - ) - cache_options["trim"] = kwargs.pop("trim") - - self.kwargs = kwargs - - if mode not in {"ab", "rb", "wb"}: - raise NotImplementedError("File mode not supported") - if mode == "rb": - if not hasattr(self, "details"): - self.details = fs.info(path) - self.size = self.details["size"] - self.cache = caches[cache_type]( - self.blocksize, self._fetch_range, self.size, **cache_options - ) - else: - self.buffer = io.BytesIO() - self.offset = None - self.forced = False - self.location = None - - @property - def closed(self): - # get around this attr being read-only in IOBase - return self._closed - - @closed.setter - def closed(self, c): - self._closed = c - - def __hash__(self): - if "w" in self.mode: - return id(self) - else: - return int(tokenize(self.details), 16) - - def __eq__(self, other): - """Files are equal if they have the same checksum, only in read mode""" - return self.mode == "rb" and other.mode == "rb" and hash(self) == hash(other) - - def commit(self): - """Move from temp to final destination""" - - def discard(self): - """Throw away temporary file""" - - def info(self): - """ File information about this path """ - if "r" in self.mode: - return self.details - else: - raise ValueError("Info not available while writing") - - def tell(self): - """ Current file location """ - return self.loc - - def seek(self, loc, whence=0): - """ Set current file location - - Parameters - ---------- - loc: int - byte location - whence: {0, 1, 2} - from start of file, current location or end of file, resp. - """ - loc = int(loc) - if not self.mode == "rb": - raise ValueError("Seek only available in read mode") - if whence == 0: - nloc = loc - elif whence == 1: - nloc = self.loc + loc - elif whence == 2: - nloc = self.size + loc - else: - raise ValueError("invalid whence (%s, should be 0, 1 or 2)" % whence) - if nloc < 0: - raise ValueError("Seek before start of file") - self.loc = nloc - return self.loc - - def write(self, data): - """ - Write data to buffer. 
- - Buffer only sent on flush() or if buffer is greater than - or equal to blocksize. - - Parameters - ---------- - data: bytes - Set of bytes to be written. - """ - if self.mode not in {"wb", "ab"}: - raise ValueError("File not in write mode") - if self.closed: - raise ValueError("I/O operation on closed file.") - if self.forced: - raise ValueError("This file has been force-flushed, can only close") - out = self.buffer.write(data) - self.loc += out - if self.buffer.tell() >= self.blocksize: - self.flush() - return out - - def flush(self, force=False): - """ - Write buffered data to backend store. - - Writes the current buffer, if it is larger than the block-size, or if - the file is being closed. - - Parameters - ---------- - force: bool - When closing, write the last block even if it is smaller than - blocks are allowed to be. Disallows further writing to this file. - """ - - if self.closed: - raise ValueError("Flush on closed file") - if force and self.forced: - raise ValueError("Force flush cannot be called more than once") - if force: - self.forced = True - - if self.mode not in {"wb", "ab"}: - # no-op to flush on read-mode - return - - if not force and self.buffer.tell() < self.blocksize: - # Defer write on small block - return - - if self.offset is None: - # Initialize a multipart upload - self.offset = 0 - self._initiate_upload() - - if self._upload_chunk(final=force) is not False: - self.offset += self.buffer.seek(0, 2) - self.buffer = io.BytesIO() - - def _upload_chunk(self, final=False): - """ Write one part of a multi-block file upload - - Parameters - ========== - final: bool - This is the last block, so should complete file, if - self.autocommit is True. - """ - # may not yet have been initialized, may neet to call _initialize_upload - - def _initiate_upload(self): - """ Create remote file/upload """ - pass - - def _fetch_range(self, start, end): - """Get the specified set of bytes from remote""" - raise NotImplementedError - - def read(self, length=-1): - """ - Return data from cache, or fetch pieces as necessary - - Parameters - ---------- - length: int (-1) - Number of bytes to read; if <0, all remaining bytes. - """ - length = -1 if length is None else int(length) - if self.mode != "rb": - raise ValueError("File not in read mode") - if length < 0: - length = self.size - self.loc - if self.closed: - raise ValueError("I/O operation on closed file.") - logger.debug("%s read: %i - %i" % (self, self.loc, self.loc + length)) - if length == 0: - # don't even bother calling fetch - return b"" - out = self.cache._fetch(self.loc, self.loc + length) - self.loc += len(out) - return out - - def readinto(self, b): - """mirrors builtin file's readinto method - - https://docs.python.org/3/library/io.html#io.RawIOBase.readinto - """ - data = self.read(len(b)) - b[: len(data)] = data - return len(data) - - def readuntil(self, char=b"\n", blocks=None): - """Return data between current position and first occurrence of char - - char is included in the output, except if the end of the tile is - encountered first. - - Parameters - ---------- - char: bytes - Thing to find - blocks: None or int - How much to read in each go. Defaults to file blocksize - which may - mean a new read on every call. 
- """ - out = [] - while True: - start = self.tell() - part = self.read(blocks or self.blocksize) - if len(part) == 0: - break - found = part.find(char) - if found > -1: - out.append(part[: found + len(char)]) - self.seek(start + found + len(char)) - break - out.append(part) - return b"".join(out) - - def readline(self): - """Read until first occurrence of newline character - - Note that, because of character encoding, this is not necessarily a - true line ending. - """ - return self.readuntil(b"\n") - - def __next__(self): - out = self.readline() - if out: - return out - raise StopIteration - - def __iter__(self): - return self - - def readlines(self): - """Return all data, split by the newline character""" - data = self.read() - lines = data.split(b"\n") - out = [l + b"\n" for l in lines[:-1]] - if data.endswith(b"\n"): - return out - else: - return out + [lines[-1]] - # return list(self) ??? - - def readinto1(self, b): - return self.readinto(b) - - def close(self): - """ Close file - - Finalizes writes, discards cache - """ - if self.closed: - return - if self.mode == "rb": - self.cache = None - else: - if not self.forced: - self.flush(force=True) - - if self.fs is not None: - self.fs.invalidate_cache(self.path) - self.fs.invalidate_cache(self.fs._parent(self.path)) - - self.closed = True - - def readable(self): - """Whether opened for reading""" - return self.mode == "rb" and not self.closed - - def seekable(self): - """Whether is seekable (only in read mode)""" - return self.readable() - - def writable(self): - """Whether opened for writing""" - return self.mode in {"wb", "ab"} and not self.closed - - def __del__(self): - self.close() - - def __str__(self): - return "" % (type(self.fs).__name__, self.path) - - __repr__ = __str__ - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/transaction.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/transaction.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/transaction.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/transaction.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,80 +0,0 @@ -class Transaction(object): - """Filesystem transaction write context - - Gathers files for deferred commit or discard, so that several write - operations can be finalized semi-atomically. 
This works by having this - instance as the ``.transaction`` attribute of the given filesystem - """ - - def __init__(self, fs): - """ - Parameters - ---------- - fs: FileSystem instance - """ - self.fs = fs - self.files = [] - - def __enter__(self): - self.start() - - def __exit__(self, exc_type, exc_val, exc_tb): - """End transaction and commit, if exit is not due to exception""" - # only commit if there was no exception - self.complete(commit=exc_type is None) - self.fs._intrans = False - self.fs._transaction = None - - def start(self): - """Start a transaction on this FileSystem""" - self.fs._intrans = True - - def complete(self, commit=True): - """Finish transaction: commit or discard all deferred files""" - for f in self.files: - if commit: - f.commit() - else: - f.discard() - self.files = [] - self.fs._intrans = False - - -class FileActor(object): - def __init__(self): - self.files = [] - - def commit(self): - for f in self.files: - f.commit() - self.files.clear() - - def discard(self): - for f in self.files: - f.discard() - self.files.clear() - - def append(self, f): - self.files.append(f) - - -class DaskTransaction(Transaction): - def __init__(self, fs): - """ - Parameters - ---------- - fs: FileSystem instance - """ - import distributed - - super().__init__(fs) - client = distributed.default_client() - self.files = client.submit(FileActor, actor=True).result() - - def complete(self, commit=True): - """Finish transaction: commit or discard all deferred files""" - if commit: - self.files.commit().result() - else: - self.files.discard().result() - self.fs._intrans = False diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/utils.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/utils.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/utils.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/utils.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,304 +0,0 @@ -from hashlib import md5 -import math -import os -import pathlib -import re -from urllib.parse import urlsplit - - -DEFAULT_BLOCK_SIZE = 5 * 2 ** 20 - - -def infer_storage_options(urlpath, inherit_storage_options=None): - """ Infer storage options from URL path and merge it with existing storage - options. - - Parameters - ---------- - urlpath: str or unicode - Either local absolute file path or URL (hdfs://namenode:8020/file.csv) - inherit_storage_options: dict (optional) - Its contents will get merged with the inferred information from the - given path - - Returns - ------- - Storage options dict. - - Examples - -------- - >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP - {"protocol": "file", "path", "/mnt/datasets/test.csv"} - >>> infer_storage_options( - ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1', - ... 
inherit_storage_options={'extra': 'value'}) # doctest: +SKIP - {"protocol": "hdfs", "username": "username", "password": "pwd", - "host": "node", "port": 123, "path": "/mnt/datasets/test.csv", - "url_query": "q=1", "extra": "value"} - """ - # Handle Windows paths including disk name in this special case - if re.match(r"^[a-zA-Z]:[\\/]", urlpath): - return {"protocol": "file", "path": urlpath} - - parsed_path = urlsplit(urlpath) - protocol = parsed_path.scheme or "file" - if parsed_path.fragment: - path = "#".join([parsed_path.path, parsed_path.fragment]) - else: - path = parsed_path.path - if protocol == "file": - # Special case parsing file protocol URL on Windows according to: - # https://msdn.microsoft.com/en-us/library/jj710207.aspx - windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) - if windows_path: - path = "%s:%s" % windows_path.groups() - - if protocol in ["http", "https"]: - # for HTTP, we don't want to parse, as requests will anyway - return {"protocol": protocol, "path": urlpath} - - options = {"protocol": protocol, "path": path} - - if parsed_path.netloc: - # Parse `hostname` from netloc manually because `parsed_path.hostname` - # lowercases the hostname which is not always desirable (e.g. in S3): - # https://github.com/dask/dask/issues/1417 - options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] - - if protocol in ("s3", "gcs", "gs"): - options["path"] = options["host"] + options["path"] - else: - options["host"] = options["host"] - if parsed_path.port: - options["port"] = parsed_path.port - if parsed_path.username: - options["username"] = parsed_path.username - if parsed_path.password: - options["password"] = parsed_path.password - - if parsed_path.query: - options["url_query"] = parsed_path.query - if parsed_path.fragment: - options["url_fragment"] = parsed_path.fragment - - if inherit_storage_options: - update_storage_options(options, inherit_storage_options) - - return options - - -def update_storage_options(options, inherited=None): - if not inherited: - inherited = {} - collisions = set(options) & set(inherited) - if collisions: - collisions = "\n".join("- %r" % k for k in collisions) - raise KeyError( - "Collision between inferred and specified storage " - "options:\n%s" % collisions - ) - options.update(inherited) - - -# Compression extensions registered via fsspec.compression.register_compression -compressions = {} - - -def infer_compression(filename): - """Infer compression, if available, from filename. - - Infer a named compression type, if registered and available, from filename - extension. This includes builtin (gz, bz2, zip) compressions, as well as - optional compressions. See fsspec.compression.register_compression. 
- """ - extension = os.path.splitext(filename)[-1].strip(".") - if extension in compressions: - return compressions[extension] - - -def build_name_function(max_int): - """ Returns a function that receives a single integer - and returns it as a string padded by enough zero characters - to align with maximum possible integer - - >>> name_f = build_name_function(57) - - >>> name_f(7) - '07' - >>> name_f(31) - '31' - >>> build_name_function(1000)(42) - '0042' - >>> build_name_function(999)(42) - '042' - >>> build_name_function(0)(0) - '0' - """ - # handle corner cases max_int is 0 or exact power of 10 - max_int += 1e-8 - - pad_length = int(math.ceil(math.log10(max_int))) - - def name_function(i): - return str(i).zfill(pad_length) - - return name_function - - -def seek_delimiter(file, delimiter, blocksize): - r"""Seek current file to file start, file end, or byte after delimiter seq. - - Seeks file to next chunk delimiter, where chunks are defined on file start, - a delimiting sequence, and file end. Use file.tell() to see location afterwards. - Note that file start is a valid split, so must be at offset > 0 to seek for - delimiter. - - Parameters - ---------- - file: a file - delimiter: bytes - a delimiter like ``b'\n'`` or message sentinel, matching file .read() type - blocksize: int - Number of bytes to read from the file at once. - - - Returns - ------- - Returns True if a delimiter was found, False if at file start or end. - - """ - - if file.tell() == 0: - # beginning-of-file, return without seek - return False - - # Interface is for binary IO, with delimiter as bytes, but initialize last - # with result of file.read to preserve compatibility with text IO. - last = None - while True: - current = file.read(blocksize) - if not current: - # end-of-file without delimiter - return False - full = last + current if last else current - try: - if delimiter in full: - i = full.index(delimiter) - file.seek(file.tell() - (len(full) - i) + len(delimiter)) - return True - elif len(current) < blocksize: - # end-of-file without delimiter - return False - except (OSError, ValueError): - pass - last = full[-len(delimiter) :] - - -def read_block(f, offset, length, delimiter=None, split_before=False): - """ Read a block of bytes from a file - - Parameters - ---------- - f: File - Open file - offset: int - Byte offset to start read - length: int - Number of bytes to read, read through end of file if None - delimiter: bytes (optional) - Ensure reading starts and stops at delimiter bytestring - split_before: bool (optional) - Start/stop read *before* delimiter bytestring. - - - If using the ``delimiter=`` keyword argument we ensure that the read - starts and stops at delimiter boundaries that follow the locations - ``offset`` and ``offset + length``. If ``offset`` is zero then we - start at zero, regardless of delimiter. The bytestring returned WILL - include the terminating delimiter string. 
- - Examples - -------- - - >>> from io import BytesIO # doctest: +SKIP - >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP - >>> read_block(f, 0, 13) # doctest: +SKIP - b'Alice, 100\\nBo' - - >>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP - b'Alice, 100\\nBob, 200\\n' - - >>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP - b'Bob, 200\\nCharlie, 300' - """ - if delimiter: - f.seek(offset) - found_start_delim = seek_delimiter(f, delimiter, 2 ** 16) - if length is None: - return f.read() - start = f.tell() - length -= start - offset - - f.seek(start + length) - found_end_delim = seek_delimiter(f, delimiter, 2 ** 16) - end = f.tell() - - # Adjust split location to before delimiter iff seek found the - # delimiter sequence, not start or end of file. - if found_start_delim and split_before: - start -= len(delimiter) - - if found_end_delim and split_before: - end -= len(delimiter) - - offset = start - length = end - start - - f.seek(offset) - b = f.read(length) - return b - - -def tokenize(*args, **kwargs): - """ Deterministic token - - (modified from dask.base) - - >>> tokenize([1, 2, '3']) - '9d71491b50023b06fc76928e6eddb952' - - >>> tokenize('Hello') == tokenize('Hello') - True - """ - if kwargs: - args += (kwargs,) - return md5(str(args).encode()).hexdigest() - - -def stringify_path(filepath): - """ Attempt to convert a path-like object to a string. - - Parameters - ---------- - filepath: object to be converted - - Returns - ------- - filepath_str: maybe a string version of the object - - Notes - ----- - Objects supporting the fspath protocol (Python 3.6+) are coerced - according to its __fspath__ method. - - For backwards compatibility with older Python version, pathlib.Path - objects are specially coerced. - - Any other object is passed through unchanged, which includes bytes, - strings, buffers, or anything else that's not even path-like. - """ - if hasattr(filepath, "__fspath__"): - return filepath.__fspath__() - elif isinstance(filepath, pathlib.Path): - return str(filepath) - return filepath diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/_version.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/_version.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/_version.py 2019-11-27 17:39:39.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec/_version.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ - -# This file was generated by 'versioneer.py' (0.18) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. Distribution tarballs contain a pre-generated copy -# of this file. 
- -import json - -version_json = ''' -{ - "date": "2019-11-13T10:37:40-0600", - "dirty": false, - "error": null, - "full-revisionid": "8b59dc8c2c035db5793102b9513c46e6a1bd4fb0", - "version": "0.6.0" -} -''' # END VERSION_JSON - - -def get_versions(): - return json.loads(version_json) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt 2019-11-27 17:39:39.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe 2019-11-27 06:34:55.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO 2019-11-27 17:39:39.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO 1970-01-01 00:00:00.000000000 +0000 @@ -1,93 +0,0 @@ -Metadata-Version: 2.1 -Name: fsspec -Version: 0.6.0 -Summary: File-system specification -Home-page: http://github.com/intake/filesystem_spec -Maintainer: Martin Durant -Maintainer-email: mdurant@anaconda.com -License: BSD -Description: # filesystem_spec - - [![Build Status](https://travis-ci.org/intake/filesystem_spec.svg?branch=master)](https://travis-ci.org/martindurant/filesystem_spec) - [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) - - A specification for pythonic filesystems. - - ## Install - - ```bash - pip install fsspec - ``` - or - ```bash - conda install -c conda-forge fsspec - ``` - - ## Purpose - - To produce a template or specification for a file-system interface, that specific implementations should follow, - so that applications making use of them can rely on a common behaviour and not have to worry about the specific - internal implementation decisions with any given backend. Many such implementations are included in this package, - or in sister projects such as `s3fs` and `gcsfs`. - - In addition, if this is well-designed, then additional functionality, such as a key-value store or FUSE - mounting of the file-system implementation may be available for all implementations "for free". - - ## Documentation - - Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) - - ## Develop - - fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and - [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test - environments. 
First, install conda with tox and tox-conda in a base environment - (eg. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be - used to configure a development environment and run tests. - - First, setup a development conda environment via `tox -e dev`. This will - install fspec dependencies, test & dev tools, and install fsspec in develop - mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`. - - ### Testing - - Tests can be run directly in the activated dev environment via `pytest fsspec`. - - The full fsspec test suite can be run via `tox`, which will setup and execute - tests against multiple dependency versions in isolated environment. Run `tox - -av` to list available test environments, select environments via `tox -e `. - - The full fsspec suite requires a system-level docker, docker-compose, and fuse - installation. See `ci/install.sh` for a detailed installation example. - - ### Code Formatting - - fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure - a consistent code format throughout the project. ``black`` is automatically - installed in the tox dev env, activated via `conda activate .tox/dev`. - - Then, run `black fsspec` from the root of the filesystem_spec repository to - auto-format your code. Additionally, many editors have plugins that will apply - `black` as you edit files. - - Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com) to - automatically run `black` when you make a git commit. ``black`` is automatically - installed in the tox dev env, activated via `conda activate .tox/dev`. - - Then, run `pre-commit install --install-hooks` from the root of the - filesystem_spec repository to setup pre-commit hooks. `black` will now be run - before you commit, reformatting any changed files. You can format without - committing via `pre-commit run` or skip these checks with `git commit - --no-verify`. 
- -Keywords: file -Platform: UNKNOWN -Classifier: Development Status :: 4 - Beta -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: BSD License -Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python :: 3.5 -Classifier: Programming Language :: Python :: 3.6 -Classifier: Programming Language :: Python :: 3.7 -Requires-Python: >=3.5 -Description-Content-Type: text/markdown diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/top_level.txt fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/top_level.txt --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/top_level.txt 2019-11-27 17:39:39.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.7/dist-packages/fsspec-0.6.0.egg-info/top_level.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -fsspec diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/caching.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/caching.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/caching.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/caching.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,379 +0,0 @@ -import os -import io -import functools -import logging -import math - -logger = logging.getLogger("fsspec") - - -class BaseCache(object): - """Pass-though cache: doesn't keep anything, calls every time - - Acts as base class for other cachers - - Parameters - ---------- - blocksize: int - How far to read ahead in numbers of bytes - fetcher: func - Function of the form f(start, end) which gets bytes from remote as - specified - size: int - How big this file is - """ - - def __init__(self, blocksize, fetcher, size): - self.blocksize = blocksize - self.fetcher = fetcher - self.size = size - - def _fetch(self, start, end): - return self.fetcher(start, end) - - def __getitem__(self, item: slice): - if not isinstance(item, slice): - raise TypeError( - "Cache indices must be a contiguous slice. Got {} instead.".format( - type(item) - ) - ) - if item.step and item.step != 1: - raise ValueError( - "Cache indices must be a contiguous slice. 'item' has step={}".format( - item.step - ) - ) - - # handle endpoints - if item.start is None: - item = slice(0, item.stop) - elif item.start < 0: - item = slice(self.size + item.start, item.stop) - if item.stop is None: - item = slice(item.start, self.size) - elif item.stop < 0: - item = slice(item.start, self.size + item.stop) - - return self._fetch(item.start, item.stop) - - -class MMapCache(BaseCache): - """memory-mapped sparse file cache - - Opens temporary file, which is filled blocks-wise when data is requested. - Ensure there is enough disc space in the temporary location. 
- - This cache method might only work on posix - """ - - def __init__(self, blocksize, fetcher, size, location=None, blocks=None): - super().__init__(blocksize, fetcher, size) - self.blocks = set() if blocks is None else blocks - self.location = location - self.cache = self._makefile() - - def _makefile(self): - import tempfile - import mmap - - if self.size == 0: - return bytearray() - - # posix version - if self.location is None or not os.path.exists(self.location): - if self.location is None: - fd = tempfile.TemporaryFile() - self.blocks = set() - else: - fd = io.open(self.location, "wb+") - fd.seek(self.size - 1) - fd.write(b"1") - fd.flush() - else: - fd = io.open(self.location, "rb+") - - return mmap.mmap(fd.fileno(), self.size) - - def _fetch(self, start, end): - start_block = start // self.blocksize - end_block = end // self.blocksize - need = [i for i in range(start_block, end_block + 1) if i not in self.blocks] - while need: - # TODO: not a for loop so we can consolidate blocks later to - # make fewer fetch calls; this could be parallel - i = need.pop(0) - sstart = i * self.blocksize - send = min(sstart + self.blocksize, self.size) - self.cache[sstart:send] = self.fetcher(sstart, send) - self.blocks.add(i) - - return self.cache[start:end] - - def __getstate__(self): - state = self.__dict__.copy() - # Remove the unpicklable entries. - del state["cache"] - return state - - def __setstate__(self, state): - # Restore instance attributes - self.__dict__.update(state) - self.cache = self._makefile() - - -class ReadAheadCache(BaseCache): - """ Cache which reads only when we get beyond a block of data - - This is a much simpler version of BytesCache, and does not attempt to - fill holes in the cache or keep fragments alive. It is best suited to - many small reads in a sequential order (e.g., reading lines from a file). - """ - - def __init__(self, blocksize, fetcher, size): - super().__init__(blocksize, fetcher, size) - self.cache = b"" - self.start = 0 - self.end = 0 - - def _fetch(self, start, end): - end = min(self.size, end) - l = end - start - if start >= self.size: - return b"" - elif start >= self.start and end <= self.end: - # cache hit - return self.cache[start - self.start : end - self.start] - elif self.start <= start < self.end: - # partial hit - part = self.cache[start - self.start :] - l -= len(part) - start = self.end - else: - # miss - part = b"" - end = min(self.size, end + self.blocksize) - self.cache = self.fetcher(start, end) # new block replaces old - self.start = start - self.end = self.start + len(self.cache) - return part + self.cache[:l] - - -class BlockCache(BaseCache): - """ - Cache holding memory as a set of blocks. - - Requests are only ever made `blocksize` at a time, and are - stored in an LRU cache. The least recently accessed block is - discarded when more than `maxblocks` are stored. - - Parameters - ---------- - blocksize : int - The number of bytes to store in each block. - Requests are only ever made for `blocksize`, so this - should balance the overhead of making a request against - the granularity of the blocks. - fetcher : Callable - size : int - The total size of the file being cached. - maxblocks : int - The maximum number of blocks to cache for. The maximum memory - use for this cache is then ``blocksize * maxblocks``. 
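An illustrative use of this block cache with a toy fetcher; the payload and block sizes are made up, and ``_fetch`` is the same hook the buffered file classes call internally:

```python
from fsspec.caching import BlockCache

data = bytes(range(256)) * 16          # 4096 bytes standing in for remote content


def fetcher(start, end):
    # Stand-in for a ranged read against remote storage.
    return data[start:end]


cache = BlockCache(blocksize=512, fetcher=fetcher, size=len(data), maxblocks=4)
assert cache._fetch(100, 700) == data[100:700]   # spans two 512-byte blocks
print(cache.cache_info())                        # LRU stats from functools.lru_cache
```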
- """ - - def __init__(self, blocksize, fetcher, size, maxblocks=32): - super().__init__(blocksize, fetcher, size) - self.nblocks = math.ceil(size / blocksize) - self.maxblocks = maxblocks - self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block) - - def __repr__(self): - return "".format( - self.blocksize, self.size, self.nblocks - ) - - def cache_info(self): - """ - The statistics on the block cache. - - Returns - ---------- - NamedTuple - Returned directly from the LRU Cache used internally. - """ - return self._fetch_block_cached.cache_info() - - def __getstate__(self): - state = self.__dict__ - del state["_fetch_block_cached"] - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self._fetch_block_cached = functools.lru_cache(state["maxblocks"])( - self._fetch_block - ) - - def _fetch(self, start, end): - if end < start: - raise ValueError( - "'end' ({}) is smaller than 'start' ({}).".format(end, start) - ) - - if end > self.size: - raise ValueError("'end={}' larger than size ('{}')".format(end, self.size)) - - # byte position -> block numbers - start_block_number = start // self.blocksize - end_block_number = end // self.blocksize - - # these are cached, so safe to do multiple calls for the same start and end. - for block_number in range(start_block_number, end_block_number + 1): - self._fetch_block(block_number) - - return self._read_cache( - start, - end, - start_block_number=start_block_number, - end_block_number=end_block_number, - ) - - def _fetch_block(self, block_number): - """ - Fetch the block of data for `block_number`. - """ - if block_number > self.nblocks: - raise ValueError( - "'block_number={}' is greater than the number of blocks ({})".format( - block_number, self.nblocks - ) - ) - - start = block_number * self.blocksize - end = start + self.blocksize - logger.info("BlockCache fetching block %d", block_number) - block_contents = super()._fetch(start, end) - return block_contents - - def _read_cache(self, start, end, start_block_number, end_block_number): - """ - Read from our block cache. - - Parameters - ---------- - start, end : int - The start and end byte positions. - start_block_number, end_block_number : int - The start and end block numbers. - """ - start_pos = start % self.blocksize - end_pos = end % self.blocksize - - if start_block_number == end_block_number: - block = self._fetch_block_cached(start_block_number) - return block[start_pos:end_pos] - - else: - # read from the initial - out = [] - out.append(self._fetch_block_cached(start_block_number)[start_pos:]) - - # intermediate blocks - # Note: it'd be nice to combine these into one big request. However - # that doesn't play nicely with our LRU cache. - for block_number in range(start_block_number + 1, end_block_number): - out.append(self._fetch_block_cached(block_number)) - - # final block - out.append(self._fetch_block_cached(end_block_number)[:end_pos]) - - return b"".join(out) - - -class BytesCache(BaseCache): - """Cache which holds data in a in-memory bytes object - - Implements read-ahead by the block size, for semi-random reads progressing - through the file. - - Parameters - ---------- - trim: bool - As we read more data, whether to discard the start of the buffer when - we are more than a blocksize ahead of it. 
- """ - - def __init__(self, blocksize, fetcher, size, trim=True): - super().__init__(blocksize, fetcher, size) - self.cache = b"" - self.start = None - self.end = None - self.trim = trim - - def _fetch(self, start, end): - # TODO: only set start/end after fetch, in case it fails? - # is this where retry logic might go? - if ( - self.start is not None - and start >= self.start - and self.end is not None - and end < self.end - ): - # cache hit: we have all the required data - offset = start - self.start - return self.cache[offset : offset + end - start] - - if self.blocksize: - bend = min(self.size, end + self.blocksize) - else: - bend = end - - if bend == start or start > self.size: - return b"" - - if (self.start is None or start < self.start) and ( - self.end is None or end > self.end - ): - # First read, or extending both before and after - self.cache = self.fetcher(start, bend) - self.start = start - elif start < self.start: - if self.end - end > self.blocksize: - self.cache = self.fetcher(start, bend) - self.start = start - else: - new = self.fetcher(start, self.start) - self.start = start - self.cache = new + self.cache - elif bend > self.end: - if self.end > self.size: - pass - elif end - self.end > self.blocksize: - self.cache = self.fetcher(start, bend) - self.start = start - else: - new = self.fetcher(self.end, bend) - self.cache = self.cache + new - - self.end = self.start + len(self.cache) - offset = start - self.start - out = self.cache[offset : offset + end - start] - if self.trim: - num = (self.end - self.start) // (self.blocksize + 1) - if num > 1: - self.start += self.blocksize * num - self.cache = self.cache[self.blocksize * num :] - return out - - def __len__(self): - return len(self.cache) - - -caches = { - "none": BaseCache, - "mmap": MMapCache, - "bytes": BytesCache, - "readahead": ReadAheadCache, - "block": BlockCache, -} diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/compression.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/compression.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/compression.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/compression.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,152 +0,0 @@ -"""Helper functions for a standard streaming compression API""" -from bz2 import BZ2File -from gzip import GzipFile -from zipfile import ZipFile - -import fsspec.utils -from fsspec.spec import AbstractBufferedFile - - -def noop_file(file, mode, **kwargs): - return file - - -# should be functions of the form func(infile, mode=, **kwargs) -> file-like -compr = {None: noop_file} - - -def register_compression(name, callback, extensions, force=False): - """Register an "inferable" file compression type. - - Registers transparent file compression type for use with fsspec.open. - Compression can be specified by name in open, or "infer"-ed for any files - ending with the given extensions. - - Args: - name: (str) The compression type name. Eg. "gzip". - callback: A callable of form (infile, mode, **kwargs) -> file-like. - Accepts an input file-like object, the target mode and kwargs. - Returns a wrapped file-like object. - extensions: (str, Iterable[str]) A file extension, or list of file - extensions for which to infer this compression scheme. Eg. "gz". - force: (bool) Force re-registration of compression type or extensions. 
- - Raises: - ValueError: If name or extensions already registered, and not force. - - """ - if isinstance(extensions, str): - extensions = [extensions] - - # Validate registration - if name in compr and not force: - raise ValueError("Duplicate compression registration: %s" % name) - - for ext in extensions: - if ext in fsspec.utils.compressions and not force: - raise ValueError( - "Duplicate compression file extension: %s (%s)" % (ext, name) - ) - - compr[name] = callback - - for ext in extensions: - fsspec.utils.compressions[ext] = name - - -def unzip(infile, mode="rb", filename=None, **kwargs): - if "r" not in mode: - filename = filename or "file" - z = ZipFile(infile, mode="w", **kwargs) - fo = z.open(filename, mode="w") - fo.close = lambda closer=fo.close: closer() or z.close() - return fo - z = ZipFile(infile) - if filename is None: - filename = z.namelist()[0] - return z.open(filename, mode="r", **kwargs) - - -register_compression("zip", unzip, "zip") -register_compression("bz2", BZ2File, "bz2") -register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz") - -try: - import lzma - - register_compression("lzma", lzma.LZMAFile, "xz") - register_compression("xz", lzma.LZMAFile, "xz", force=True) -except ImportError: - pass - -try: - import lzmaffi - - register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True) - register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) -except ImportError: - pass - - -class SnappyFile(AbstractBufferedFile): - def __init__(self, infile, mode, **kwargs): - import snappy - - self.details = {"size": 999999999} # not true, but OK if we don't seek - super().__init__(fs=None, path="snappy", mode=mode.strip("b") + "b", **kwargs) - self.infile = infile - if "r" in mode: - self.codec = snappy.StreamDecompressor() - else: - self.codec = snappy.StreamCompressor() - - def _upload_chunk(self, final=False): - self.buffer.seek(0) - out = self.codec.add_chunk(self.buffer.read()) - self.infile.write(out) - return True - - def seek(self, loc, whence=0): - raise NotImplementedError("SnappyFile is not seekable") - - def seekable(self): - return False - - def _fetch_range(self, start, end): - """Get the specified set of bytes from remote""" - data = self.infile.read(end - start) - return self.codec.decompress(data) - - -try: - import snappy - - snappy.compress - # Snappy may use the .sz file extension, but this is not part of the - # standard implementation. 
- register_compression("snappy", SnappyFile, []) - -except (ImportError, NameError): - pass - -try: - import lz4.frame - - register_compression("lz4", lz4.frame.open, "lz4") -except ImportError: - pass - -try: - import zstandard as zstd - - def zstandard_file(infile, mode="rb"): - if "r" in mode: - cctx = zstd.ZstdDecompressor() - return cctx.stream_reader(infile) - else: - cctx = zstd.ZstdCompressor(level=10) - return cctx.stream_writer(infile) - - register_compression("zstd", zstandard_file, "zst") -except ImportError: - pass diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/conftest.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/conftest.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/conftest.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/conftest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -import os -import shutil -import subprocess -import sys -import time - -import pytest - -import fsspec -from fsspec.implementations.cached import CachingFileSystem - - -@pytest.fixture() -def m(): - """ - Fixture providing a memory filesystem. - """ - m = fsspec.filesystem("memory") - m.store.clear() - try: - yield m - finally: - m.store.clear() - - -@pytest.fixture -def ftp_writable(tmpdir): - """ - Fixture providing a writable FTP filesystem. - """ - pytest.importorskip("pyftpdlib") - from fsspec.implementations.ftp import FTPFileSystem - - FTPFileSystem.clear_instance_cache() # remove lingering connections - CachingFileSystem.clear_instance_cache() - d = str(tmpdir) - with open(os.path.join(d, "out"), "wb") as f: - f.write(b"hello" * 10000) - P = subprocess.Popen( - [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] - ) - try: - time.sleep(1) - yield "localhost", 2121, "user", "pass" - finally: - P.terminate() - P.wait() - try: - shutil.rmtree(tmpdir) - except Exception: - pass diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/core.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/core.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/core.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/core.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,440 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import io -import os -import logging -from .compression import compr -from .utils import ( - infer_compression, - build_name_function, - update_storage_options, - stringify_path, -) -from .registry import get_filesystem_class - -# for backwards compat, we export cache things from here too -from .caching import ( # noqa: F401 - BaseCache, - MMapCache, - ReadAheadCache, - BytesCache, - BlockCache, - caches, -) - -logger = logging.getLogger("fsspec") - - -class OpenFile(object): - """ - File-like object to be used in a context - - Can layer (buffered) text-mode and compression over any file-system, which - are typically binary-only. - - These instances are safe to serialize, as the low-level file object - is not created until invoked using `with`. - - Parameters - ---------- - fs: FileSystem - The file system to use for opening the file. Should match the interface - of ``dask.bytes.local.LocalFileSystem``. 
- path: str - Location to open - mode: str like 'rb', optional - Mode of the opened file - compression: str or None, optional - Compression to apply - encoding: str or None, optional - The encoding to use if opened in text mode. - errors: str or None, optional - How to handle encoding errors if opened in text mode. - newline: None or str - Passed to TextIOWrapper in text mode, how to handle line endings. - """ - - def __init__( - self, - fs, - path, - mode="rb", - compression=None, - encoding=None, - errors=None, - newline=None, - ): - self.fs = fs - self.path = path - self.mode = mode - self.compression = get_compression(path, compression) - self.encoding = encoding - self.errors = errors - self.newline = newline - self.fobjects = [] - - def __reduce__(self): - return ( - OpenFile, - ( - self.fs, - self.path, - self.mode, - self.compression, - self.encoding, - self.errors, - ), - ) - - def __repr__(self): - return "".format(self.path) - - def __fspath__(self): - return self.path - - def __enter__(self): - mode = self.mode.replace("t", "").replace("b", "") + "b" - - f = self.fs.open(self.path, mode=mode) - - self.fobjects = [f] - - if self.compression is not None: - compress = compr[self.compression] - f = compress(f, mode=mode[0]) - self.fobjects.append(f) - - if "b" not in self.mode: - # assume, for example, that 'r' is equivalent to 'rt' as in builtin - f = io.TextIOWrapper( - f, encoding=self.encoding, errors=self.errors, newline=self.newline - ) - self.fobjects.append(f) - - return self.fobjects[-1] - - def __exit__(self, *args): - self.close() - - def __del__(self): - self.close() - - def open(self): - """Materialise this as a real open file without context - - The file should be explicitly closed to avoid enclosed open file - instances persisting - """ - return self.__enter__() - - def close(self): - """Close all encapsulated file objects""" - for f in reversed(self.fobjects): - if "r" not in self.mode and not f.closed: - f.flush() - f.close() - self.fobjects = [] - - -def open_files( - urlpath, - mode="rb", - compression=None, - encoding="utf8", - errors=None, - name_function=None, - num=1, - protocol=None, - newline=None, - **kwargs -): - """ Given a path or paths, return a list of ``OpenFile`` objects. - - For writing, a str path must contain the "*" character, which will be filled - in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2. - - For either reading or writing, can instead provide explicit list of paths. - - Parameters - ---------- - urlpath: string or list - Absolute or relative filepath(s). Prefix with a protocol like ``s3://`` - to read from alternative filesystems. To read from multiple files you - can pass a globstring or a list of paths, with the caveat that they - must all have the same protocol. - mode: 'rb', 'wt', etc. - compression: string - Compression to use. See ``dask.bytes.compression.files`` for options. - encoding: str - For text mode only - errors: None or str - Passed to TextIOWrapper in text mode - name_function: function or None - if opening a set of files for writing, those files do not yet exist, - so we need to generate their names by formatting the urlpath for - each sequence number - num: int [1] - if writing mode, number of files we expect to create (passed to - name+function) - protocol: str or None - If given, overrides the protocol found in the URL. - newline: bytes or None - Used for line terminator in text mode. If None, uses system default; - if blank, uses no translation. 
- **kwargs: dict - Extra options that make sense to a particular storage connection, e.g. - host, port, username, password, etc. - - Examples - -------- - >>> files = open_files('2015-*-*.csv') # doctest: +SKIP - >>> files = open_files( - ... 's3://bucket/2015-*-*.csv.gz', compression='gzip' - ... ) # doctest: +SKIP - - Returns - ------- - List of ``OpenFile`` objects. - """ - fs, fs_token, paths = get_fs_token_paths( - urlpath, - mode, - num=num, - name_function=name_function, - storage_options=kwargs, - protocol=protocol, - ) - return [ - OpenFile( - fs, - path, - mode=mode, - compression=compression, - encoding=encoding, - errors=errors, - newline=newline, - ) - for path in paths - ] - - -def open( - urlpath, - mode="rb", - compression=None, - encoding="utf8", - errors=None, - protocol=None, - newline=None, - **kwargs -): - """ Given a path or paths, return one ``OpenFile`` object. - - Parameters - ---------- - urlpath: string or list - Absolute or relative filepath. Prefix with a protocol like ``s3://`` - to read from alternative filesystems. Should not include glob - character(s). - mode: 'rb', 'wt', etc. - compression: string - Compression to use. See ``dask.bytes.compression.files`` for options. - encoding: str - For text mode only - errors: None or str - Passed to TextIOWrapper in text mode - protocol: str or None - If given, overrides the protocol found in the URL. - newline: bytes or None - Used for line terminator in text mode. If None, uses system default; - if blank, uses no translation. - **kwargs: dict - Extra options that make sense to a particular storage connection, e.g. - host, port, username, password, etc. - - Examples - -------- - >>> openfile = open('2015-01-01.csv') # doctest: +SKIP - >>> openfile = open( - ... 's3://bucket/2015-01-01.csv.gz', - ... compression='gzip' - ... ) # doctest: +SKIP - >>> with openfile as f: - ... df = pd.read_csv(f) # doctest: +SKIP - - Returns - ------- - ``OpenFile`` object. - """ - return open_files( - [urlpath], - mode, - compression, - encoding, - errors, - protocol, - newline=newline, - **kwargs - )[0] - - -def get_compression(urlpath, compression): - if compression == "infer": - compression = infer_compression(urlpath) - if compression is not None and compression not in compr: - raise ValueError("Compression type %s not supported" % compression) - return compression - - -def split_protocol(urlpath): - """Return protocol, path pair""" - urlpath = stringify_path(urlpath) - if "://" in urlpath: - protocol, path = urlpath.split("://", 1) - if len(protocol) > 1: - # excludes Windows paths - return protocol, path - return None, urlpath - - -def strip_protocol(urlpath): - """Return only path part of full URL, according to appropriate backend""" - protocol, _ = split_protocol(urlpath) - cls = get_filesystem_class(protocol) - return cls._strip_protocol(urlpath) - - -def expand_paths_if_needed(paths, mode, num, fs, name_function): - """Expand paths if they have a ``*`` in them. - - :param paths: list of paths - mode: str - Mode in which to open files. - num: int - If opening in writing mode, number of files we expect to create. - fs: filesystem object - name_function: callable - If opening in writing mode, this callable is used to generate path - names. Names are generated for each partition by - ``urlpath.replace('*', name_function(partition_index))``. 
- :return: list of paths - """ - expanded_paths = [] - paths = list(paths) - if "w" in mode and sum([1 for p in paths if "*" in p]) > 1: - raise ValueError("When writing data, only one filename mask can be specified.") - elif "w" in mode: - num = max(num, len(paths)) - for curr_path in paths: - if "*" in curr_path: - if "w" in mode: - # expand using name_function - expanded_paths.extend(_expand_paths(curr_path, name_function, num)) - else: - # expand using glob - expanded_paths.extend(fs.glob(curr_path)) - else: - expanded_paths.append(curr_path) - # if we generated more paths that asked for, trim the list - if "w" in mode and len(expanded_paths) > num: - expanded_paths = expanded_paths[:num] - return expanded_paths - - -def get_fs_token_paths( - urlpath, mode="rb", num=1, name_function=None, storage_options=None, protocol=None -): - """Filesystem, deterministic token, and paths from a urlpath and options. - - Parameters - ---------- - urlpath: string or iterable - Absolute or relative filepath, URL (may include protocols like - ``s3://``), or globstring pointing to data. - mode: str, optional - Mode in which to open files. - num: int, optional - If opening in writing mode, number of files we expect to create. - name_function: callable, optional - If opening in writing mode, this callable is used to generate path - names. Names are generated for each partition by - ``urlpath.replace('*', name_function(partition_index))``. - storage_options: dict, optional - Additional keywords to pass to the filesystem class. - protocol: str or None - To override the protocol specifier in the URL - """ - if isinstance(urlpath, (list, tuple)): - if not urlpath: - raise ValueError("empty urlpath sequence") - protocols, paths = zip(*map(split_protocol, urlpath)) - protocol = protocol or protocols[0] - if not all(p == protocol for p in protocols): - raise ValueError( - "When specifying a list of paths, all paths must " - "share the same protocol" - ) - cls = get_filesystem_class(protocol) - optionss = list(map(cls._get_kwargs_from_urls, urlpath)) - paths = [cls._strip_protocol(u) for u in urlpath] - options = optionss[0] - if not all(o == options for o in optionss): - raise ValueError( - "When specifying a list of paths, all paths must " - "share the same file-system options" - ) - update_storage_options(options, storage_options) - fs = cls(**options) - paths = expand_paths_if_needed(paths, mode, num, fs, name_function) - - elif isinstance(urlpath, str) or hasattr(urlpath, "name"): - protocols, path = split_protocol(urlpath) - protocol = protocol or protocols - cls = get_filesystem_class(protocol) - - options = cls._get_kwargs_from_urls(urlpath) - path = cls._strip_protocol(urlpath) - update_storage_options(options, storage_options) - fs = cls(**options) - - if "w" in mode: - paths = _expand_paths(path, name_function, num) - elif "*" in path: - paths = sorted(fs.glob(path)) - else: - paths = [path] - - else: - raise TypeError("url type not understood: %s" % urlpath) - - return fs, fs._fs_token, paths - - -def _expand_paths(path, name_function, num): - if isinstance(path, str): - if path.count("*") > 1: - raise ValueError("Output path spec must contain exactly one '*'.") - elif "*" not in path: - path = os.path.join(path, "*.part") - - if name_function is None: - name_function = build_name_function(num - 1) - - paths = [path.replace("*", name_function(i)) for i in range(num)] - if paths != sorted(paths): - logger.warning( - "In order to preserve order between partitions" - " paths created with ``name_function`` 
should " - "sort to partition order" - ) - elif isinstance(path, (tuple, list)): - assert len(path) == num - paths = list(path) - else: - raise ValueError( - "Path should be either\n" - "1. A list of paths: ['foo.json', 'bar.json', ...]\n" - "2. A directory: 'foo/\n" - "3. A path with a '*' in it: 'foo.*.json'" - ) - return paths diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/fuse.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/fuse.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/fuse.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/fuse.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,157 +0,0 @@ -from __future__ import print_function -import os -import stat -from errno import ENOENT, EIO -from fuse import Operations, FuseOSError -import threading -import time -from fuse import FUSE - - -class FUSEr(Operations): - def __init__(self, fs, path): - self.fs = fs - self.cache = {} - self.root = path.rstrip("/") + "/" - self.counter = 0 - - def getattr(self, path, fh=None): - path = "".join([self.root, path.lstrip("/")]).rstrip("/") - try: - info = self.fs.info(path) - except FileNotFoundError: - raise FuseOSError(ENOENT) - data = {"st_uid": 1000, "st_gid": 1000} - perm = 0o777 - - if info["type"] != "file": - data["st_mode"] = stat.S_IFDIR | perm - data["st_size"] = 0 - data["st_blksize"] = 0 - else: - data["st_mode"] = stat.S_IFREG | perm - data["st_size"] = info["size"] - data["st_blksize"] = 5 * 2 ** 20 - data["st_nlink"] = 1 - data["st_atime"] = time.time() - data["st_ctime"] = time.time() - data["st_mtime"] = time.time() - return data - - def readdir(self, path, fh): - path = "".join([self.root, path.lstrip("/")]) - files = self.fs.ls(path, False) - files = [os.path.basename(f.rstrip("/")) for f in files] - return [".", ".."] + files - - def mkdir(self, path, mode): - path = "".join([self.root, path.lstrip("/")]) - self.fs.mkdir(path) - return 0 - - def rmdir(self, path): - path = "".join([self.root, path.lstrip("/")]) - self.fs.rmdir(path) - return 0 - - def read(self, path, size, offset, fh): - f = self.cache[fh] - f.seek(offset) - out = f.read(size) - return out - - def write(self, path, data, offset, fh): - f = self.cache[fh] - f.write(data) - return len(data) - - def create(self, path, flags, fi=None): - fn = "".join([self.root, path.lstrip("/")]) - f = self.fs.open(fn, "wb") - self.cache[self.counter] = f - self.counter += 1 - return self.counter - 1 - - def open(self, path, flags): - fn = "".join([self.root, path.lstrip("/")]) - if flags % 2 == 0: - # read - mode = "rb" - else: - # write/create - mode = "wb" - self.cache[self.counter] = self.fs.open(fn, mode) - self.counter += 1 - return self.counter - 1 - - def truncate(self, path, length, fh=None): - fn = "".join([self.root, path.lstrip("/")]) - if length != 0: - raise NotImplementedError - # maybe should be no-op since open with write sets size to zero anyway - self.fs.touch(fn) - - def unlink(self, path): - fn = "".join([self.root, path.lstrip("/")]) - try: - self.fs.rm(fn, False) - except (IOError, FileNotFoundError): - raise FuseOSError(EIO) - - def release(self, path, fh): - try: - if fh in self.cache: - f = self.cache[fh] - f.close() - self.cache.pop(fh) - except Exception as e: - print(e) - return 0 - - def chmod(self, path, mode): - raise NotImplementedError - - -def run(fs, path, mount_point, foreground=True, threads=False): - """ 
Mount stuff in a local directory - - This uses fusepy to make it appear as if a given path on an fsspec - instance is in fact resident within the local file-system. - - This requires that fusepy by installed, and that FUSE be available on - the system (typically requiring a package to be installed with - apt, yum, brew, etc.). - - Parameters - ---------- - fs: file-system instance - From one of the compatible implementations - path: str - Location on that file-system to regard as the root directory to - mount. Note that you typically should include the terminating "/" - character. - mount_point: str - An empty directory on the local file-system where the contents of - the remote path will appear - foreground: bool - Whether or not calling this function will block. Operation will - typically be more stable if True. - threads: bool - Whether or not to create threads when responding to file operations - within the mounter directory. Operation will typically be more - stable if False. - - """ - func = lambda: FUSE( - FUSEr(fs, path), mount_point, nothreads=not threads, foreground=True - ) - if foreground is False: - th = threading.Thread(target=func) - th.daemon = True - th.start() - return th - else: # pragma: no cover - try: - func() - except KeyboardInterrupt: - pass diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/cached.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/cached.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/cached.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/cached.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,358 +0,0 @@ -import time -import pickle -import logging -import os -import hashlib -import tempfile -import inspect -from fsspec import AbstractFileSystem, filesystem -from fsspec.spec import AbstractBufferedFile -from fsspec.core import MMapCache, BaseCache - -logger = logging.getLogger("fsspec") - - -class CachingFileSystem(AbstractFileSystem): - """Locally caching filesystem, layer over any other FS - - This class implements chunk-wise local storage of remote files, for quick - access after the initial download. The files are stored in a given - directory with random hashes for the filenames. If no directory is given, - a temporary one is used, which should be cleaned up by the OS after the - process ends. The files themselves as sparse (as implemented in - MMapCache), so only the data which is accessed takes up space. - - Restrictions: - - - the block-size must be the same for each access of a given file, unless - all blocks of the file have already been read - - caching can only be applied to file-systems which produce files - derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also - allowed, for testing - """ - - protocol = ("blockcache", "cached") - - def __init__( - self, - target_protocol=None, - cache_storage="TMP", - cache_check=10, - check_files=False, - expiry_time=604800, - target_options=None, - **kwargs - ): - """ - - Parameters - ---------- - target_protocol: str - Target fielsystem protocol - cache_storage: str or list(str) - Location to store files. If "TMP", this is a temporary directory, - and will be cleaned up by the OS when this process ends (or later). - If a list, each location will be tried in the order given, but - only the last will be considered writable. 
- cache_check: int - Number of seconds between reload of cache metadata - check_files: bool - Whether to explicitly see if the UID of the remote file matches - the stored one before using. Warning: some file systems such as - HTTP cannot reliably give a unique hash of the contents of some - path, so be sure to set this option to False. - expiry_time: int - The time in seconds after which a local copy is considered useless. - Set to falsy to prevent expiry. The default is equivalent to one - week. - target_options: dict or None - Passed to the instantiation of the FS, if fs is None. - """ - if self._cached: - return - super().__init__(**kwargs) - if cache_storage == "TMP": - storage = [tempfile.mkdtemp()] - else: - if isinstance(cache_storage, str): - storage = [cache_storage] - else: - storage = cache_storage - os.makedirs(storage[-1], exist_ok=True) - self.storage = storage - self.kwargs = target_options or {} - self.cache_check = cache_check - self.check_files = check_files - self.expiry = expiry_time - self.load_cache() - if isinstance(target_protocol, AbstractFileSystem): - self.fs = target_protocol - self.protocol = self.fs.protocol - else: - self.protocol = target_protocol - self.fs = filesystem(target_protocol, **self.kwargs) - - def __reduce_ex__(self, *_): - return ( - self.__class__, - ( - self.protocol, - self.storage, - self.cache_check, - self.check_files, - self.expiry, - self.kwargs or None, - ), - ) - - def load_cache(self): - """Read set of stored blocks from file""" - cached_files = [] - for storage in self.storage: - fn = os.path.join(storage, "cache") - if os.path.exists(fn): - with open(fn, "rb") as f: - # TODO: consolidate blocks here - cached_files.append(pickle.load(f)) - else: - os.makedirs(storage, exist_ok=True) - cached_files.append({}) - self.cached_files = cached_files or [{}] - self.last_cache = time.time() - - def save_cache(self): - """Save set of stored blocks from file""" - fn = os.path.join(self.storage[-1], "cache") - # TODO: a file lock could be used to ensure file does not change - # between re-read and write; but occasional duplicated reads ok. 
- cache = self.cached_files[-1] - if os.path.exists(fn): - with open(fn, "rb") as f: - cached_files = pickle.load(f) - for k, c in cached_files.items(): - if c["blocks"] is not True: - if cache[k]["blocks"] is True: - c["blocks"] = True - else: - c["blocks"] = set(c["blocks"]).union(cache[k]["blocks"]) - else: - cached_files = cache - cache = {k: v.copy() for k, v in cached_files.items()} - for c in cache.values(): - if isinstance(c["blocks"], set): - c["blocks"] = list(c["blocks"]) - with open(fn + ".temp", "wb") as f: - pickle.dump(cache, f) - if os.path.exists(fn): - os.remove(fn) - os.rename(fn + ".temp", fn) - - def _check_cache(self): - """Reload caches if time elapsed or any disappeared""" - if not self.cache_check: - # explicitly told not to bother checking - return - timecond = time.time() - self.last_cache > self.cache_check - existcond = all(os.path.exists(storage) for storage in self.storage) - if timecond or not existcond: - self.load_cache() - - def _check_file(self, path): - """Is path in cache and still valid""" - self._check_cache() - for storage, cache in zip(self.storage, self.cached_files): - if path not in cache: - continue - detail = cache[path].copy() - if self.check_files: - if detail["uid"] != self.fs.ukey(path): - continue - if self.expiry: - if detail["time"] - time.time() > self.expiry: - continue - fn = os.path.join(storage, detail["fn"]) - if os.path.exists(fn): - return detail, fn - return False, None - - def _open(self, path, mode="rb", **kwargs): - """Wrap the target _open - - If the whole file exists in the cache, just open it locally and - return that. - - Otherwise, open the file on the target FS, and make it have a mmap - cache pointing to the location which we determine, in our cache. - The ``blocks`` instance is shared, so as the mmap cache instance - updates, so does the entry in our ``cached_files`` attribute. - We monkey-patch this file, so that when it closes, we call - ``close_and_update`` to save the state of the blocks. 
- """ - path = self._strip_protocol(path) - if not path.startswith(self.protocol): - path = self.protocol + "://" + path - if mode != "rb": - return self.fs._open(path, mode=mode, **kwargs) - detail, fn = self._check_file(path) - if detail: - # file is in cache - hash, blocks = detail["fn"], detail["blocks"] - if blocks is True: - # stored file is complete - logger.debug("Opening local copy of %s" % path) - return open(fn, "rb") - # TODO: action where partial file exists in read-only cache - logger.debug("Opening partially cached copy of %s" % path) - else: - hash = hashlib.sha256(path.encode()).hexdigest() - fn = os.path.join(self.storage[-1], hash) - blocks = set() - detail = { - "fn": hash, - "blocks": blocks, - "time": time.time(), - "uid": self.fs.ukey(path), - } - self.cached_files[-1][path] = detail - logger.debug("Creating local sparse file for %s" % path) - kwargs["cache_type"] = "none" - kwargs["mode"] = mode - - # call target filesystems open - f = self.fs._open(path, **kwargs) - if "blocksize" in detail: - if detail["blocksize"] != f.blocksize: - raise ValueError( - "Cached file must be reopened with same block" - "size as original (old: %i, new %i)" - "" % (detail["blocksize"], f.blocksize) - ) - else: - detail["blocksize"] = f.blocksize - f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks) - close = f.close - f.close = lambda: self.close_and_update(f, close) - return f - - def close_and_update(self, f, close): - """Called when a file is closing, so store the set of blocks""" - if f.path.startswith(self.protocol): - path = f.path - else: - path = self.protocol + "://" + f.path - c = self.cached_files[-1][path] - if c["blocks"] is not True and len(["blocks"]) * f.blocksize >= f.size: - c["blocks"] = True - self.save_cache() - close() - - def __getattribute__(self, item): - if item in [ - "load_cache", - "_open", - "save_cache", - "close_and_update", - "__init__", - "__getattribute__", - "__reduce_ex__", - "open", - "cat", - "get", - "read_block", - "tail", - "head", - "_check_file", - "_check_cache", - ]: - # all the methods defined in this class. Note `open` here, since - # it calls `_open`, but is actually in superclass - return lambda *args, **kw: getattr(type(self), item)(self, *args, **kw) - if item == "__class__": - return type(self) - d = object.__getattribute__(self, "__dict__") - fs = d.get("fs", None) # fs is not immediately defined - if item in d: - return d[item] - elif fs is not None: - if item in fs.__dict__: - # attribute of instance - return fs.__dict__[item] - # attributed belonging to the target filesystem - cls = type(fs) - m = getattr(cls, item) - if inspect.isfunction(m) and ( - not hasattr(m, "__self__") or m.__self__ is None - ): - # instance method - return m.__get__(fs, cls) - return m # class method or attribute - else: - # attributes of the superclass, while target is being set up - return super().__getattribute__(item) - - -class WholeFileCacheFileSystem(CachingFileSystem): - """Caches whole remote files on first access - - This class is intended as a layer over any other file system, and - will make a local copy of each file accessed, so that all subsequent - reads are local. This is similar to ``CachingFileSystem``, but without - the block-wise functionality and so can work even when sparse files - are not allowed. See its docstring for definition of the init - arguments. - - The class still needs access to the remote store for listing files, - and may refresh cached files. 
- """ - - protocol = "filecache" - - def _open(self, path, mode="rb", **kwargs): - path = self._strip_protocol(path) - if not path.startswith(self.protocol): - path = self.protocol + "://" + path - if mode != "rb": - return self.fs._open(path, mode=mode, **kwargs) - detail, fn = self._check_file(path) - if detail: - hash, blocks = detail["fn"], detail["blocks"] - if blocks is True: - logger.debug("Opening local copy of %s" % path) - return open(fn, "rb") - else: - raise ValueError( - "Attempt to open partially cached file %s" - "as a wholly cached file" % path - ) - else: - hash = hashlib.sha256(path.encode()).hexdigest() - fn = os.path.join(self.storage[-1], hash) - blocks = True - detail = { - "fn": hash, - "blocks": blocks, - "time": time.time(), - "uid": self.fs.ukey(path), - } - self.cached_files[-1][path] = detail - logger.debug("Copying %s to local cache" % path) - kwargs["mode"] = mode - - # call target filesystems open - # TODO: why not just use fs.get ?? - f = self.fs._open(path, **kwargs) - with open(fn, "wb") as f2: - if isinstance(f, AbstractBufferedFile): - # want no type of caching if just downloading whole thing - f.cache = BaseCache(0, f.cache.fetcher, f.size) - if getattr(f, "blocksize", 0) and f.size: - # opportunity to parallelise here - data = True - while data: - data = f.read(f.blocksize) - f2.write(data) - else: - # this only applies to HTTP, should instead use streaming - f2.write(f.read()) - self.save_cache() - return self._open(path, mode) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/dask.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/dask.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/dask.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/dask.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,120 +0,0 @@ -from distributed.worker import get_worker -from distributed.client import _get_global_client -import dask -from fsspec.spec import AbstractFileSystem, AbstractBufferedFile -from fsspec import filesystem - - -def make_instance(cls, args, kwargs): - inst = cls(*args, **kwargs) - inst._determine_worker() - return inst - - -class DaskWorkerFileSystem(AbstractFileSystem): - """View files accessible to a worker as any other remote file-system - - When instances are run on the worker, uses the real filesystem. When - run on the client, they call the worker to provide information or data. - - **Warning** this implementation is experimental, and read-only for now. 
- """ - - def __init__(self, remote_protocol, remote_options=None, **kwargs): - super().__init__(**kwargs) - self.protocol = remote_protocol - self.remote_options = remote_options - self.worker = None - self.client = None - self.fs = None - self._determine_worker() - - def _determine_worker(self): - try: - get_worker() - self.worker = True - self.fs = filesystem(self.protocol, **(self.remote_options or {})) - except ValueError: - self.worker = False - self.client = _get_global_client() - self.rfs = dask.delayed(self) - - def __reduce__(self): - return make_instance, (type(self), self.storage_args, self.storage_options) - - def mkdir(self, *args, **kwargs): - if self.worker: - self.fs.mkdir(*args, **kwargs) - else: - self.rfs.mkdir(*args, **kwargs).compute() - - def rm(self, *args, **kwargs): - if self.worker: - self.fs.rm(*args, **kwargs) - else: - self.rfs.rm(*args, **kwargs).compute() - - def copy(self, *args, **kwargs): - if self.worker: - self.fs.copy(*args, **kwargs) - else: - self.rfs.copy(*args, **kwargs).compute() - - def mv(self, *args, **kwargs): - if self.worker: - self.fs.mv(*args, **kwargs) - else: - self.rfs.mv(*args, **kwargs).compute() - - def ls(self, *args, **kwargs): - if self.worker: - return self.fs.ls(*args, **kwargs) - else: - return self.rfs.ls(*args, **kwargs).compute() - - def _open(self, path, mode="rb", **kwargs): - if self.worker: - return self.fs._open(path, mode=mode) - else: - return DaskFile(self, path, mode, **kwargs) - - def fetch_range(self, path, mode, start, end): - if self.worker: - with self._open(path, mode) as f: - f.seek(start) - return f.read(end - start) - else: - return self.rfs.fetch_range(path, mode, start, end).compute() - - -class DaskFile(AbstractBufferedFile): - def __init__( - self, - fs, - path, - mode="rb", - block_size="default", - autocommit=True, - cache_type="bytes", - **kwargs - ): - super().__init__( - fs, - path, - mode=mode, - block_size=block_size, - autocommit=autocommit, - cache_type=cache_type, - **kwargs - ) - - def _upload_chunk(self, final=False): - pass - - def _initiate_upload(self): - """ Create remote file/upload """ - pass - - def _fetch_range(self, start, end): - """Get the specified set of bytes from remote""" - return self.fs.fetch_range(self.path, self.mode, start, end) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/ftp.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/ftp.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/ftp.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/ftp.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,265 +0,0 @@ -from ftplib import FTP, Error, error_perm -from socket import timeout -import uuid -from ..spec import AbstractBufferedFile, AbstractFileSystem -from ..utils import infer_storage_options - - -class FTPFileSystem(AbstractFileSystem): - """A filesystem over classic """ - - root_marker = "/" - cachable = False - - def __init__( - self, - host, - port=21, - username=None, - password=None, - acct=None, - block_size=None, - tempdir="/tmp", - timeout=30, - **kwargs - ): - """ - You can use _get_kwargs_from_urls to get some kwargs from - a reasonable FTP url. - - Authentication will be anonymous if username/password are not - given. 
- - Parameters - ---------- - host: str - The remote server name/ip to connect to - port: int - Port to connect with - username: str or None - If authenticating, the user's identifier - password: str of None - User's password on the server, if using - acct: str or None - Some servers also need an "account" string for auth - block_size: int or None - If given, the read-ahead or write buffer size. - tempdir: str - Directory on remote to put temporary files when in a transaction - """ - super(FTPFileSystem, self).__init__(**kwargs) - self.host = host - self.port = port - self.tempdir = tempdir - self.cred = username, password, acct - self.timeout = timeout - if block_size is not None: - self.blocksize = block_size - else: - self.blocksize = 2 ** 16 - self._connect() - - def _connect(self): - self.ftp = FTP(timeout=self.timeout) - self.ftp.connect(self.host, self.port) - self.ftp.login(*self.cred) - - @classmethod - def _strip_protocol(cls, path): - return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/") - - @staticmethod - def _get_kwargs_from_urls(urlpath): - out = infer_storage_options(urlpath) - out.pop("path", None) - out.pop("protocol", None) - return out - - def invalidate_cache(self, path=None): - if path is not None: - self.dircache.pop(path, None) - else: - self.dircache.clear() - - def ls(self, path, detail=True): - path = self._strip_protocol(path) - out = [] - if path not in self.dircache: - try: - try: - out = [ - (fn, details) - for (fn, details) in self.ftp.mlsd(path) - if fn not in [".", ".."] - and details["type"] not in ["pdir", "cdir"] - ] - except error_perm: - out = _mlsd2(self.ftp, path) # Not platform independent - for fn, details in out: - if path == "/": - path = "" # just for forming the names, below - details["name"] = "/".join([path, fn.lstrip("/")]) - if details["type"] == "file": - details["size"] = int(details["size"]) - else: - details["size"] = 0 - self.dircache[path] = out - except Error: - try: - info = self.info(path) - if info["type"] == "file": - out = [(path, info)] - except (Error, IndexError): - raise FileNotFoundError - files = self.dircache.get(path, out) - if not detail: - return sorted([fn for fn, details in files]) - return [details for fn, details in files] - - def info(self, path, **kwargs): - # implement with direct method - path = self._strip_protocol(path) - files = self.ls(self._parent(path).lstrip("/"), True) - try: - out = [f for f in files if f["name"] == path][0] - except IndexError: - raise FileNotFoundError(path) - return out - - def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs): - path = self._strip_protocol(path) - block_size = block_size or self.blocksize - return FTPFile( - self, - path, - mode=mode, - block_size=block_size, - tempdir=self.tempdir, - autocommit=autocommit, - ) - - def _rm(self, path): - path = self._strip_protocol(path) - self.ftp.delete(path) - self.invalidate_cache(path.rsplit("/", 1)[0]) - - def mkdir(self, path, **kwargs): - path = self._strip_protocol(path) - self.ftp.mkd(path) - - def rmdir(self, path): - path = self._strip_protocol(path) - self.ftp.rmd(path) - - def mv(self, path1, path2, **kwargs): - path1 = self._strip_protocol(path1) - path2 = self._strip_protocol(path2) - self.ftp.rename(path1, path2) - self.invalidate_cache(self._parent(path1)) - self.invalidate_cache(self._parent(path2)) - - def __del__(self): - self.ftp.close() - - -class TransferDone(Exception): - """Internal exception to break out of transfer""" - - pass - - -class 
FTPFile(AbstractBufferedFile): - """Interact with a remote FTP file with read/write buffering""" - - def __init__(self, fs, path, **kwargs): - super().__init__(fs, path, **kwargs) - if kwargs.get("autocommit", False) is False: - self.target = self.path - self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())]) - - def commit(self): - self.fs.mv(self.path, self.target) - - def discard(self): - self.fs.rm(self.path) - - def _fetch_range(self, start, end): - """Get bytes between given byte limits - - Implemented by raising an exception in the fetch callback when the - number of bytes received reaches the requested amount. - - Will fail if the server does not respect the REST command on - retrieve requests. - """ - out = [] - total = [0] - - def callback(x): - total[0] += len(x) - if total[0] > end - start: - out.append(x[: (end - start) - total[0]]) - raise TransferDone - else: - out.append(x) - - if total[0] == end - start: - raise TransferDone - - try: - self.fs.ftp.retrbinary( - "RETR %s" % self.path, - blocksize=self.blocksize, - rest=start, - callback=callback, - ) - except TransferDone: - try: - self.fs.ftp.abort() - self.fs.ftp.voidresp() - except timeout: - self.fs._connect() - return b"".join(out) - - def _upload_chunk(self, final=False): - self.buffer.seek(0) - self.fs.ftp.storbinary( - "STOR " + self.path, self.buffer, blocksize=self.blocksize, rest=self.offset - ) - return True - - -def _mlsd2(ftp, path="."): - """ - Fall back to using `dir` instead of `mlsd` if not supported. - - This parses a Linux style `ls -l` response to `dir`, but the response may - be platform dependent. - - Parameters - ---------- - ftp: ftplib.FTP - path: str - Expects to be given path, but defaults to ".". - """ - lines = [] - minfo = [] - ftp.dir(path, lines.append) - for line in lines: - line = line.split() - this = ( - line[-1], - { - "modify": " ".join(line[5:8]), - "unix.owner": line[2], - "unix.group": line[3], - "unix.mode": line[0], - "size": line[4], - }, - ) - if "d" == this[1]["unix.mode"][0]: - this[1]["type"] = "dir" - else: - this[1]["type"] = "file" - minfo.append(this) - return minfo diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/github.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/github.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/github.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/github.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -import io -import requests -from ..spec import AbstractFileSystem - - -class GithubFileSystem(AbstractFileSystem): - """[Experimental] interface to files in github - - An instance of this class provides the files residing within a remote github - repository. You may specify a point in the repos history, by SHA, branch - or tag (default is current master). - - Given that code files tend to be small, and that github does not support - retrieving partial content, we always fetch whole files. 
- """ - - url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" - rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" - protocol = "github" - - def __init__(self, org, repo, sha="master", **kwargs): - super().__init__(**kwargs) - self.org = org - self.repo = repo - self.root = sha - self.ls("") - - def ls(self, path, detail=False, sha=None, **kwargs): - if path == "": - sha = self.root - if sha is None: - parts = path.rstrip("/").split("/") - so_far = "" - sha = self.root - for part in parts: - out = self.ls(so_far, True, sha=sha) - so_far += "/" + part if so_far else part - out = [o for o in out if o["name"] == so_far][0] - if out["type"] == "file": - if detail: - return [out] - else: - return path - sha = out["sha"] - if path not in self.dircache: - r = requests.get(self.url.format(org=self.org, repo=self.repo, sha=sha)) - self.dircache[path] = [ - { - "name": path + "/" + f["path"] if path else f["path"], - "mode": f["mode"], - "type": {"blob": "file", "tree": "directory"}[f["type"]], - "size": f.get("size", 0), - "sha": f["sha"], - } - for f in r.json()["tree"] - ] - if detail: - return self.dircache[path] - else: - return sorted([f["name"] for f in self.dircache[path]]) - - def _open(self, path, mode="rb", **kwargs): - if mode != "rb": - raise NotImplementedError - url = self.rurl.format(org=self.org, repo=self.repo, path=path, sha=self.root) - r = requests.get(url) - return io.BytesIO(r.content) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/hdfs.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/hdfs.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/hdfs.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/hdfs.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,192 +0,0 @@ -from ..spec import AbstractFileSystem -from ..utils import infer_storage_options -from pyarrow.hdfs import HadoopFileSystem - - -class PyArrowHDFS(AbstractFileSystem): - """Adapted version of Arrow's HadoopFileSystem - - This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which - passes on all calls to the underlying class. 
- """ - - def __init__( - self, - host="default", - port=0, - user=None, - kerb_ticket=None, - driver="libhdfs", - extra_conf=None, - **kwargs - ): - """ - - Parameters - ---------- - host: str - Hostname, IP or "default" to try to read from Hadoop config - port: int - Port to connect on, or default from Hadoop config if 0 - user: str or None - If given, connect as this username - kerb_ticket: str or None - If given, use this ticket for authentication - driver: 'libhdfs' or 'libhdfs3' - Binary driver; libhdfs if the JNI library and default - extra_conf: None or dict - Passed on to HadoopFileSystem - """ - if self._cached: - return - AbstractFileSystem.__init__(self, **kwargs) - self.pars = (host, port, user, kerb_ticket, driver, extra_conf) - self.pahdfs = HadoopFileSystem( - host=host, - port=port, - user=user, - kerb_ticket=kerb_ticket, - driver=driver, - extra_conf=extra_conf, - ) - - def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs): - """ - - Parameters - ---------- - path: str - Location of file; should start with '/' - mode: str - block_size: int - Hadoop block size, e.g., 2**26 - autocommit: True - Transactions are not yet implemented for HDFS; errors if not True - kwargs: dict or None - Hadoop config parameters - - Returns - ------- - HDFSFile file-like instance - """ - if not autocommit: - raise NotImplementedError - return HDFSFile(self, path, mode, block_size, **kwargs) - - def __reduce_ex__(self, protocol): - return PyArrowHDFS, self.pars - - def ls(self, path, detail=True): - out = self.pahdfs.ls(path, detail) - if detail: - for p in out: - p["type"] = p["kind"] - p["name"] = self._strip_protocol(p["name"]) - else: - out = [self._strip_protocol(p) for p in out] - return out - - @staticmethod - def _get_kwargs_from_urls(paths): - ops = infer_storage_options(paths) - out = {} - if ops.get("host", None): - out["host"] = ops["host"] - if ops.get("username", None): - out["user"] = ops["username"] - if ops.get("port", None): - out["port"] = ops["port"] - return out - - @classmethod - def _strip_protocol(cls, path): - ops = infer_storage_options(path) - return ops["path"] - - def __getattribute__(self, item): - if item in [ - "_open", - "__init__", - "__getattribute__", - "__reduce_ex__", - "open", - "ls", - "makedirs", - ]: - # all the methods defined in this class. 
Note `open` here, since - # it calls `_open`, but is actually in superclass - return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw) - if item == "__class__": - return PyArrowHDFS - d = object.__getattribute__(self, "__dict__") - pahdfs = d.get("pahdfs", None) # fs is not immediately defined - if pahdfs is not None and item in [ - "chmod", - "chown", - "user", - "df", - "disk_usage", - "download", - "driver", - "exists", - "extra_conf", - "get_capacity", - "get_space_used", - "host", - "is_open", - "kerb_ticket", - "strip_protocol", - "mkdir", - "mv", - "port", - "get_capacity", - "get_space_used", - "df", - "chmod", - "chown", - "disk_usage", - "download", - "upload", - "_get_kwargs_from_urls", - "read_parquet", - "rm", - "stat", - "upload", - ]: - return getattr(pahdfs, item) - else: - # attributes of the superclass, while target is being set up - return super().__getattribute__(item) - - -class HDFSFile(object): - """Wrapper around arrow's HdfsFile - - Allows seek beyond EOF and (eventually) commit/discard - """ - - def __init__(self, fs, path, mode, block_size, **kwargs): - self.fs = fs - self.path = path - self.mode = mode - self.block_size = block_size - self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs) - if self.fh.readable(): - self.seek_size = self.size() - - def seek(self, loc, whence=0): - if whence == 0 and self.readable(): - loc = min(loc, self.seek_size) - return self.fh.seek(loc, whence) - - def __getattr__(self, item): - return getattr(self.fh, item) - - def __reduce_ex__(self, protocol): - return HDFSFile, (self.fs, self.path, self.mode, self.block_size) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/http.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/http.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/http.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/http.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,358 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import re -import requests -from urllib.parse import urlparse -from fsspec import AbstractFileSystem -from fsspec.spec import AbstractBufferedFile -from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE - -# https://stackoverflow.com/a/15926317/3821154 -ex = re.compile(r"""]*?\s+)?href=(["'])(.*?)\1""") -ex2 = re.compile(r"""(http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""") - - -class HTTPFileSystem(AbstractFileSystem): - """ - Simple File-System for fetching data via HTTP(S) - - ``ls()`` is implemented by loading the parent page and doing a regex - match on the result. If simple_link=True, anything of the form - "http(s)://server.com/stuff?thing=other"; otherwise only links within - HTML href tags will be used. - """ - - sep = "/" - - def __init__( - self, - simple_links=True, - block_size=None, - same_scheme=True, - size_policy=None, - **storage_options - ): - """ - Parameters - ---------- - block_size: int - Blocks to read bytes; if 0, will default to raw requests file-like - objects instead of HTTPFile instances - simple_links: bool - If True, will consider both HTML tags and anything that looks - like a URL; if False, will consider only the former. 
- same_scheme: True - When doing ls/glob, if this is True, only consider paths that have - http/https matching the input URLs. - size_policy: this argument is deprecated - storage_options: key-value - May be credentials, e.g., `{'auth': ('username', 'pword')}` or any - other parameters passed on to requests - """ - AbstractFileSystem.__init__(self) - self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE - self.simple_links = simple_links - self.same_schema = same_scheme - self.kwargs = storage_options - self.session = requests.Session() - - @classmethod - def _strip_protocol(cls, path): - """ For HTTP, we always want to keep the full URL - """ - return path - - # TODO: override get - - def ls(self, url, detail=True): - # ignoring URL-encoded arguments - r = self.session.get(url, **self.kwargs) - if self.simple_links: - links = ex2.findall(r.text) + ex.findall(r.text) - else: - links = ex.findall(r.text) - out = set() - parts = urlparse(url) - for l in links: - if isinstance(l, tuple): - l = l[1] - if l.startswith("http"): - if self.same_schema: - if l.split(":", 1)[0] == url.split(":", 1)[0]: - out.add(l) - elif l.replace("https", "http").startswith( - url.replace("https", "http") - ): - # allowed to cross http <-> https - out.add(l) - elif l.startswith("/") and len(l) > 1: - out.add(parts.scheme + "://" + parts.netloc + l) - else: - if l not in ["..", "../"]: - # Ignore FTP-like "parent" - out.add("/".join([url.rstrip("/"), l.lstrip("/")])) - if not out and url.endswith("/"): - return self.ls(url.rstrip("/"), detail=True) - if detail: - return [ - { - "name": u, - "size": None, - "type": "directory" if u.endswith("/") else "file", - } - for u in out - ] - else: - return list(sorted(out)) - - def cat(self, url): - r = requests.get(url, **self.kwargs) - r.raise_for_status() - return r.content - - def mkdirs(self, url): - """Make any intermediate directories to make path writable""" - raise NotImplementedError - - def exists(self, path): - kwargs = self.kwargs.copy() - kwargs["stream"] = True - try: - r = self.session.get(path, **kwargs) - r.close() - return r.ok - except requests.HTTPError: - return False - - def _open(self, url, mode="rb", block_size=None, cache_options=None, **kwargs): - """Make a file-like object - - Parameters - ---------- - url: str - Full URL with protocol - mode: string - must be "rb" - block_size: int or None - Bytes to download in one request; use instance value if None. If - zero, will return a streaming Requests file-like instance. - kwargs: key-value - Any other parameters, passed to requests calls - """ - if mode != "rb": - raise NotImplementedError - block_size = block_size if block_size is not None else self.block_size - kw = self.kwargs.copy() - kw.update(kwargs) - kw.pop("autocommit", None) - if block_size: - return HTTPFile( - self, url, self.session, block_size, cache_options=cache_options, **kw - ) - else: - kw["stream"] = True - r = self.session.get(url, **kw) - r.raise_for_status() - r.raw.decode_content = True - return r.raw - - def ukey(self, url): - """Unique identifier; assume HTTP files are static, unchanging""" - return tokenize(url, self.kwargs, self.protocol) - - def info(self, url, **kwargs): - """Get info of URL - - Tries to access location via HEAD, and then GET methods, but does - not fetch the data. - - It is possible that the server does not supply any size information, in - which case size will be given as None (and certain operations on the - corresponding file will not work). 
- """ - size = False - for policy in ["head", "get"]: - try: - size = file_size(url, self.session, policy, **self.kwargs) - if size: - break - except Exception: - pass - else: - # get failed, so conclude URL does not exist - if size is False: - raise FileNotFoundError(url) - return {"name": url, "size": size or None, "type": "file"} - - -class HTTPFile(AbstractBufferedFile): - """ - A file-like object pointing to a remove HTTP(S) resource - - Supports only reading, with read-ahead of a predermined block-size. - - In the case that the server does not supply the filesize, only reading of - the complete file in one go is supported. - - Parameters - ---------- - url: str - Full URL of the remote resource, including the protocol - session: requests.Session or None - All calls will be made within this session, to avoid restarting - connections where the server allows this - block_size: int or None - The amount of read-ahead to do, in bytes. Default is 5MB, or the value - configured for the FileSystem creating this file - size: None or int - If given, this is the size of the file in bytes, and we don't attempt - to call the server to find the value. - kwargs: all other key-values are passed to requests calls. - """ - - def __init__( - self, - fs, - url, - session=None, - block_size=None, - mode="rb", - cache_type="bytes", - cache_options=None, - size=None, - **kwargs - ): - if mode != "rb": - raise NotImplementedError("File mode not supported") - self.url = url - self.session = session if session is not None else requests.Session() - if size is not None: - self.details = {"name": url, "size": size, "type": "file"} - super().__init__( - fs=fs, - path=url, - mode=mode, - block_size=block_size, - cache_type=cache_type, - cache_options=cache_options, - **kwargs - ) - self.cache.size = self.size or self.blocksize - - def read(self, length=-1): - """Read bytes from file - - Parameters - ---------- - length: int - Read up to this many bytes. If negative, read all content to end of - file. If the server has not supplied the filesize, attempting to - read only part of the data will raise a ValueError. - """ - if ( - (length < 0 and self.loc == 0) - or (length > (self.size or length)) # explicit read all - or ( # read more than there is - self.size and self.size < self.blocksize - ) # all fits in one block anyway - ): - self._fetch_all() - if self.size is None: - if length < 0: - self._fetch_all() - else: - length = min(self.size - self.loc, length) - return super().read(length) - - def _fetch_all(self): - """Read whole file in one shot, without caching - - This is only called when position is still at zero, - and read() is called without a byte-count. - """ - if not isinstance(self.cache, AllBytes): - r = self.session.get(self.url, **self.kwargs) - r.raise_for_status() - out = r.content - self.cache = AllBytes(out) - self.size = len(out) - - def _fetch_range(self, start, end): - """Download a block of data - - The expectation is that the server returns only the requested bytes, - with HTTP code 206. If this is not the case, we first check the headers, - and then stream the output - if the data size is bigger than we - requested, an exception is raised. 
- """ - kwargs = self.kwargs.copy() - headers = kwargs.pop("headers", {}) - headers["Range"] = "bytes=%i-%i" % (start, end - 1) - r = self.session.get(self.url, headers=headers, stream=True, **kwargs) - if r.status_code == 416: - # range request outside file - return b"" - r.raise_for_status() - if r.status_code == 206: - # partial content, as expected - out = r.content - elif "Content-Length" in r.headers: - cl = int(r.headers["Content-Length"]) - if cl <= end - start: - # data size OK - out = r.content - else: - raise ValueError( - "Got more bytes (%i) than requested (%i)" % (cl, end - start) - ) - else: - cl = 0 - out = [] - for chunk in r.iter_content(chunk_size=2 ** 20): - # data size unknown, let's see if it goes too big - if chunk: - out.append(chunk) - cl += len(chunk) - if cl > end - start: - raise ValueError( - "Got more bytes so far (>%i) than requested (%i)" - % (cl, end - start) - ) - else: - break - out = b"".join(out) - return out - - -def file_size(url, session=None, size_policy="head", **kwargs): - """Call HEAD on the server to get file size - - Default operation is to explicitly allow redirects and use encoding - 'identity' (no compression) to get the true size of the target. - """ - kwargs = kwargs.copy() - ar = kwargs.pop("allow_redirects", True) - head = kwargs.get("headers", {}).copy() - head["Accept-Encoding"] = "identity" - session = session or requests.Session() - if size_policy == "head": - r = session.head(url, allow_redirects=ar, **kwargs) - elif size_policy == "get": - kwargs["stream"] = True - r = session.get(url, allow_redirects=ar, **kwargs) - else: - raise TypeError('size_policy must be "head" or "get", got %s' "" % size_policy) - if "Content-Length" in r.headers: - return int(r.headers["Content-Length"]) - elif "Content-Range" in r.headers: - return int(r.headers["Content-Range"].split("/")[1]) - - -class AllBytes(object): - """Cache entire contents of a remote URL""" - - def __init__(self, data): - self.data = data - - def _fetch(self, start, end): - return self.data[start:end] diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/local.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/local.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/local.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/local.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,233 +0,0 @@ -import io -import os -import shutil -import posixpath -import re -import tempfile -from fsspec import AbstractFileSystem -from fsspec.utils import stringify_path - - -class LocalFileSystem(AbstractFileSystem): - """Interface to files on local storage - - Parameters - ---------- - auto_mkdirs: bool - Whether, when opening a file, the directory containing it should - be created (if it doesn't already exist). This is assumed by pyarrow - code. 
- """ - - root_marker = "/" - - def __init__(self, auto_mkdir=True, **kwargs): - super().__init__(**kwargs) - self.auto_mkdir = auto_mkdir - - def mkdir(self, path, create_parents=True, **kwargs): - path = self._strip_protocol(path) - if create_parents: - self.makedirs(path, exist_ok=True) - else: - os.mkdir(path, **kwargs) - - def makedirs(self, path, exist_ok=False): - path = self._strip_protocol(path) - os.makedirs(path, exist_ok=exist_ok) - - def rmdir(self, path): - os.rmdir(path) - - def ls(self, path, detail=False): - path = self._strip_protocol(path) - paths = [posixpath.join(path, f) for f in os.listdir(path)] - if detail: - return [self.info(f) for f in paths] - else: - return paths - - def glob(self, path, **kargs): - path = self._strip_protocol(path) - return super().glob(path) - - def info(self, path, **kwargs): - path = self._strip_protocol(path) - out = os.stat(path, follow_symlinks=False) - dest = False - if os.path.islink(path): - t = "link" - dest = os.readlink(path) - elif os.path.isdir(path): - t = "directory" - elif os.path.isfile(path): - t = "file" - else: - t = "other" - result = {"name": path, "size": out.st_size, "type": t, "created": out.st_ctime} - for field in ["mode", "uid", "gid", "mtime"]: - result[field] = getattr(out, "st_" + field) - if dest: - result["destination"] = dest - try: - out2 = os.stat(path, follow_symlinks=True) - result["size"] = out2.st_size - except IOError: - result["size"] = 0 - return result - - def copy(self, path1, path2, **kwargs): - shutil.copyfile(path1, path2) - - def get(self, path1, path2, **kwargs): - if kwargs.get("recursive"): - return super(LocalFileSystem, self).get(path1, path2, **kwargs) - else: - return self.copy(path1, path2, **kwargs) - - def put(self, path1, path2, **kwargs): - if kwargs.get("recursive"): - return super(LocalFileSystem, self).put(path1, path2, **kwargs) - else: - return self.copy(path1, path2, **kwargs) - - def mv(self, path1, path2, **kwargs): - os.rename(path1, path2) - - def rm(self, path, recursive=False, maxdepth=None): - if recursive and self.isdir(path): - shutil.rmtree(path) - else: - os.remove(path) - - def _open(self, path, mode="rb", block_size=None, **kwargs): - path = self._strip_protocol(path) - if self.auto_mkdir: - self.makedirs(self._parent(path), exist_ok=True) - return LocalFileOpener(path, mode, fs=self, **kwargs) - - def touch(self, path, **kwargs): - path = self._strip_protocol(path) - if self.exists(path): - os.utime(path, None) - else: - open(path, "a").close() - - @classmethod - def _parent(cls, path): - path = cls._strip_protocol(path).rstrip("/") - if "/" in path: - return path.rsplit("/", 1)[0] - else: - return cls.root_marker - - @classmethod - def _strip_protocol(cls, path): - path = stringify_path(path) - if path.startswith("file://"): - path = path[7:] - return make_path_posix(path) - - -def make_path_posix(path, sep=os.sep): - """ Make path generic """ - if re.match("/[A-Za-z]:", path): - # for windows file URI like "file:///C:/folder/file" - # or "file:///C:\\dir\\file" - path = path[1:] - if path.startswith("\\\\"): - # special case for windows UNC/DFS-style paths, do nothing, - # jsut flip the slashes around (case below does not work!) 
- return path.replace("\\", "/") - if path.startswith("\\") or re.match("[\\\\]*[A-Za-z]:", path): - # windows full path "\\server\\path" or "C:\\local\\path" - return path.lstrip("\\").replace("\\", "/").replace("//", "/") - if ( - sep not in path - and "/" not in path - or (sep == "/" and not path.startswith("/")) - or (sep == "\\" and ":" not in path) - ): - # relative path like "path" or "rel\\path" (win) or rel/path" - path = os.path.abspath(path) - if os.sep == "\\": - # abspath made some more '\\' separators - return make_path_posix(path, sep) - return path - - -class LocalFileOpener(object): - def __init__(self, path, mode, autocommit=True, fs=None, **kwargs): - self.path = path - self.mode = mode - self.fs = fs - self.f = None - self.autocommit = autocommit - self.blocksize = io.DEFAULT_BUFFER_SIZE - self._open() - - def _open(self): - if self.f is None or self.f.closed: - if self.autocommit or "w" not in self.mode: - self.f = open(self.path, mode=self.mode) - else: - # TODO: check if path is writable? - i, name = tempfile.mkstemp() - self.temp = name - self.f = open(name, mode=self.mode) - if "w" not in self.mode: - self.details = self.fs.info(self.path) - self.size = self.details["size"] - self.f.size = self.size - - def _fetch_range(self, start, end): - # probably only used by cached FS - if "r" not in self.mode: - raise ValueError - self._open() - self.f.seek(start) - return self.f.read(end - start) - - def __setstate__(self, state): - if "r" in state["mode"]: - loc = self.state.pop("loc") - self._open() - self.f.seek(loc) - else: - self.f = None - self.__dict__.update(state) - - def __getstate__(self): - d = self.__dict__.copy() - d.pop("f") - if "r" in self.mode: - d["loc"] = self.f.tell() - else: - if not self.f.closed: - raise ValueError("Cannot serialise open write-mode local file") - return d - - def commit(self): - if self.autocommit: - raise RuntimeError("Can only commit if not already set to autocommit") - os.rename(self.temp, self.path) - - def discard(self): - if self.autocommit: - raise RuntimeError("Cannot discard if set to autocommit") - os.remove(self.temp) - - def __fspath__(self): - # uniquely for fsspec implementations, this is a real path - return self.path - - def __getattr__(self, item): - return getattr(self.f, item) - - def __enter__(self): - self._incontext = True - return self.f.__enter__() - - def __exit__(self, exc_type, exc_value, traceback): - self._incontext = False - self.f.__exit__(exc_type, exc_value, traceback) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/memory.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/memory.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/memory.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/memory.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,169 +0,0 @@ -from __future__ import print_function, division, absolute_import - -from io import BytesIO -from fsspec import AbstractFileSystem -import logging - -logger = logging.Logger("fsspec.memoryfs") - - -class MemoryFileSystem(AbstractFileSystem): - """A filesystem based on a dict of BytesIO objects""" - - store = {} # global - pseudo_dirs = [] - protocol = "memory" - root_marker = "" - - def ls(self, path, detail=False): - if path in self.store: - # there is a key with this exact name, but could also be directory - 
out = [ - { - "name": path, - "size": self.store[path].getbuffer().nbytes, - "type": "file", - } - ] - else: - out = [] - path = path.strip("/").lstrip("/") - paths = set() - for p2 in self.store: - has_slash = "/" if p2.startswith("/") else "" - p = p2.lstrip("/") - if "/" in p: - root = p.rsplit("/", 1)[0] - else: - root = "" - if root == path: - out.append( - { - "name": has_slash + p, - "size": self.store[p2].getbuffer().nbytes, - "type": "file", - } - ) - elif path and all( - (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) - ): - # implicit directory - ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) - if ppath not in paths: - out.append( - { - "name": has_slash + ppath + "/", - "size": 0, - "type": "directory", - } - ) - paths.add(ppath) - elif all( - (a == b) - for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) - ): - # root directory entry - ppath = p.rstrip("/").split("/", 1)[0] - if ppath not in paths: - out.append( - { - "name": has_slash + ppath + "/", - "size": 0, - "type": "directory", - } - ) - paths.add(ppath) - for p2 in self.pseudo_dirs: - if self._parent(p2).strip("/").rstrip("/") == path: - out.append({"name": p2 + "/", "size": 0, "type": "directory"}) - if detail: - return out - return sorted([f["name"] for f in out]) - - def mkdir(self, path): - path = path.rstrip("/") - if path not in self.pseudo_dirs: - self.pseudo_dirs.append(path) - - def rmdir(self, path): - path = path.rstrip("/") - if path in self.pseudo_dirs: - if self.ls(path) == []: - self.pseudo_dirs.remove(path) - else: - raise OSError("Directory %s not empty" % path) - else: - raise FileNotFoundError(path) - - def exists(self, path): - return path in self.store - - def _open(self, path, mode="rb", **kwargs): - """Make a file-like object - - Parameters - ---------- - path: str - identifier - mode: str - normally "rb", "wb" or "ab" - """ - if mode in ["rb", "ab", "rb+"]: - if path in self.store: - f = self.store[path] - if mode == "rb": - f.seek(0) - else: - f.seek(0, 2) - return f - else: - raise FileNotFoundError(path) - if mode == "wb": - m = MemoryFile(self, path) - if not self._intrans: - m.commit() - return m - - def copy(self, path1, path2, **kwargs): - self.store[path2] = MemoryFile(self, path2, self.store[path1].getbuffer()) - - def cat(self, path): - return self.store[path].getvalue() - - def _rm(self, path): - del self.store[path] - - def size(self, path): - """Size in bytes of the file at path""" - if path not in self.store: - raise FileNotFoundError(path) - return self.store[path].getbuffer().nbytes - - -class MemoryFile(BytesIO): - """A BytesIO which can't close and works as a context manager - - Can initialise with data - - No need to provide fs, path if auto-committing (default) - """ - - def __init__(self, fs, path, data=None): - self.fs = fs - self.path = path - if data: - self.write(data) - self.size = len(data) - self.seek(0) - - def __enter__(self): - return self - - def close(self): - self.size = self.seek(0, 2) - - def discard(self): - pass - - def commit(self): - self.fs.store[self.path] = self diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/sftp.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/sftp.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/sftp.py 2019-11-13 16:37:40.000000000 +0000 +++ 
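# Minimal sketch of the in-memory filesystem shown above (the store is a
# process-global dict, so data is shared between instances):
import fsspec

fs = fsspec.filesystem("memory")
with fs.open("/demo/a.txt", "wb") as f:
    f.write(b"payload")
print(fs.cat("/demo/a.txt"))          # b'payload'
print(fs.ls("/demo", detail=False))   # ['/demo/a.txt']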
fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/sftp.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,139 +0,0 @@ -import paramiko -from stat import S_ISDIR, S_ISLNK -import types -import uuid -from .. import AbstractFileSystem -from ..utils import infer_storage_options - - -class SFTPFileSystem(AbstractFileSystem): - """Files over SFTP/SSH - - Peer-to-peer filesystem over SSH using paramiko. - """ - - protocol = "sftp", "ssh" - - def __init__(self, host, **ssh_kwargs): - """ - - Parameters - ---------- - host: str - Hostname or IP as a string - temppath: str - Location on the server to put files, when within a transaction - ssh_kwargs: dict - Parameters passed on to connection. See details in - http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect - May include port, username, password... - """ - if self._cached: - return - super(SFTPFileSystem, self).__init__(**ssh_kwargs) - self.temppath = ssh_kwargs.pop("temppath", "/tmp") - self.host = host - self.ssh_kwargs = ssh_kwargs - self._connect() - - def _connect(self): - self.client = paramiko.SSHClient() - self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - self.client.connect(self.host, **self.ssh_kwargs) - self.ftp = self.client.open_sftp() - - @classmethod - def _strip_protocol(cls, path): - return infer_storage_options(path)["path"] - - @staticmethod - def _get_kwargs_from_urls(urlpath): - out = infer_storage_options(urlpath) - out.pop("path", None) - out.pop("protocol", None) - return out - - def mkdir(self, path, mode=511): - self.ftp.mkdir(path, mode) - - def makedirs(self, path, exist_ok=False, mode=511): - if self.exists(path) and not exist_ok: - raise FileExistsError("File exists: {}".format(path)) - - parts = path.split("/") - path = "" - - for part in parts: - path += "/" + part - if not self.exists(path): - self.mkdir(path, mode) - - def rmdir(self, path): - self.ftp.rmdir(path) - - def info(self, path): - s = self.ftp.stat(path) - if S_ISDIR(s.st_mode): - t = "directory" - elif S_ISLNK(s.st_mode): - t = "link" - else: - t = "file" - return { - "name": path + "/" if t == "directory" else path, - "size": s.st_size, - "type": t, - "uid": s.st_uid, - "gui": s.st_gid, - "time": s.st_atime, - "mtime": s.st_mtime, - } - - def ls(self, path, detail=False): - out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)] - out = [self.info(o) for o in out] - if detail: - return out - return sorted([p["name"] for p in out]) - - def put(self, lpath, rpath): - self.ftp.put(lpath, rpath) - - def get(self, rpath, lpath): - self.ftp.get(rpath, lpath) - - def _open(self, path, mode="rb", block_size=None, **kwargs): - """ - block_size: int or None - If 0, no buffering, if 1, line buffering, if >1, buffer that many - bytes, if None use default from paramiko. 
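# Hypothetical connection sketch for the SFTP filesystem above (hostname,
# username and paths are placeholders; requires "paramiko"):
import fsspec

fs = fsspec.filesystem("sftp", host="sftp.example.com", username="user")
print(fs.ls("/home/user", detail=False))
fs.get("/home/user/report.csv", "report.csv")   # download via paramiko SFTP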
- """ - if kwargs.get("autocommit", True) is False: - # writes to temporary file, move on commit - path2 = "{}/{}".format(self.temppath, uuid.uuid4()) - f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1) - f.temppath = path2 - f.targetpath = path - f.fs = self - f.commit = types.MethodType(commit_a_file, f) - f.discard = types.MethodType(discard_a_file, f) - else: - f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1) - return f - - def _rm(self, path): - if self.isdir(path): - self.ftp.rmdir(path) - else: - self.ftp.remove(path) - - def mv(self, old, new): - self.ftp.posix_rename(old, new) - - -def commit_a_file(self): - self.fs.mv(self.temppath, self.targetpath) - - -def discard_a_file(self): - self.fs._rm(self.temppath) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/webhdfs.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/webhdfs.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/webhdfs.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/webhdfs.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,384 +0,0 @@ -# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html - -import requests -from urllib.parse import quote -import uuid -from ..spec import AbstractFileSystem, AbstractBufferedFile -from ..utils import infer_storage_options -import logging - -logger = logging.getLogger("webhdfs") - - -class WebHDFS(AbstractFileSystem): - """ - Interface to HDFS over HTTP - - Three auth mechanisms are supported: - - insecure: no auth is done, and the user is assumed to be whoever they - say they are (parameter `user`), or a predefined value such as - "dr.who" if not given - spnego: when kerberos authentication is enabled, auth is negotiated by - requests_kerberos https://github.com/requests/requests-kerberos . - This establishes a session based on existing kinit login and/or - specified principal/password; paraneters are passed with ``kerb_kwargs`` - token: uses an existing Hadoop delegation token from another secured - service. Indeed, this client can also generate such tokens when - not insecure. Note that tokens expire, but can be renewed (by a - previously specified user) and may allow for proxying. - - """ - - tempdir = "/tmp" - protocol = "webhdfs", "webHDFS" - - def __init__( - self, - host, - port=50070, - kerberos=False, - token=None, - user=None, - proxy_to=None, - kerb_kwargs=None, - data_proxy=None, - **kwargs - ): - """ - Parameters - ---------- - host: str - Name-node address - port: int - Port for webHDFS - kerberos: bool - Whether to authenticate with kerberos for this connection - token: str or None - If given, use this token on every call to authenticate. A user - and user-proxy may be encoded in the token and should not be also - given - user: str or None - If given, assert the user name to connect with - proxy_to: str or None - If given, the user has the authority to proxy, and this value is - the user in who's name actions are taken - kerb_kwargs: dict - Any extra arguments for HTTPKerberosAuth, see - https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py - data_proxy: dict, callable or None - If given, map data-node addresses. 
This can be necessary if the - HDFS cluster is behind a proxy, running on Docker or otherwise has - a mismatch between the host-names given by the name-node and the - address by which to refer to them from the client. If a dict, - maps host names `host->data_proxy[host]`; if a callable, full - URLs are passed, and function must conform to - `url->data_proxy(url)`. - kwargs - """ - if self._cached: - return - super().__init__(**kwargs) - self.url = "http://{host}:{port}/webhdfs/v1".format(host=host, port=port) - self.kerb = kerberos - self.kerb_kwargs = kerb_kwargs or {} - self.pars = {} - self.proxy = data_proxy or {} - if token is not None: - if user is not None or proxy_to is not None: - raise ValueError( - "If passing a delegation token, must not set " - "user or proxy_to, as these are encoded in the" - " token" - ) - self.pars["delegation"] = token - if user is not None: - self.pars["user.name"] = user - if proxy_to is not None: - self.pars["doas"] = proxy_to - if kerberos and user is not None: - raise ValueError( - "If using Kerberos auth, do not specify the " - "user, this is handled by kinit." - ) - self._connect() - - def _connect(self): - self.session = requests.Session() - if self.kerb: - from requests_kerberos import HTTPKerberosAuth - - self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs) - - def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs): - url = self.url + quote(path or "") - args = kwargs.copy() - args.update(self.pars) - args["op"] = op.upper() - logger.debug(url, method, args) - out = self.session.request( - method=method.upper(), - url=url, - params=args, - data=data, - allow_redirects=redirect, - ) - if out.status_code == 404: - raise FileNotFoundError(path) - if out.status_code == 403: - raise PermissionError(path or "") - if out.status_code == 401: - raise PermissionError # not specific to path - out.raise_for_status() - return out - - def _open( - self, - path, - mode="rb", - block_size=None, - autocommit=True, - replication=None, - permissions=None, - **kwargs - ): - """ - - Parameters - ---------- - path: str - File location - mode: str - 'rb', 'wb', etc. 
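# Sketch of the two data_proxy forms described above (host names are made up):
# as a dict, each data-node host name is substituted inside redirect URLs;
# as a callable, the full redirect URL is rewritten by the function.
proxy_map = {"datanode1.internal": "gateway.example.com:50075"}

def proxy_fn(url):
    return url.replace("datanode1.internal", "gateway.example.com:50075")

# either value may be passed as WebHDFS(..., data_proxy=proxy_map) or
# WebHDFS(..., data_proxy=proxy_fn)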
- block_size: int - Client buffer size for read-ahead or write buffer - autocommit: bool - If False, writes to temporary file that only gets put in final - location upon commit - replication: int - Number of copies of file on the cluster, write mode only - permissions: str or int - posix permissions, write mode only - kwargs - - Returns - ------- - WebHDFile instance - """ - block_size = block_size or self.blocksize - return WebHDFile( - self, - path, - mode=mode, - block_size=block_size, - tempdir=self.tempdir, - autocommit=autocommit, - replication=replication, - permissions=permissions, - ) - - @staticmethod - def _process_info(info): - info["type"] = info["type"].lower() - info["size"] = info["length"] - return info - - @classmethod - def _strip_protocol(cls, path): - return infer_storage_options(path)["path"] - - @staticmethod - def _get_kwargs_from_urls(urlpath): - out = infer_storage_options(urlpath) - out.pop("path", None) - out.pop("protocol", None) - if "username" in out: - out["user"] = out.pop("username") - return out - - def info(self, path): - out = self._call("GETFILESTATUS", path=path) - info = out.json()["FileStatus"] - info["name"] = path - return self._process_info(info) - - def ls(self, path, detail=False): - out = self._call("LISTSTATUS", path=path) - infos = out.json()["FileStatuses"]["FileStatus"] - for info in infos: - self._process_info(info) - info["name"] = path.rstrip("/") + "/" + info["pathSuffix"] - if detail: - return sorted(infos, key=lambda i: i["name"]) - else: - return sorted(info["name"] for info in infos) - - def content_summary(self, path): - """Total numbers of files, directories and bytes under path""" - out = self._call("GETCONTENTSUMMARY", path=path) - return out.json()["ContentSummary"] - - def ukey(self, path): - """Checksum info of file, giving method and result""" - out = self._call("GETFILECHECKSUM", path=path, redirect=False) - location = self._apply_proxy(out.headers["Location"]) - out2 = self.session.get(location) - out2.raise_for_status() - return out2.json()["FileChecksum"] - - def home_directory(self): - """Get user's home directory""" - out = self._call("GETHOMEDIRECTORY") - return out.json()["Path"] - - def get_delegation_token(self, renewer=None): - """Retrieve token which can give the same authority to other uses - - Parameters - ---------- - renewer: str or None - User who may use this token; if None, will be current user - """ - if renewer: - out = self._call("GETDELEGATIONTOKEN", renewer=renewer) - else: - out = self._call("GETDELEGATIONTOKEN") - t = out.json()["Token"] - if t is None: - raise ValueError("No token available for this user/security context") - return t["urlString"] - - def renew_delegation_token(self, token): - """Make token live longer. 
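# Hypothetical usage of the WebHDFS client above (host and user are
# placeholders; a reachable name-node is assumed):
import fsspec

fs = fsspec.filesystem("webhdfs", host="namenode.example.com", port=50070,
                       user="hadoop")
print(fs.ls("/user/hadoop", detail=False))
token = fs.get_delegation_token()   # may be handed on to another service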
Returns new expiry time""" - out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token) - return out.json()["long"] - - def cancel_delegation_token(self, token): - """Stop the token from being useful""" - self._call("CANCELDELEGATIONTOKEN", method="put", token=token) - - def chmod(self, path, mod): - """Set the permission at path - - Parameters - ---------- - path: str - location to set (file or directory) - mod: str or int - posix epresentation or permission, give as oct string, e.g, '777' - or 0o777 - """ - self._call("SETPERMISSION", method="put", path=path, permission=mod) - - def chown(self, path, owner=None, group=None): - """Change owning user and/or group""" - kwargs = {} - if owner is not None: - kwargs["owner"] = owner - if group is not None: - kwargs["group"] = group - self._call("SETOWNER", method="put", path=path, **kwargs) - - def set_replication(self, path, replication): - """ - Set file replication factor - - Parameters - ---------- - path: str - File location (not for directories) - replication: int - Number of copies of file on the cluster. Should be smaller than - number of data nodes; normally 3 on most systems. - """ - self._call("SETREPLICATION", path=path, method="put", replication=replication) - - def mkdir(self, path, **kwargs): - self._call("MKDIRS", method="put", path=path) - - def makedirs(self, path, exist_ok=False): - if exist_ok is False and self.exists(path): - raise FileExistsError(path) - self.mkdir(path) - - def mv(self, path1, path2, **kwargs): - self._call("RENAME", method="put", path=path1, destination=path2) - - def rm(self, path, recursive=False, **kwargs): - self._call( - "DELETE", - method="delete", - path=path, - recursive="true" if recursive else "false", - ) - - def _apply_proxy(self, location): - if self.proxy and callable(self.proxy): - location = self.proxy(location) - elif self.proxy: - # as a dict - for k, v in self.proxy.items(): - location = location.replace(k, v, 1) - return location - - -class WebHDFile(AbstractBufferedFile): - """A file living in HDFS over webHDFS""" - - def __init__(self, fs, path, **kwargs): - super().__init__(fs, path, **kwargs) - kwargs = kwargs.copy() - if kwargs.get("permissions", None) is None: - kwargs.pop("permissions", None) - if kwargs.get("replication", None) is None: - kwargs.pop("replication", None) - self.permissions = kwargs.pop("permissions", 511) - tempdir = kwargs.pop("tempdir") - if kwargs.pop("autocommit", False) is False: - self.target = self.path - self.path = "/".join([tempdir, str(uuid.uuid4())]) - - def _upload_chunk(self, final=False): - """ Write one part of a multi-block file upload - - Parameters - ========== - final: bool - This is the last block, so should complete file, if - self.autocommit is True. 
- """ - out = self.fs.session.post(self.location, data=self.buffer.getvalue()) - out.raise_for_status() - return True - - def _initiate_upload(self): - """ Create remote file/upload """ - if "a" in self.mode: - op, method = "APPEND", "POST" - else: - op, method = "CREATE", "PUT" - if self.fs.exists(self.path): - # no "truncate" or "create empty" - self.fs.rm(self.path) - out = self.fs._call(op, method, self.path, redirect=False, **self.kwargs) - location = self.fs._apply_proxy(out.headers["Location"]) - if "w" in self.mode: - # create empty file to append to - out2 = self.fs.session.put(location) - out2.raise_for_status() - self.location = location.replace("CREATE", "APPEND") - - def _fetch_range(self, start, end): - out = self.fs._call( - "OPEN", path=self.path, offset=start, length=end - start, redirect=False - ) - out.raise_for_status() - location = out.headers["Location"] - out2 = self.fs.session.get(self.fs._apply_proxy(location)) - return out2.content - - def commit(self): - self.fs.mv(self.path, self.target) - - def discard(self): - self.fs.rm(self.path) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/zip.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/zip.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/zip.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/implementations/zip.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import zipfile -from fsspec import AbstractFileSystem, open_files -from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE - - -class ZipFileSystem(AbstractFileSystem): - """Read contents of ZIP archive as a file-system - - Keeps file object open while instance lives. - - This class is pickleable, but not necessarily thread-safe - """ - - root_marker = "" - - def __init__(self, fo="", mode="r", **storage_options): - """ - Parameters - ---------- - fo: str or file-like - Contains ZIP, and must exist. If a str, will fetch file using - `open_files()`, which must return one file exactly. 
- mode: str - Currently, only 'r' accepted - storage_options: key-value - May be credentials, e.g., `{'auth': ('username', 'pword')}` or any - other parameters for requests - """ - if self._cached: - return - AbstractFileSystem.__init__(self) - if mode != "r": - raise ValueError("Only read from zip files accepted") - self.in_fo = fo - if isinstance(fo, str): - files = open_files(fo) - if len(files) != 1: - raise ValueError( - 'Path "{}" did not resolve to exactly' - 'one file: "{}"'.format(fo, files) - ) - fo = files[0] - self.fo = fo.__enter__() # the whole instance is a context - self.zip = zipfile.ZipFile(self.fo) - self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE) - self.dir_cache = None - - @classmethod - def _strip_protocol(cls, path): - # zip file paths are always relative to the archive root - return super()._strip_protocol(path).lstrip("/") - - def _get_dirs(self): - if self.dir_cache is None: - files = self.zip.infolist() - self.dir_cache = {} - for z in files: - f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} - f.update( - { - "name": z.filename, - "size": z.file_size, - "type": ("directory" if z.is_dir() else "file"), - } - ) - self.dir_cache[f["name"]] = f - - def ls(self, path, detail=False): - self._get_dirs() - paths = {} - for p, f in self.dir_cache.items(): - p = p.rstrip("/") - if "/" in p: - root = p.rsplit("/", 1)[0] - else: - root = "" - if root == path.rstrip("/"): - paths[p] = f - elif path and all( - (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) - ): - # implicit directory - ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) - if ppath not in paths: - out = {"name": ppath + "/", "size": 0, "type": "directory"} - paths[ppath] = out - - elif all( - (a == b) - for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) - ): - # root directory entry - ppath = p.rstrip("/").split("/", 1)[0] - if ppath not in paths: - out = {"name": ppath + "/", "size": 0, "type": "directory"} - paths[ppath] = out - out = list(paths.values()) - if detail: - return out - else: - return list(sorted(f["name"] for f in out)) - - def cat(self, path): - return self.zip.read(path) - - def _open(self, path, mode="rb", **kwargs): - path = self._strip_protocol(path) - if mode != "rb": - raise NotImplementedError - info = self.info(path) - out = self.zip.open(path, "r") - out.size = info["size"] - out.name = info["name"] - return out - - def ukey(self, path): - return tokenize(path, self.in_fo, self.protocol) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/__init__.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/__init__.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/__init__.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -from ._version import get_versions - -from .spec import AbstractFileSystem -from .registry import get_filesystem_class, registry, filesystem -from .mapping import FSMap, get_mapper -from .core import open_files, get_fs_token_paths, open -from . 
import caching - -__version__ = get_versions()["version"] -del get_versions - - -__all__ = [ - "AbstractFileSystem", - "FSMap", - "filesystem", - "get_filesystem_class", - "get_fs_token_paths", - "get_mapper", - "open", - "open_files", - "registry", - "caching", -] diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/mapping.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/mapping.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/mapping.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/mapping.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,152 +0,0 @@ -from collections.abc import MutableMapping -from .registry import get_filesystem_class -from .core import split_protocol - - -class FSMap(MutableMapping): - """Wrap a FileSystem instance as a mutable wrapping. - - The keys of the mapping become files under the given root, and the - values (which must be bytes) the contents of those files. - - Parameters - ---------- - root: string - prefix for all the files - fs: FileSystem instance - check: bool (=True) - performs a touch at the location, to check for write access. - - Examples - -------- - >>> fs = FileSystem(**parameters) # doctest: +SKIP - >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP - or, more likely - >>> d = fs.get_mapper('my-data/path/') - - >>> d['loc1'] = b'Hello World' # doctest: +SKIP - >>> list(d.keys()) # doctest: +SKIP - ['loc1'] - >>> d['loc1'] # doctest: +SKIP - b'Hello World' - """ - - def __init__(self, root, fs, check=False, create=False): - self.fs = fs - self.root = fs._strip_protocol(root).rstrip( - "/" - ) # we join on '/' in _key_to_str - if create: - if not self.fs.exists(root): - self.fs.mkdir(root) - if check: - if not self.fs.exists(root): - raise ValueError( - "Path %s does not exist. 
Create " - " with the ``create=True`` keyword" % root - ) - self.fs.touch(root + "/a") - self.fs.rm(root + "/a") - - def clear(self): - """Remove all keys below root - empties out mapping - """ - try: - self.fs.rm(self.root, True) - self.fs.mkdir(self.root) - except: # noqa: E722 - pass - - def _key_to_str(self, key): - """Generate full path for the key""" - if isinstance(key, (tuple, list)): - key = str(tuple(key)) - else: - key = str(key) - return "/".join([self.root, key]) if self.root else key - - def _str_to_key(self, s): - """Strip path of to leave key name""" - return s[len(self.root) :].lstrip("/") - - def __getitem__(self, key, default=None): - """Retrieve data""" - key = self._key_to_str(key) - try: - result = self.fs.cat(key) - except: # noqa: E722 - if default is not None: - return default - raise KeyError(key) - return result - - def pop(self, key, default=None): - result = self.__getitem__(key, default) - try: - del self[key] - except KeyError: - pass - return result - - def __setitem__(self, key, value): - """Store value in key""" - key = self._key_to_str(key) - self.fs.mkdirs(self.fs._parent(key), exist_ok=True) - with self.fs.open(key, "wb") as f: - f.write(value) - - def __iter__(self): - return (self._str_to_key(x) for x in self.fs.find(self.root)) - - def __len__(self): - return len(self.fs.find(self.root)) - - def __delitem__(self, key): - """Remove key""" - try: - self.fs.rm(self._key_to_str(key)) - except: # noqa: E722 - raise KeyError - - def __contains__(self, key): - """Does key exist in mapping?""" - return self.fs.exists(self._key_to_str(key)) - - def __getstate__(self): - """Mapping should be pickleable""" - # TODO: replace with reduce to reinstantiate? - return self.fs, self.root - - def __setstate__(self, state): - fs, root = state - self.fs = fs - self.root = root - - -def get_mapper(url, check=False, create=False, **kwargs): - """Create key-value interface for given URL and options - - The URL will be of the form "protocol://location" and point to the root - of the mapper required. All keys will be file-names below this location, - and their values the contents of each key. - - Parameters - ---------- - url: str - Root URL of mapping - check: bool - Whether to attempt to read from the location before instantiation, to - check that the mapping does exist - create: bool - Whether to make the directory corresponding to the root before - instantiating - - Returns - ------- - ``FSMap`` instance, the dict-like key-value store. - """ - protocol, path = split_protocol(url) - cls = get_filesystem_class(protocol) - fs = cls(**kwargs) - # Removing protocol here - could defer to each open() on the backend - return FSMap(url, fs, check, create) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/registry.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/registry.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/registry.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/registry.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,116 +0,0 @@ -import importlib -from distutils.version import LooseVersion - -__all__ = ["registry", "get_filesystem_class", "default"] - -# mapping protocol: implementation class object -registry = {} -default = "file" - -# protocols mapped to the class which implements them. This dict can -# be dynamically updated. 
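# Key-value usage sketch for FSMap/get_mapper above, backed by the local
# filesystem (the directory is a placeholder, created because create=True):
import fsspec

m = fsspec.get_mapper("file:///tmp/fsspec-kv", create=True)
m["loc1"] = b"Hello World"
print(m["loc1"])   # b'Hello World'
print(list(m))     # ['loc1']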
-known_implementations = { - "file": {"class": "fsspec.implementations.local.LocalFileSystem"}, - "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"}, - "http": { - "class": "fsspec.implementations.http.HTTPFileSystem", - "err": 'HTTPFileSystem requires "requests" to be installed', - }, - "https": { - "class": "fsspec.implementations.http.HTTPFileSystem", - "err": 'HTTPFileSystem requires "requests" to be installed', - }, - "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"}, - "gcs": { - "class": "gcsfs.GCSFileSystem", - "err": "Please install gcsfs to access Google Storage", - }, - "gs": { - "class": "gcsfs.GCSFileSystem", - "err": "Please install gcsfs to access Google Storage", - }, - "sftp": { - "class": "fsspec.implementations.sftp.SFTPFileSystem", - "err": 'SFTPFileSystem requires "paramiko" to be installed', - }, - "ssh": { - "class": "fsspec.implementations.sftp.SFTPFileSystem", - "err": 'SFTPFileSystem requires "paramiko" to be installed', - }, - "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"}, - "hdfs": { - "class": "fsspec.implementations.hdfs.PyArrowHDFS", - "err": "pyarrow and local java libraries required for HDFS", - }, - "webhdfs": { - "class": "fsspec.implementations.webhdfs.WebHDFS", - "err": 'webHDFS access requires "requests" to be installed', - }, - "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, - "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"}, - "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"}, - "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"}, - "dask": { - "class": "fsspec.implementations.dask.DaskWorkerFileSystem", - "err": "Install dask distributed to access worker file system", - }, -} - -minversions = {"s3fs": LooseVersion("0.3.0"), "gcsfs": LooseVersion("0.3.0")} - - -def get_filesystem_class(protocol): - """Fetch named protocol implementation from the registry - - The dict ``known_implementations`` maps protocol names to the locations - of classes implementing the corresponding file-system. When used for the - first time, appropriate imports will happen and the class will be placed in - the registry. All subsequent calls will fetch directly from the registry. - - Some protocol implementations require additional dependencies, and so the - import may fail. In this case, the string in the "err" field of the - ``known_implementations`` will be given as the error message. 
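# Sketch of how the registry above resolves a protocol and how an extra
# backend could be registered (the "mypkg.myfs.MyFileSystem" path is
# hypothetical):
from fsspec.registry import get_filesystem_class, known_implementations

cls = get_filesystem_class("memory")   # imports the class and caches it
known_implementations["myfs"] = {
    "class": "mypkg.myfs.MyFileSystem",
    "err": "install mypkg to use the myfs:// protocol",
}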
- """ - if protocol is None: - protocol = default - - if protocol not in registry: - if protocol not in known_implementations: - raise ValueError("Protocol not known: %s" % protocol) - bit = known_implementations[protocol] - mod, name = bit["class"].rsplit(".", 1) - minversion = minversions.get(mod, None) - err = None - try: - mod = importlib.import_module(mod) - except ImportError: - err = ImportError(bit["err"]) - - except Exception as e: - err = e - if err is not None: - raise RuntimeError(str(err)) - - if minversion: - version = getattr(mod, "__version__", None) - if version and LooseVersion(version) < minversion: - raise RuntimeError( - "'{}={}' is installed, but version '{}' or " - "higher is required".format(mod.__name__, version, minversion) - ) - registry[protocol] = getattr(mod, name) - cls = registry[protocol] - if getattr(cls, "protocol", None) in ("abstract", None): - cls.protocol = protocol - - return cls - - -def filesystem(protocol, **storage_options): - """Instantiate filesystems for given protocol and arguments - - ``storage_options`` are specific to the protocol being chosen, and are - passed directly to the class. - """ - cls = get_filesystem_class(protocol) - return cls(**storage_options) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/spec.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/spec.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/spec.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/spec.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1246 +0,0 @@ -import warnings -from hashlib import md5 -import io -import os -import logging - -from .transaction import Transaction -from .utils import read_block, tokenize, stringify_path - -logger = logging.getLogger("fsspec") - - -def make_instance(cls, args, kwargs): - return cls(*args, **kwargs) - - -class _Cached(type): - """ - Metaclass for caching file system instances. - - Notes - ----- - Instances are cached according to - - * The values of the class attributes listed in `_extra_tokenize_attributes` - * The arguments passed to ``__init__``. - - This creates an additional reference to the filesystem, which prevents the - filesystem from being garbage collected when all *user* references go away. - A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* - be made for a filesystem instance to be garbage collected. - """ - - cachable = True - _extra_tokenize_attributes = () - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Note: we intentionally create a reference here, to avoid garbage - # collecting instances when all other references are gone. To really - # delete a FileSystem, the cache must be cleared. - self._cache = {} - - def __call__(self, *args, **kwargs): - cls = type(self) - extra_tokens = tuple( - getattr(self, attr, None) for attr in self._extra_tokenize_attributes - ) - token = tokenize(cls, *args, *extra_tokens, **kwargs) - if self.cachable and token in self._cache: - return self._cache[token] - else: - obj = super().__call__(*args, **kwargs) - # Setting _fs_token here causes some static linters to complain. 
- obj._fs_token_ = token - self.storage_args = args - self.storage_options = kwargs - - if self.cachable: - self._cache[token] = obj - return obj - - -try: # optionally derive from pyarrow's FileSystem, if available - import pyarrow as pa - - up = pa.filesystem.DaskFileSystem -except ImportError: - up = object - - -class AbstractFileSystem(up, metaclass=_Cached): - """ - An abstract super-class for pythonic file-systems - - Implementations are expected to be compatible with or, better, subclass - from here. - """ - - cachable = True # this class can be cached, instances reused - _cached = False - blocksize = 2 ** 22 - sep = "/" - protocol = "abstract" - root_marker = "" # For some FSs, may require leading '/' or other character - - #: Extra *class attributes* that should be considered when hashing. - _extra_tokenize_attributes = () - - def __init__(self, *args, **storage_options): - """Create and configure file-system instance - - Instances may be cachable, so if similar enough arguments are seen - a new instance is not required. The token attribute exists to allow - implementations to cache instances if they wish. - - A reasonable default should be provided if there are no arguments. - - Subclasses should call this method. - - Magic kwargs that affect functionality here: - add_docs: if True, will append docstrings from this spec to the - specific implementation - """ - if self._cached: - # reusing instance, don't change - return - self._cached = True - self._intrans = False - self._transaction = None - self.dircache = {} - - if storage_options.pop("add_docs", None): - warnings.warn("add_docs is no longer supported.", FutureWarning) - - if storage_options.pop("add_aliases", None): - warnings.warn("add_aliases has been removed.", FutureWarning) - # This is set in _Cached - self._fs_token_ = None - - @property - def _fs_token(self): - return self._fs_token_ - - def __dask_tokenize__(self): - return self._fs_token - - def __hash__(self): - return int(self._fs_token, 16) - - def __eq__(self, other): - return isinstance(other, type(self)) and self._fs_token == other._fs_token - - @classmethod - def _strip_protocol(cls, path): - """ Turn path from fully-qualified to file-system-specific - - May require FS-specific handling, e.g., for relative paths or links. - """ - path = stringify_path(path) - protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol - for protocol in protos: - path = path.rstrip("/") - if path.startswith(protocol + "://"): - path = path[len(protocol) + 3 :] - elif path.startswith(protocol + ":"): - path = path[len(protocol) + 1 :] - # use of root_marker to make minimum required path, e.g., "/" - return path or cls.root_marker - - @staticmethod - def _get_kwargs_from_urls(paths): - """If kwargs can be encoded in the paths, extract them here - - This should happen before instantiation of the class; incoming paths - then should be amended to strip the options in methods. - - Examples may look like an sftp path "sftp://user@host:/my/path", where - the user and host should become kwargs and later get stripped. 
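# The _Cached metaclass above means equal constructor arguments yield the
# same cached instance; a quick check with the in-memory backend:
import fsspec

fs1 = fsspec.filesystem("memory")
fs2 = fsspec.filesystem("memory")
assert fs1 is fs2   # identical token -> same object returned from the cache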
- """ - # by default, nothing happens - return {} - - @classmethod - def current(cls): - """ Return the most recently created FileSystem - - If no instance has been created, then create one with defaults - """ - if not len(cls._cache): - return cls() - else: - return list(cls._cache.values())[-1] - - @property - def transaction(self): - """A context within which files are committed together upon exit - - Requires the file class to implement `.commit()` and `.discard()` - for the normal and exception cases. - """ - if self._transaction is None: - self._transaction = Transaction(self) - return self._transaction - - def start_transaction(self): - """Begin write transaction for deferring files, non-context version""" - self._intrans = True - self._transaction = Transaction(self) - return self.transaction - - def end_transaction(self): - """Finish write transaction, non-context version""" - self.transaction.complete() - self._transaction = None - - def invalidate_cache(self, path=None): - """ - Discard any cached directory information - - Parameters - ---------- - path: string or None - If None, clear all listings cached else listings at or under given - path. - """ - pass # not necessary to implement, may have no cache - - def mkdir(self, path, create_parents=True, **kwargs): - """ - Create directory entry at path - - For systems that don't have true directories, may create an for - this instance only and not touch the real filesystem - - Parameters - ---------- - path: str - location - create_parents: bool - if True, this is equivalent to ``makedirs`` - kwargs: - may be permissions, etc. - """ - pass # not necessary to implement, may not have directories - - def makedirs(self, path, exist_ok=False): - """Recursively make directories - - Creates directory at path and any intervening required directories. - Raises exception if, for instance, the path already exists but is a - file. - - Parameters - ---------- - path: str - leaf directory name - exist_ok: bool (False) - If True, will error if the target already exists - """ - pass # not necessary to implement, may not have directories - - def rmdir(self, path): - """Remove a directory, if empty""" - pass # not necessary to implement, may not have directories - - def ls(self, path, detail=True, **kwargs): - """List objects at path. - - This should include subdirectories and files at that location. The - difference between a file and a directory must be clear when details - are requested. - - The specific keys, or perhaps a FileInfo class, or similar, is TBD, - but must be consistent across implementations. - Must include: - - full path to the entry (without protocol) - - size of the entry, in bytes. If the value cannot be determined, will - be ``None``. - - type of entry, "file", "directory" or other - - Additional information - may be present, aproriate to the file-system, e.g., generation, - checksum, etc. - - May use refresh=True|False to allow use of self._ls_from_cache to - check for a saved listing and avoid calling the backend. This would be - common where listing may be expensive. - - Parameters - ---------- - path: str - detail: bool - if True, gives a list of dictionaries, where each is the same as - the result of ``info(path)``. If False, gives a list of paths - (str). - kwargs: may have additional backend-specific options, such as version - information - - Returns - ------- - List of strings if detail is False, or list of directory information - dicts if detail is True. 
- """ - raise NotImplementedError - - def _ls_from_cache(self, path): - """Check cache for listing - - Returns listing, if found (may me empty list for a directly that exists - but contains nothing), None if not in cache. - """ - parent = self._parent(path) - if path in self.dircache: - return self.dircache[path] - elif parent in self.dircache: - files = [f for f in self.dircache[parent] if f["name"] == path] - if len(files) == 0: - # parent dir was listed but did not contain this file - raise FileNotFoundError(path) - return files - - def walk(self, path, maxdepth=None, **kwargs): - """ Return all files belows path - - List all files, recursing into subdirectories; output is iterator-style, - like ``os.walk()``. For a simple list of files, ``find()`` is available. - - Note that the "files" outputted will include anything that is not - a directory, such as links. - - Parameters - ---------- - path: str - Root to recurse into - maxdepth: int - Maximum recursion depth. None means limitless, but not recommended - on link-based file-systems. - kwargs: passed to ``ls`` - """ - path = self._strip_protocol(path) - full_dirs = [] - dirs = [] - files = [] - - try: - listing = self.ls(path, detail=True, **kwargs) - except (FileNotFoundError, IOError): - return [], [], [] - - for info in listing: - # each info name must be at least [path]/part , but here - # we check also for names like [path]/part/ - name = info["name"].rstrip("/") - if info["type"] == "directory" and name != path: - # do not include "self" path - full_dirs.append(name) - dirs.append(name.rsplit("/", 1)[-1]) - elif name == path: - # file-like with same name as give path - files.append("") - else: - files.append(name.rsplit("/", 1)[-1]) - yield path, dirs, files - - for d in full_dirs: - if maxdepth is None or maxdepth > 1: - for res in self.walk( - d, - maxdepth=(maxdepth - 1) if maxdepth is not None else None, - **kwargs - ): - yield res - - def find(self, path, maxdepth=None, withdirs=False, **kwargs): - """List all files below path. - - Like posix ``find`` command without conditions - - Parameters - ---------- - path : str - maxdepth: int or None - If not None, the maximum number of levels to descend - withdirs: bool - Whether to include directory paths in the output. This is True - when used by glob, but users usually only want files. - kwargs are passed to ``ls``. - """ - # TODO: allow equivalent of -name parameter - out = set() - for path, dirs, files in self.walk(path, maxdepth, **kwargs): - if withdirs: - files += dirs - for name in files: - if name and name not in out: - out.add("/".join([path.rstrip("/"), name]) if path else name) - if self.isfile(path) and path not in out: - # walk works on directories, but find should also return [path] - # when path happens to be a file - out.add(path) - return sorted(out) - - def du(self, path, total=True, maxdepth=None, **kwargs): - """Space used by files within a path - - Parameters - ---------- - path: str - total: bool - whether to sum all the file sizes - maxdepth: int or None - maximum number of directory levels to descend, None for unlimited. - kwargs: passed to ``ls`` - - Returns - ------- - Dict of {fn: size} if total=False, or int otherwise, where numbers - refer to bytes used. - """ - sizes = {} - for f in self.find(path, maxdepth=maxdepth, **kwargs): - info = self.info(f) - sizes[info["name"]] = info["size"] - if total: - return sum(sizes.values()) - else: - return sizes - - def glob(self, path, **kwargs): - """ - Find files by glob-matching. 
- - If the path ends with '/' and does not contain "*", it is essentially - the same as ``ls(path)``, returning only files. - - We support ``"**"``, - ``"?"`` and ``"[..]"``. - - kwargs are passed to ``ls``. - """ - import re - from glob import has_magic - - ends = path.endswith("/") - path = self._strip_protocol(path) - indstar = path.find("*") if path.find("*") >= 0 else len(path) - indques = path.find("?") if path.find("?") >= 0 else len(path) - indbrace = path.find("[") if path.find("[") >= 0 else len(path) - - ind = min(indstar, indques, indbrace) - - if not has_magic(path): - root = path - depth = 1 - if ends: - path += "/*" - elif self.exists(path): - return [path] - else: - return [] # glob of non-existent returns empty - elif "/" in path[:ind]: - ind2 = path[:ind].rindex("/") - root = path[: ind2 + 1] - depth = 20 if "**" in path else path[ind2 + 1 :].count("/") + 1 - else: - root = "" - depth = 20 if "**" in path else 1 - allpaths = self.find(root, maxdepth=depth, withdirs=True, **kwargs) - pattern = ( - "^" - + ( - path.replace("\\", r"\\") - .replace(".", r"\.") - .replace("+", r"\+") - .replace("//", "/") - .replace("(", r"\(") - .replace(")", r"\)") - .replace("|", r"\|") - .rstrip("/") - .replace("?", ".") - ) - + "$" - ) - pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern) - pattern = re.sub("[*]", "[^/]*", pattern) - pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*")) - out = {p for p in allpaths if pattern.match(p.replace("//", "/").rstrip("/"))} - return list(sorted(out)) - - def exists(self, path): - """Is there a file at the given path""" - try: - self.info(path) - return True - except: # noqa: E722 - # any exception allowed bar FileNotFoundError? - return False - - def info(self, path, **kwargs): - """Give details of entry at path - - Returns a single dictionary, with exactly the same information as ``ls`` - would with ``detail=True``. - - The default implementation should calls ls and could be overridden by a - shortcut. kwargs are passed on to ```ls()``. - - Some file systems might not be able to measure the file's size, in - which case, the returned dict will include ``'size': None``. - - Returns - ------- - dict with keys: name (full path in the FS), size (in bytes), type (file, - directory, or something else) and other FS-specific keys. - """ - path = self._strip_protocol(path) - out = self.ls(self._parent(path), detail=True, **kwargs) - out = [o for o in out if o["name"].rstrip("/") == path] - if out: - return out[0] - out = self.ls(path, detail=True, **kwargs) - path = path.rstrip("/") - out1 = [o for o in out if o["name"].rstrip("/") == path] - if len(out1) == 1: - if "size" not in out1[0]: - out1[0]["size"] = None - return out1[0] - elif len(out1) > 1 or out: - return {"name": path, "size": 0, "type": "directory"} - else: - raise FileNotFoundError(path) - - def checksum(self, path): - """Unique value for current version of file - - If the checksum is the same from one moment to another, the contents - are guaranteed to be the same. If the checksum changes, the contents - *might* have changed. 
- - This should normally be overridden; default will probably capture - creation/modification timestamp (which would be good) or maybe - access timestamp (which would be bad) - """ - return int(tokenize(self.info(path)), 16) - - def size(self, path): - """Size in bytes of file""" - return self.info(path).get("size", None) - - def isdir(self, path): - """Is this entry directory-like?""" - try: - return self.info(path)["type"] == "directory" - except FileNotFoundError: - return False - - def isfile(self, path): - """Is this entry file-like?""" - try: - return self.info(path)["type"] == "file" - except: # noqa: E722 - return False - - def cat(self, path): - """ Get the content of a file """ - return self.open(path, "rb").read() - - def get(self, rpath, lpath, recursive=False, **kwargs): - """Copy file to local. - - Possible extension: maybe should be able to copy to any file-system - (streaming through local). - """ - rpath = self._strip_protocol(rpath) - if recursive: - rpaths = self.find(rpath) - lpaths = [ - os.path.join(lpath, path[len(rpath) :].lstrip("/")) for path in rpaths - ] - for lpath in lpaths: - dirname = os.path.dirname(lpath) - if not os.path.isdir(dirname): - os.makedirs(dirname) - else: - rpaths = [rpath] - lpaths = [lpath] - for lpath, rpath in zip(lpaths, rpaths): - with self.open(rpath, "rb", **kwargs) as f1: - with open(lpath, "wb") as f2: - data = True - while data: - data = f1.read(self.blocksize) - f2.write(data) - - def put(self, lpath, rpath, recursive=False, **kwargs): - """ Upload file from local """ - if recursive: - lpaths = [] - for dirname, subdirlist, filelist in os.walk(lpath): - lpaths += [os.path.join(dirname, filename) for filename in filelist] - rootdir = os.path.basename(lpath.rstrip("/")) - if self.exists(rpath): - # copy lpath inside rpath directory - rpath2 = os.path.join(rpath, rootdir) - else: - # copy lpath as rpath directory - rpath2 = rpath - rpaths = [ - os.path.join(rpath2, path[len(lpath) :].lstrip("/")) for path in lpaths - ] - else: - lpaths = [lpath] - rpaths = [rpath] - for lpath, rpath in zip(lpaths, rpaths): - with open(lpath, "rb") as f1: - with self.open(rpath, "wb", **kwargs) as f2: - data = True - while data: - data = f1.read(self.blocksize) - f2.write(data) - - def head(self, path, size=1024): - """ Get the first ``size`` bytes from file """ - with self.open(path, "rb") as f: - return f.read(size) - - def tail(self, path, size=1024): - """ Get the last ``size`` bytes from file """ - with self.open(path, "rb") as f: - f.seek(max(-size, -f.size), 2) - return f.read() - - def copy(self, path1, path2, **kwargs): - """ Copy within two locations in the filesystem""" - raise NotImplementedError - - def mv(self, path1, path2, **kwargs): - """ Move file from one location to another """ - self.copy(path1, path2, **kwargs) - self.rm(path1, recursive=False) - - def _rm(self, path): - """Delete a file""" - raise NotImplementedError - - def rm(self, path, recursive=False, maxdepth=None): - """Delete files. - - Parameters - ---------- - path: str or list of str - File(s) to delete. - recursive: bool - If file(s) are directories, recursively delete contents and then - also remove the directory - maxdepth: int or None - Depth to pass to walk for finding files to delete, if recursive. - If None, there will be no limit and infinite recursion may be - possible. 
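As a usage sketch of the discovery and deletion behaviour documented above (editorial example against the local backend; the temporary tree is arbitrary):

```python
import os, tempfile
import fsspec

fs = fsspec.filesystem("file")
root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "sub"))
with open(os.path.join(root, "sub", "a.txt"), "wb") as f:
    f.write(b"data")

print(fs.find(root))             # every file below root, like `find` with no conditions
fs.rm(root, recursive=True)      # remove contained files first, then the directories
print(fs.exists(root))           # False
```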
- """ - # prefer some bulk method, if possible - if not isinstance(path, list): - path = [path] - for p in path: - if recursive: - out = self.walk(p, maxdepth=maxdepth) - for pa_, _, files in reversed(list(out)): - for name in files: - fn = "/".join([pa_, name]) if pa_ else name - self.rm(fn) - self.rmdir(pa_) - else: - self._rm(p) - - @classmethod - def _parent(cls, path): - path = cls._strip_protocol(path.rstrip("/")) - if "/" in path: - return cls.root_marker + path.rsplit("/", 1)[0] - else: - return cls.root_marker - - def _open( - self, - path, - mode="rb", - block_size=None, - autocommit=True, - cache_options=None, - **kwargs - ): - """Return raw bytes-mode file-like from the file-system""" - return AbstractBufferedFile( - self, - path, - mode, - block_size, - autocommit, - cache_options=cache_options, - **kwargs - ) - - def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs): - """ - Return a file-like object from the filesystem - - The resultant instance must function correctly in a context ``with`` - block. - - Parameters - ---------- - path: str - Target file - mode: str like 'rb', 'w' - See builtin ``open()`` - block_size: int - Some indication of buffering - this is a value in bytes - cache_options : dict, optional - Extra arguments to pass through to the cache. - encoding, errors, newline: passed on to TextIOWrapper for text mode - """ - import io - - path = self._strip_protocol(path) - if "b" not in mode: - mode = mode.replace("t", "") + "b" - - text_kwargs = { - k: kwargs.pop(k) - for k in ["encoding", "errors", "newline"] - if k in kwargs - } - return io.TextIOWrapper( - self.open(path, mode, block_size, **kwargs), **text_kwargs - ) - else: - ac = kwargs.pop("autocommit", not self._intrans) - f = self._open( - path, - mode=mode, - block_size=block_size, - autocommit=ac, - cache_options=cache_options, - **kwargs - ) - if not ac: - self.transaction.files.append(f) - return f - - def touch(self, path, truncate=True, **kwargs): - """ Create empty file, or update timestamp - - Parameters - ---------- - path: str - file location - truncate: bool - If True, always set file size to 0; if False, update timestamp and - leave file unchanged, if backend allows this - """ - if truncate or not self.exists(path): - with self.open(path, "wb", **kwargs): - pass - else: - raise NotImplementedError # update timestamp, if possible - - def ukey(self, path): - """Hash of file properties, to tell if it has changed""" - return md5(str(self.info(path)).encode()).hexdigest() - - def read_block(self, fn, offset, length, delimiter=None): - """ Read a block of bytes from - - Starting at ``offset`` of the file, read ``length`` bytes. If - ``delimiter`` is set then we ensure that the read starts and stops at - delimiter boundaries that follow the locations ``offset`` and ``offset - + length``. If ``offset`` is zero then we start at zero. The - bytestring returned WILL include the end delimiter string. - - If offset+length is beyond the eof, reads to eof. - - Parameters - ---------- - fn: string - Path to filename - offset: int - Byte offset to start read - length: int - Number of bytes to read - delimiter: bytes (optional) - Ensure reading starts and stops at delimiter bytestring - - Examples - -------- - >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP - b'Alice, 100\\nBo' - >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP - b'Alice, 100\\nBob, 200\\n' - - Use ``length=None`` to read to the end of the file. 
- >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP - b'Alice, 100\\nBob, 200\\nCharlie, 300' - - See Also - -------- - utils.read_block - """ - with self.open(fn, "rb") as f: - size = f.size - if length is None: - length = size - if size is not None and offset + length > size: - length = size - offset - return read_block(f, offset, length, delimiter) - - def __reduce__(self): - return make_instance, (type(self), self.storage_args, self.storage_options) - - def _get_pyarrow_filesystem(self): - """ - Make a version of the FS instance which will be acceptable to pyarrow - """ - # all instances already also derive from pyarrow - return self - - def get_mapper(self, root, check=False, create=False): - """Create key/value store based on this file-system - - Makes a MutibleMapping interface to the FS at the given root path. - See ``fsspec.mapping.FSMap`` for further details. - """ - from .mapping import FSMap - - return FSMap(root, self, check, create) - - @classmethod - def clear_instance_cache(cls): - """ - Clear the cache of filesystem instances. - - Notes - ----- - Unless overridden by setting the ``cachable`` class attribute to False, - the filesystem class stores a reference to newly created instances. This - prevents Python's normal rules around garbage collection from working, - since the instances refcount will not drop to zero until - ``clear_instance_cache`` is called. - """ - cls._cache.clear() - - # ------------------------------------------------------------------------ - # Aliases - - def makedir(self, path, create_parents=True, **kwargs): - """Alias of :ref:`FilesystemSpec.mkdir`.""" - return self.mkdir(path, create_parents=create_parents, **kwargs) - - def mkdirs(self, path, exist_ok=False): - """Alias of :ref:`FilesystemSpec.makedirs`.""" - return self.makedirs(path, exist_ok=exist_ok) - - def listdir(self, path, detail=True, **kwargs): - """Alias of :ref:`FilesystemSpec.ls`.""" - return self.ls(path, detail=detail, **kwargs) - - def cp(self, path1, path2, **kwargs): - """Alias of :ref:`FilesystemSpec.copy`.""" - return self.copy(path1, path2, **kwargs) - - def move(self, path1, path2, **kwargs): - """Alias of :ref:`FilesystemSpec.mv`.""" - return self.mv(path1, path2, **kwargs) - - def stat(self, path, **kwargs): - """Alias of :ref:`FilesystemSpec.info`.""" - return self.info(path, **kwargs) - - def disk_usage(self, path, total=True, maxdepth=None, **kwargs): - """Alias of :ref:`FilesystemSpec.du`.""" - return self.du(path, total=total, maxdepth=maxdepth, **kwargs) - - def rename(self, path1, path2, **kwargs): - """Alias of :ref:`FilesystemSpec.mv`.""" - return self.mv(path1, path2, **kwargs) - - def delete(self, path, recursive=False, maxdepth=None): - """Alias of :ref:`FilesystemSpec.rm`.""" - return self.rm(path, recursive=recursive, maxdepth=maxdepth) - - def upload(self, lpath, rpath, recursive=False, **kwargs): - """Alias of :ref:`FilesystemSpec.put`.""" - return self.put(lpath, rpath, recursive=recursive, **kwargs) - - def download(self, rpath, lpath, recursive=False, **kwargs): - """Alias of :ref:`FilesystemSpec.get`.""" - return self.get(rpath, lpath, recursive=recursive, **kwargs) - - -class AbstractBufferedFile(io.IOBase): - """Convenient class to derive from to provide buffering - - In the case that the backend does not provide a pythonic file-like object - already, this class contains much of the logic to build one. The only - methods that need to be overridden are ``_upload_chunk``, - ``_initate_upload`` and ``_fetch_range``. 
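To illustrate the extension points named in this class docstring, a minimal read-only file class could override just ``_fetch_range`` (a sketch only; ``get_range`` is a hypothetical backend call, not an fsspec API):

```python
from fsspec.spec import AbstractBufferedFile

class RangeFile(AbstractBufferedFile):
    """Read-only buffered file for a hypothetical range-serving backend."""

    def _fetch_range(self, start, end):
        # Called by the read cache whenever the bytes [start, end) are needed;
        # self.fs is the owning filesystem and self.path the remote location.
        return self.fs.get_range(self.path, start, end)   # placeholder backend call

# A writable variant would additionally override _initiate_upload and
# _upload_chunk, which flush() calls once the internal buffer reaches blocksize.
```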
- """ - - DEFAULT_BLOCK_SIZE = 5 * 2 ** 20 - - def __init__( - self, - fs, - path, - mode="rb", - block_size="default", - autocommit=True, - cache_type="readahead", - cache_options=None, - **kwargs - ): - """ - Template for files with buffered reading and writing - - Parameters - ---------- - fs: instance of FileSystem - path: str - location in file-system - mode: str - Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file - systems may be read-only, and some may not support append. - block_size: int - Buffer size for reading or writing, 'default' for class default - autocommit: bool - Whether to write to final destination; may only impact what - happens when file is being closed. - cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead" - Caching policy in read mode. See the definitions in ``core``. - cache_options : dict - Additional options passed to the constructor for the cache specified - by `cache_type`. - kwargs: - Gets stored as self.kwargs - """ - from .core import caches - - self.path = path - self.fs = fs - self.mode = mode - self.blocksize = ( - self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size - ) - self.loc = 0 - self.autocommit = autocommit - self.end = None - self.start = None - self.closed = False - - if cache_options is None: - cache_options = {} - - if "trim" in kwargs: - warnings.warn( - "Passing 'trim' to control the cache behavior has been deprecated. " - "Specify it within the 'cache_options' argument instead.", - FutureWarning, - ) - cache_options["trim"] = kwargs.pop("trim") - - self.kwargs = kwargs - - if mode not in {"ab", "rb", "wb"}: - raise NotImplementedError("File mode not supported") - if mode == "rb": - if not hasattr(self, "details"): - self.details = fs.info(path) - self.size = self.details["size"] - self.cache = caches[cache_type]( - self.blocksize, self._fetch_range, self.size, **cache_options - ) - else: - self.buffer = io.BytesIO() - self.offset = None - self.forced = False - self.location = None - - @property - def closed(self): - # get around this attr being read-only in IOBase - return self._closed - - @closed.setter - def closed(self, c): - self._closed = c - - def __hash__(self): - if "w" in self.mode: - return id(self) - else: - return int(tokenize(self.details), 16) - - def __eq__(self, other): - """Files are equal if they have the same checksum, only in read mode""" - return self.mode == "rb" and other.mode == "rb" and hash(self) == hash(other) - - def commit(self): - """Move from temp to final destination""" - - def discard(self): - """Throw away temporary file""" - - def info(self): - """ File information about this path """ - if "r" in self.mode: - return self.details - else: - raise ValueError("Info not available while writing") - - def tell(self): - """ Current file location """ - return self.loc - - def seek(self, loc, whence=0): - """ Set current file location - - Parameters - ---------- - loc: int - byte location - whence: {0, 1, 2} - from start of file, current location or end of file, resp. - """ - loc = int(loc) - if not self.mode == "rb": - raise ValueError("Seek only available in read mode") - if whence == 0: - nloc = loc - elif whence == 1: - nloc = self.loc + loc - elif whence == 2: - nloc = self.size + loc - else: - raise ValueError("invalid whence (%s, should be 0, 1 or 2)" % whence) - if nloc < 0: - raise ValueError("Seek before start of file") - self.loc = nloc - return self.loc - - def write(self, data): - """ - Write data to buffer. 
- - Buffer only sent on flush() or if buffer is greater than - or equal to blocksize. - - Parameters - ---------- - data: bytes - Set of bytes to be written. - """ - if self.mode not in {"wb", "ab"}: - raise ValueError("File not in write mode") - if self.closed: - raise ValueError("I/O operation on closed file.") - if self.forced: - raise ValueError("This file has been force-flushed, can only close") - out = self.buffer.write(data) - self.loc += out - if self.buffer.tell() >= self.blocksize: - self.flush() - return out - - def flush(self, force=False): - """ - Write buffered data to backend store. - - Writes the current buffer, if it is larger than the block-size, or if - the file is being closed. - - Parameters - ---------- - force: bool - When closing, write the last block even if it is smaller than - blocks are allowed to be. Disallows further writing to this file. - """ - - if self.closed: - raise ValueError("Flush on closed file") - if force and self.forced: - raise ValueError("Force flush cannot be called more than once") - if force: - self.forced = True - - if self.mode not in {"wb", "ab"}: - # no-op to flush on read-mode - return - - if not force and self.buffer.tell() < self.blocksize: - # Defer write on small block - return - - if self.offset is None: - # Initialize a multipart upload - self.offset = 0 - self._initiate_upload() - - if self._upload_chunk(final=force) is not False: - self.offset += self.buffer.seek(0, 2) - self.buffer = io.BytesIO() - - def _upload_chunk(self, final=False): - """ Write one part of a multi-block file upload - - Parameters - ========== - final: bool - This is the last block, so should complete file, if - self.autocommit is True. - """ - # may not yet have been initialized, may neet to call _initialize_upload - - def _initiate_upload(self): - """ Create remote file/upload """ - pass - - def _fetch_range(self, start, end): - """Get the specified set of bytes from remote""" - raise NotImplementedError - - def read(self, length=-1): - """ - Return data from cache, or fetch pieces as necessary - - Parameters - ---------- - length: int (-1) - Number of bytes to read; if <0, all remaining bytes. - """ - length = -1 if length is None else int(length) - if self.mode != "rb": - raise ValueError("File not in read mode") - if length < 0: - length = self.size - self.loc - if self.closed: - raise ValueError("I/O operation on closed file.") - logger.debug("%s read: %i - %i" % (self, self.loc, self.loc + length)) - if length == 0: - # don't even bother calling fetch - return b"" - out = self.cache._fetch(self.loc, self.loc + length) - self.loc += len(out) - return out - - def readinto(self, b): - """mirrors builtin file's readinto method - - https://docs.python.org/3/library/io.html#io.RawIOBase.readinto - """ - data = self.read(len(b)) - b[: len(data)] = data - return len(data) - - def readuntil(self, char=b"\n", blocks=None): - """Return data between current position and first occurrence of char - - char is included in the output, except if the end of the tile is - encountered first. - - Parameters - ---------- - char: bytes - Thing to find - blocks: None or int - How much to read in each go. Defaults to file blocksize - which may - mean a new read on every call. 
- """ - out = [] - while True: - start = self.tell() - part = self.read(blocks or self.blocksize) - if len(part) == 0: - break - found = part.find(char) - if found > -1: - out.append(part[: found + len(char)]) - self.seek(start + found + len(char)) - break - out.append(part) - return b"".join(out) - - def readline(self): - """Read until first occurrence of newline character - - Note that, because of character encoding, this is not necessarily a - true line ending. - """ - return self.readuntil(b"\n") - - def __next__(self): - out = self.readline() - if out: - return out - raise StopIteration - - def __iter__(self): - return self - - def readlines(self): - """Return all data, split by the newline character""" - data = self.read() - lines = data.split(b"\n") - out = [l + b"\n" for l in lines[:-1]] - if data.endswith(b"\n"): - return out - else: - return out + [lines[-1]] - # return list(self) ??? - - def readinto1(self, b): - return self.readinto(b) - - def close(self): - """ Close file - - Finalizes writes, discards cache - """ - if self.closed: - return - if self.mode == "rb": - self.cache = None - else: - if not self.forced: - self.flush(force=True) - - if self.fs is not None: - self.fs.invalidate_cache(self.path) - self.fs.invalidate_cache(self.fs._parent(self.path)) - - self.closed = True - - def readable(self): - """Whether opened for reading""" - return self.mode == "rb" and not self.closed - - def seekable(self): - """Whether is seekable (only in read mode)""" - return self.readable() - - def writable(self): - """Whether opened for writing""" - return self.mode in {"wb", "ab"} and not self.closed - - def __del__(self): - self.close() - - def __str__(self): - return "" % (type(self.fs).__name__, self.path) - - __repr__ = __str__ - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/transaction.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/transaction.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/transaction.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/transaction.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,80 +0,0 @@ -class Transaction(object): - """Filesystem transaction write context - - Gathers files for deferred commit or discard, so that several write - operations can be finalized semi-atomically. 
This works by having this - instance as the ``.transaction`` attribute of the given filesystem - """ - - def __init__(self, fs): - """ - Parameters - ---------- - fs: FileSystem instance - """ - self.fs = fs - self.files = [] - - def __enter__(self): - self.start() - - def __exit__(self, exc_type, exc_val, exc_tb): - """End transaction and commit, if exit is not due to exception""" - # only commit if there was no exception - self.complete(commit=exc_type is None) - self.fs._intrans = False - self.fs._transaction = None - - def start(self): - """Start a transaction on this FileSystem""" - self.fs._intrans = True - - def complete(self, commit=True): - """Finish transaction: commit or discard all deferred files""" - for f in self.files: - if commit: - f.commit() - else: - f.discard() - self.files = [] - self.fs._intrans = False - - -class FileActor(object): - def __init__(self): - self.files = [] - - def commit(self): - for f in self.files: - f.commit() - self.files.clear() - - def discard(self): - for f in self.files: - f.discard() - self.files.clear() - - def append(self, f): - self.files.append(f) - - -class DaskTransaction(Transaction): - def __init__(self, fs): - """ - Parameters - ---------- - fs: FileSystem instance - """ - import distributed - - super().__init__(fs) - client = distributed.default_client() - self.files = client.submit(FileActor, actor=True).result() - - def complete(self, commit=True): - """Finish transaction: commit or discard all deferred files""" - if commit: - self.files.commit().result() - else: - self.files.discard().result() - self.fs._intrans = False diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/utils.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/utils.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/utils.py 2019-11-13 16:37:40.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/utils.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,304 +0,0 @@ -from hashlib import md5 -import math -import os -import pathlib -import re -from urllib.parse import urlsplit - - -DEFAULT_BLOCK_SIZE = 5 * 2 ** 20 - - -def infer_storage_options(urlpath, inherit_storage_options=None): - """ Infer storage options from URL path and merge it with existing storage - options. - - Parameters - ---------- - urlpath: str or unicode - Either local absolute file path or URL (hdfs://namenode:8020/file.csv) - inherit_storage_options: dict (optional) - Its contents will get merged with the inferred information from the - given path - - Returns - ------- - Storage options dict. - - Examples - -------- - >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP - {"protocol": "file", "path", "/mnt/datasets/test.csv"} - >>> infer_storage_options( - ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1', - ... 
inherit_storage_options={'extra': 'value'}) # doctest: +SKIP - {"protocol": "hdfs", "username": "username", "password": "pwd", - "host": "node", "port": 123, "path": "/mnt/datasets/test.csv", - "url_query": "q=1", "extra": "value"} - """ - # Handle Windows paths including disk name in this special case - if re.match(r"^[a-zA-Z]:[\\/]", urlpath): - return {"protocol": "file", "path": urlpath} - - parsed_path = urlsplit(urlpath) - protocol = parsed_path.scheme or "file" - if parsed_path.fragment: - path = "#".join([parsed_path.path, parsed_path.fragment]) - else: - path = parsed_path.path - if protocol == "file": - # Special case parsing file protocol URL on Windows according to: - # https://msdn.microsoft.com/en-us/library/jj710207.aspx - windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) - if windows_path: - path = "%s:%s" % windows_path.groups() - - if protocol in ["http", "https"]: - # for HTTP, we don't want to parse, as requests will anyway - return {"protocol": protocol, "path": urlpath} - - options = {"protocol": protocol, "path": path} - - if parsed_path.netloc: - # Parse `hostname` from netloc manually because `parsed_path.hostname` - # lowercases the hostname which is not always desirable (e.g. in S3): - # https://github.com/dask/dask/issues/1417 - options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] - - if protocol in ("s3", "gcs", "gs"): - options["path"] = options["host"] + options["path"] - else: - options["host"] = options["host"] - if parsed_path.port: - options["port"] = parsed_path.port - if parsed_path.username: - options["username"] = parsed_path.username - if parsed_path.password: - options["password"] = parsed_path.password - - if parsed_path.query: - options["url_query"] = parsed_path.query - if parsed_path.fragment: - options["url_fragment"] = parsed_path.fragment - - if inherit_storage_options: - update_storage_options(options, inherit_storage_options) - - return options - - -def update_storage_options(options, inherited=None): - if not inherited: - inherited = {} - collisions = set(options) & set(inherited) - if collisions: - collisions = "\n".join("- %r" % k for k in collisions) - raise KeyError( - "Collision between inferred and specified storage " - "options:\n%s" % collisions - ) - options.update(inherited) - - -# Compression extensions registered via fsspec.compression.register_compression -compressions = {} - - -def infer_compression(filename): - """Infer compression, if available, from filename. - - Infer a named compression type, if registered and available, from filename - extension. This includes builtin (gz, bz2, zip) compressions, as well as - optional compressions. See fsspec.compression.register_compression. 
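For instance, under the extension-registry behaviour described above (a small editorial illustration; the built-in codecs are registered when ``fsspec`` is imported):

```python
from fsspec.utils import infer_compression

infer_compression("logs/app.log.gz")   # "gzip": built-in codec matched by extension
infer_compression("logs/app.log")      # None: no registered compression for this name
```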
- """ - extension = os.path.splitext(filename)[-1].strip(".") - if extension in compressions: - return compressions[extension] - - -def build_name_function(max_int): - """ Returns a function that receives a single integer - and returns it as a string padded by enough zero characters - to align with maximum possible integer - - >>> name_f = build_name_function(57) - - >>> name_f(7) - '07' - >>> name_f(31) - '31' - >>> build_name_function(1000)(42) - '0042' - >>> build_name_function(999)(42) - '042' - >>> build_name_function(0)(0) - '0' - """ - # handle corner cases max_int is 0 or exact power of 10 - max_int += 1e-8 - - pad_length = int(math.ceil(math.log10(max_int))) - - def name_function(i): - return str(i).zfill(pad_length) - - return name_function - - -def seek_delimiter(file, delimiter, blocksize): - r"""Seek current file to file start, file end, or byte after delimiter seq. - - Seeks file to next chunk delimiter, where chunks are defined on file start, - a delimiting sequence, and file end. Use file.tell() to see location afterwards. - Note that file start is a valid split, so must be at offset > 0 to seek for - delimiter. - - Parameters - ---------- - file: a file - delimiter: bytes - a delimiter like ``b'\n'`` or message sentinel, matching file .read() type - blocksize: int - Number of bytes to read from the file at once. - - - Returns - ------- - Returns True if a delimiter was found, False if at file start or end. - - """ - - if file.tell() == 0: - # beginning-of-file, return without seek - return False - - # Interface is for binary IO, with delimiter as bytes, but initialize last - # with result of file.read to preserve compatibility with text IO. - last = None - while True: - current = file.read(blocksize) - if not current: - # end-of-file without delimiter - return False - full = last + current if last else current - try: - if delimiter in full: - i = full.index(delimiter) - file.seek(file.tell() - (len(full) - i) + len(delimiter)) - return True - elif len(current) < blocksize: - # end-of-file without delimiter - return False - except (OSError, ValueError): - pass - last = full[-len(delimiter) :] - - -def read_block(f, offset, length, delimiter=None, split_before=False): - """ Read a block of bytes from a file - - Parameters - ---------- - f: File - Open file - offset: int - Byte offset to start read - length: int - Number of bytes to read, read through end of file if None - delimiter: bytes (optional) - Ensure reading starts and stops at delimiter bytestring - split_before: bool (optional) - Start/stop read *before* delimiter bytestring. - - - If using the ``delimiter=`` keyword argument we ensure that the read - starts and stops at delimiter boundaries that follow the locations - ``offset`` and ``offset + length``. If ``offset`` is zero then we - start at zero, regardless of delimiter. The bytestring returned WILL - include the terminating delimiter string. 
- - Examples - -------- - - >>> from io import BytesIO # doctest: +SKIP - >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP - >>> read_block(f, 0, 13) # doctest: +SKIP - b'Alice, 100\\nBo' - - >>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP - b'Alice, 100\\nBob, 200\\n' - - >>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP - b'Bob, 200\\nCharlie, 300' - """ - if delimiter: - f.seek(offset) - found_start_delim = seek_delimiter(f, delimiter, 2 ** 16) - if length is None: - return f.read() - start = f.tell() - length -= start - offset - - f.seek(start + length) - found_end_delim = seek_delimiter(f, delimiter, 2 ** 16) - end = f.tell() - - # Adjust split location to before delimiter iff seek found the - # delimiter sequence, not start or end of file. - if found_start_delim and split_before: - start -= len(delimiter) - - if found_end_delim and split_before: - end -= len(delimiter) - - offset = start - length = end - start - - f.seek(offset) - b = f.read(length) - return b - - -def tokenize(*args, **kwargs): - """ Deterministic token - - (modified from dask.base) - - >>> tokenize([1, 2, '3']) - '9d71491b50023b06fc76928e6eddb952' - - >>> tokenize('Hello') == tokenize('Hello') - True - """ - if kwargs: - args += (kwargs,) - return md5(str(args).encode()).hexdigest() - - -def stringify_path(filepath): - """ Attempt to convert a path-like object to a string. - - Parameters - ---------- - filepath: object to be converted - - Returns - ------- - filepath_str: maybe a string version of the object - - Notes - ----- - Objects supporting the fspath protocol (Python 3.6+) are coerced - according to its __fspath__ method. - - For backwards compatibility with older Python version, pathlib.Path - objects are specially coerced. - - Any other object is passed through unchanged, which includes bytes, - strings, buffers, or anything else that's not even path-like. - """ - if hasattr(filepath, "__fspath__"): - return filepath.__fspath__() - elif isinstance(filepath, pathlib.Path): - return str(filepath) - return filepath diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/_version.py fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/_version.py --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/_version.py 2019-11-27 17:39:38.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec/_version.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ - -# This file was generated by 'versioneer.py' (0.18) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. Distribution tarballs contain a pre-generated copy -# of this file. 
- -import json - -version_json = ''' -{ - "date": "2019-11-13T10:37:40-0600", - "dirty": false, - "error": null, - "full-revisionid": "8b59dc8c2c035db5793102b9513c46e6a1bd4fb0", - "version": "0.6.0" -} -''' # END VERSION_JSON - - -def get_versions(): - return json.loads(version_json) diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt 2019-11-27 17:39:38.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/dependency_links.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe 2019-11-27 06:34:55.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/not-zip-safe 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ - diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO 2019-11-27 17:39:38.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/PKG-INFO 1970-01-01 00:00:00.000000000 +0000 @@ -1,93 +0,0 @@ -Metadata-Version: 2.1 -Name: fsspec -Version: 0.6.0 -Summary: File-system specification -Home-page: http://github.com/intake/filesystem_spec -Maintainer: Martin Durant -Maintainer-email: mdurant@anaconda.com -License: BSD -Description: # filesystem_spec - - [![Build Status](https://travis-ci.org/intake/filesystem_spec.svg?branch=master)](https://travis-ci.org/martindurant/filesystem_spec) - [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) - - A specification for pythonic filesystems. - - ## Install - - ```bash - pip install fsspec - ``` - or - ```bash - conda install -c conda-forge fsspec - ``` - - ## Purpose - - To produce a template or specification for a file-system interface, that specific implementations should follow, - so that applications making use of them can rely on a common behaviour and not have to worry about the specific - internal implementation decisions with any given backend. Many such implementations are included in this package, - or in sister projects such as `s3fs` and `gcsfs`. - - In addition, if this is well-designed, then additional functionality, such as a key-value store or FUSE - mounting of the file-system implementation may be available for all implementations "for free". - - ## Documentation - - Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) - - ## Develop - - fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and - [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test - environments. 
First, install conda with tox and tox-conda in a base environment - (eg. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be - used to configure a development environment and run tests. - - First, setup a development conda environment via `tox -e dev`. This will - install fspec dependencies, test & dev tools, and install fsspec in develop - mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`. - - ### Testing - - Tests can be run directly in the activated dev environment via `pytest fsspec`. - - The full fsspec test suite can be run via `tox`, which will setup and execute - tests against multiple dependency versions in isolated environment. Run `tox - -av` to list available test environments, select environments via `tox -e `. - - The full fsspec suite requires a system-level docker, docker-compose, and fuse - installation. See `ci/install.sh` for a detailed installation example. - - ### Code Formatting - - fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure - a consistent code format throughout the project. ``black`` is automatically - installed in the tox dev env, activated via `conda activate .tox/dev`. - - Then, run `black fsspec` from the root of the filesystem_spec repository to - auto-format your code. Additionally, many editors have plugins that will apply - `black` as you edit files. - - Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com) to - automatically run `black` when you make a git commit. ``black`` is automatically - installed in the tox dev env, activated via `conda activate .tox/dev`. - - Then, run `pre-commit install --install-hooks` from the root of the - filesystem_spec repository to setup pre-commit hooks. `black` will now be run - before you commit, reformatting any changed files. You can format without - committing via `pre-commit run` or skip these checks with `git commit - --no-verify`. 
- -Keywords: file -Platform: UNKNOWN -Classifier: Development Status :: 4 - Beta -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: BSD License -Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python :: 3.5 -Classifier: Programming Language :: Python :: 3.6 -Classifier: Programming Language :: Python :: 3.7 -Requires-Python: >=3.5 -Description-Content-Type: text/markdown diff -Nru fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/top_level.txt fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/top_level.txt --- fsspec-0.6.1/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/top_level.txt 2019-11-27 17:39:38.000000000 +0000 +++ fsspec-0.8.4/debian/python3-python-fsspec/usr/lib/python3.8/dist-packages/fsspec-0.6.0.egg-info/top_level.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -fsspec diff -Nru fsspec-0.6.1/debian/salsa-ci.yml fsspec-0.8.4/debian/salsa-ci.yml --- fsspec-0.6.1/debian/salsa-ci.yml 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/debian/salsa-ci.yml 2020-10-16 16:36:59.000000000 +0000 @@ -0,0 +1,4 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff -Nru fsspec-0.6.1/debian/upstream/metadata fsspec-0.8.4/debian/upstream/metadata --- fsspec-0.6.1/debian/upstream/metadata 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/debian/upstream/metadata 2020-10-15 19:30:12.000000000 +0000 @@ -0,0 +1,5 @@ +--- +Bug-Database: https://github.com/intake/filesystem_spec/issues +Bug-Submit: https://github.com/intake/filesystem_spec/issues/new +Repository: https://github.com/intake/filesystem_spec.git +Repository-Browse: https://github.com/intake/filesystem_spec diff -Nru fsspec-0.6.1/docs/environment.yml fsspec-0.8.4/docs/environment.yml --- fsspec-0.6.1/docs/environment.yml 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/docs/environment.yml 2020-10-14 16:51:19.000000000 +0000 @@ -1,9 +1,8 @@ name: fsspec channels: - defaults - - conda-forge dependencies: - - python=3.6 + - python=3.7 - paramiko - requests - numpydoc diff -Nru fsspec-0.6.1/docs/source/api.rst fsspec-0.8.4/docs/source/api.rst --- fsspec-0.6.1/docs/source/api.rst 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/docs/source/api.rst 2020-10-14 16:51:19.000000000 +0000 @@ -9,17 +9,22 @@ .. autosummary:: fsspec.open_files fsspec.open + fsspec.open_local fsspec.filesystem fsspec.get_filesystem_class fsspec.get_mapper fsspec.fuse.run + fsspec.gui.FileSelector .. autofunction:: fsspec.open_files .. autofunction:: fsspec.open +.. autofunction:: fsspec.open_local .. autofunction:: fsspec.filesystem .. autofunction:: fsspec.get_filesystem_class .. autofunction:: fsspec.get_mapper .. autofunction:: fsspec.fuse.run +.. autoclass:: fsspec.gui.FileSelector + :members: Base Classes ------------ @@ -29,10 +34,17 @@ fsspec.spec.Transaction fsspec.spec.AbstractBufferedFile fsspec.FSMap + fsspec.asyn.AsyncFileSystem fsspec.core.OpenFile + fsspec.core.OpenFiles fsspec.core.BaseCache + fsspec.core.get_fs_token_paths + fsspec.dircache.DirCache + fsspec.registry.ReadOnlyRegistry + fsspec.registry.register_implementation .. autoclass:: fsspec.spec.AbstractFileSystem + :members: .. autoclass:: fsspec.spec.Transaction :members: @@ -40,15 +52,28 @@ .. autoclass:: fsspec.spec.AbstractBufferedFile :members: +.. 
autoclass:: fsspec.asyn.AsyncFileSystem + .. autoclass:: fsspec.FSMap :members: .. autoclass:: fsspec.core.OpenFile :members: +.. autoclass:: fsspec.core.OpenFiles + .. autoclass:: fsspec.core.BaseCache :members: +.. autofunction:: fsspec.core.get_fs_token_paths + +.. autoclass:: fsspec.dircache.DirCache + :members: __init__ + +.. autoclass:: fsspec.registry.ReadOnlyRegistry + :members: __init__ + +.. autofunction:: fsspec.registry.register_implementation .. _implementations: @@ -58,14 +83,20 @@ .. autosummary:: fsspec.implementations.ftp.FTPFileSystem fsspec.implementations.hdfs.PyArrowHDFS + fsspec.implementations.dask.DaskWorkerFileSystem fsspec.implementations.http.HTTPFileSystem fsspec.implementations.local.LocalFileSystem fsspec.implementations.memory.MemoryFileSystem + fsspec.implementations.github.GithubFileSystem fsspec.implementations.sftp.SFTPFileSystem fsspec.implementations.webhdfs.WebHDFS fsspec.implementations.zip.ZipFileSystem fsspec.implementations.cached.CachingFileSystem fsspec.implementations.cached.WholeFileCacheFileSystem + fsspec.implementations.cached.SimpleCacheFileSystem + fsspec.implementations.git.GitFileSystem + fsspec.implementations.smb.SMBFileSystem + fsspec.implementations.jupyter.JupyterFileSystem .. autoclass:: fsspec.implementations.ftp.FTPFileSystem :members: __init__ @@ -73,11 +104,14 @@ .. autoclass:: fsspec.implementations.hdfs.PyArrowHDFS :members: __init__ +.. autoclass:: fsspec.implementations.dask.DaskWorkerFileSystem + :members: __init__ + .. autoclass:: fsspec.implementations.http.HTTPFileSystem :members: __init__ .. autoclass:: fsspec.implementations.local.LocalFileSystem - :members: + :members: __init__ .. autoclass:: fsspec.implementations.memory.MemoryFileSystem :members: __init__ @@ -95,6 +129,39 @@ :members: __init__ .. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem + :members: __init__ + +.. autoclass:: fsspec.implementations.cached.SimpleCacheFileSystem + :members: __init__ + +.. autoclass:: fsspec.implementations.github.GithubFileSystem + :members: __init__ + +.. autoclass:: fsspec.implementations.git.GitFileSystem + :members: __init__ + +.. autoclass:: fsspec.implementations.smb.SMBFileSystem + :members: __init__ + +.. autoclass:: fsspec.implementations.jupyter.JupyterFileSystem + :members: __init__ + +Other Known Implementations +--------------------------- + +- `s3fs`_ for Amazon S3 and other compatible stores +- `gcsfs`_ for Google Cloud Storage +- `adl`_ for Azure DataLake storage +- `abfs`_ for Azure Blob service +- `dropbox`_ for access to dropbox shares +- `gdrive`_ to access Google Drive and shares (experimental) + +.. _s3fs: https://s3fs.readthedocs.io/en/latest/ +.. _gcsfs: https://gcsfs.readthedocs.io/en/latest/ +.. _adl: https://github.com/dask/adlfs +.. _abfs: https://github.com/dask/adlfs +.. _dropbox: https://github.com/MarineChap/intake_dropbox +.. _gdrive: https://github.com/intake/gdrivefs .. 
_readbuffering: diff -Nru fsspec-0.6.1/docs/source/changelog.rst fsspec-0.8.4/docs/source/changelog.rst --- fsspec-0.6.1/docs/source/changelog.rst 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/docs/source/changelog.rst 2020-10-14 16:51:19.000000000 +0000 @@ -1,6 +1,137 @@ Changelog ========= +Version 0.8.4 +------------- + +Features: + +- function ``can_be_local`` to see whether URL is compatible with ``open_local`` +- concurrent cat with filecaches, if backend supports it + +Fixes: + +- dircache expiry after transaction +- blockcache garbage collection +- close for HDFS +- windows tests +- glob depth with "**" + +Version 0.8.3 +------------- + +Features: + +- error options for cat +- memory fs created time in detailed `ls` + + +Fixes: + +- duplicate directories could appear in MemoreFileSystem +- Added support for hat dollar lbrace rbrace regex character escapes in glob +- Fix blockcache (was doing unnecessary work) +- handle multibyte dtypes in readinto +- Fix missing kwargs in call to _copy in asyn + +Other: + +- Stop inheriting from pyarrow.filesystem for pyarrow>=2.0 +- Raise low-level program friendly OSError. +- Guard against instance reuse in new processes +- Make hash_name a method on CachingFileSystem to make it easier to change. +- Use get_event_loop for py3.6 compatibility + +Version 0.8.2 +------------- + +Fixes: + +- More careful strip for caching + +Version 0.8.1 +------------- + +Features: + +- add sign to base class +- Allow calling of coroutines from normal code when running async +- Implement writing for cached many files +- Allow concurrent caching of remote files +- Add gdrive:// protocol + +Fixes: + +- Fix memfs with exact ls +- HTTPFileSystem requires requests and aiohttp in registry + +Other: + +- Allow http kwargs to clientSession +- Use extras_require in setup.py for optional dependencies +- Replacing md5 with sha256 for hash (CVE req) +- Test against Python 3.8, drop 3.5 testing +- add az alias for abfs + +Version 0.8.0 +------------- + +Major release allowing async implementations with concurrent batch +operations. 
+ +Features: + +- async filesystem spec, first applied to HTTP +- OpenFiles cContext for multiple files +- Document async, and ensure docstrings +- Make LocalFileOpener iterable +- handle smb:// protocol using smbprotocol package +- allow Path object in open +- simplecache write mode + +Fixes: + +- test_local: fix username not in home path +- Tighten cacheFS if dir deleted +- Fix race condition of lzma import when using threads +- properly rewind MemoryFile +- OpenFile newline in reduce + +Other: + +- Add aiobotocore to deps for s3fs check +- Set default clobber=True on impl register +- Use _get_kwargs_from_url when unchaining +- Add cache_type and cache_options to HTTPFileSystem constructor + +Version 0.7.5 +------------- + +* async implemented for HTTP as prototype (read-only) +* write for simplecache +* added SMB (Samba, protocol >=2) implementation + +Version 0.7.4 +------------- + +* panel-based GUI + +0.7.3 series +------------ + +* added ``git`` and ``github`` interfaces +* added chained syntax for open, open_files and get_mapper +* adapt webHDFS for HttpFS +* added open_local +* added ``simplecache``, and compression to both file caches + + +Version 0.6.2 +------------- + +* Added ``adl`` and ``abfs`` protocols to the known implementations registry (:pr:`209`) +* Fixed issue with whole-file caching and implementations providing multiple protocols (:pr:`219`) + Version 0.6.1 ------------- @@ -28,4 +159,4 @@ * Improved handling of requests for :class:`fsspec.implementations.http.HTTPFileSystem` when the HTTP server responds with an (incorrect) content-length of 0 (:pr:`163`) * Added a ``detail=True`` parameter to :meth:`fsspec.spec.AbstractFileSystem.ls` (:pr:`168`) -* Fixed handling of UNC/DFS paths (:issue:`154`) \ No newline at end of file +* Fixed handling of UNC/DFS paths (:issue:`154`) diff -Nru fsspec-0.6.1/docs/source/developer.rst fsspec-0.8.4/docs/source/developer.rst --- fsspec-0.6.1/docs/source/developer.rst 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/docs/source/developer.rst 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,80 @@ +Developing with fsspec +---------------------- + +Whereas the majority of the documentation describes the use of ``fsspec`` +from the end-user's point of view, ``fsspec`` is used by many libraries +as the primary/only interface to file operations. + +Clients of the library +~~~~~~~~~~~~~~~~~~~~~~ + +The most common entrance point for libraries which wish to rely on ``fsspec`` +will be ``open`` or ``open_files``, as a way of generating an object compatible +with the python file interface. This actually produces an ``OpenFile`` instance, +which can be serialised across a network, and resources are only engaged when +entering a context, e.g. + +.. code-block:: python + + with fsspec.open("protocol://path", 'rb', param=value) as f: + process_file(f) + +Note the backend-specific parameters that can be passed in this call. + +In cases where the caller wants to control the context directly, they can use the +``open`` method of the ``OpenFile``, or get the filesystem object directly, +skipping the ``OpenFile`` route. In the latter case, text encoding and compression +or **not** handled for you. The file-like object can also be used as a context +manager, or the ``close()`` method must be called explicitly to release resources. + +.. 
code-block:: python + + # OpenFile route + of = fsspec.open("protocol://path", 'rb', param=value) + f = of.open() + process_file(f) + f.close() + + # filesystem class route, context + fs = fsspec.filesystem("protocol", param=value) + with fs.open("path", "rb") as f: + process_file(f) + + # filesystem class route, explicit close + fs = fsspec.filesystem("protocol", param=value) + f = fs.open("path", "rb") + process_file(f) + f.close() + +Implementing a backend +~~~~~~~~~~~~~~~~~~~~~~ + +The class ``AbstractFileSystem`` provides a template of the methods +that a potential implementation should supply, as well as default +implementation of functionality that depends on these. Methods that +*could* be implemented are marked with ``NotImplementedError`` or +``pass`` (the patter specifically for directory operations that might +not be required for some backends where directories are emulated. + +Note that not all of the methods need to be implemented: for example, +some implementations may be read-only, in which case things like ``pipe``, +``put``, ``touch``, ``rm``, etc., can be left as not-implemented +(or you might implement them are raise PermissionError, OSError 30 or some +read-only exception). + +We may eventually refactor ``AbstractFileSystem`` to split the default implementation, +the set of methods that you might implement in a new backend, and the +documented end-user API. + +For now, new backends must register themselves on import +(``register_implementation``) or post a PR to the ``fsspec`` repo +asking to be included in ``fsspec.registry.known_implementations``. + +Implementing async +~~~~~~~~~~~~~~~~~~ + +Starting in version 0.7.5, we provide async operations for some methods +of some implementations. + +This section will contain details on how to implement backends offering +async, once the details are ironed out on our end. diff -Nru fsspec-0.6.1/docs/source/features.rst fsspec-0.8.4/docs/source/features.rst --- fsspec-0.6.1/docs/source/features.rst 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/docs/source/features.rst 2020-10-14 16:51:19.000000000 +0000 @@ -11,10 +11,10 @@ .. _s3fs: https://s3fs.readthedocs.io/en/latest/ .. _gcsfs: https://gcsfs.readthedocs.io/en/latest/ .. _hdfs3: https://hdfs3.readthedocs.io/en/latest/ -.. _adlfs: https://azure-datalake-store.readthedocs.io/en/latest/ +.. _adlfs: https://docs.microsoft.com/en-us/azure/data-lake-store/ -Here follows a brief description of some features of note of ``fsspec`` that promide to make -it an interesting project beyond some other file-system abstractions +Here follows a brief description of some features of note of ``fsspec`` that provides to make +it an interesting project beyond some other file-system abstractions. Serialisability --------------- @@ -35,8 +35,8 @@ The :func:`fsspec.core.OpenFile` class provides a convenient way to prescribe the manner to open some file (local, -remote, in a compressed store, etc.) which is portable, and ca also apply any compression and -text-mode to the file. These instances are also serialisable, because the do not contain any open +remote, in a compressed store, etc.) which is portable, and can also apply any compression and +text-mode to the file. These instances are also serialisable, because they do not contain any open files. The way to work with ``OpenFile`` s is to isolate interaction with in a ``with`` context. It is @@ -61,7 +61,7 @@ performed on the back-end storage. 
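The random-access behaviour described in this passage, buffered seekable file-like objects that fetch only the byte ranges actually touched, looks like this in practice (an editorial sketch; the URL and offsets are placeholders, and the server must report a size and accept range requests):

```python
import fsspec

# Only the ranges read below are requested from the backend; the rest of the
# possibly very large remote file is never downloaded.
with fsspec.open("https://example.com/big-file.bin", "rb") as f:
    f.seek(1_000_000)        # jump into the middle of the file
    chunk = f.read(4096)     # fetches roughly one buffered block around this range
```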
This is also a critical feature in the big-data access model, where each sub-task of an operation -may need on a small part of a file, and does not, therefore want to be forces into downloading the +may need on a small part of a file, and does not, therefore want to be forced into downloading the whole thing. Transparent text-mode and compression @@ -138,7 +138,7 @@ ------------------------ Any path of any file-system can be mapped to a local directory using pyfuse and -:func:`sspec.fuse.run`. This feature is experimental, but basic file listing with +:func:`fsspec.fuse.run`. This feature is experimental, but basic file listing with details, and read/write should generally be available to the extent that the remote file-system provides enough information. Naturally, if a file-system is read-only, then write operations will fail - but they will tend to fail late and with obscure @@ -166,13 +166,35 @@ will get ``close``d, and the data discarded. Only when there is also an unfinalised transaction or captured traceback might this be anticipated becoming a problem. +To disable instance caching, i.e., get a fresh instance which is not in the cache +even for a cachable class, pass ``skip_instance_cache=True``. + +Listings Caching +---------------- + +For some implementations, getting file listings (i.e., ``ls`` and anything that +depends on it) is expensive. These implementations use dict-like instances of +:class:`fsspec.dircache.DirCache` to manage the listings. + +The cache allows for time-based expiry of entries with the ``listings_expiry_time`` +parameter, or LRU expiry with the ``max_paths`` parameter. These can be +set on any implementation instance that uses listings caching; or to skip the +caching altogether, use ``use_listings_cache=False``. That would be appropriate +when the target location is known to be volatile because it is being written +to from other sources. + +When the ``fsspec`` instance writes to the backend, the method ``invalidate_cache`` +is called, so that subsequent listing of the given paths will force a refresh. In +addition, some methods like ``ls`` have a ``refresh`` parameter to force fetching +the listing again. + File Buffering -------------- Most implementations create file objects which derive from ``fsspec.spec.AbstractBufferedFile``, and have many behaviours in common. These files offer buffering of both read and write operations, so that communication with the remote resource is limited. The size of the buffer is generally configured -with the ``blocksize=`` kwargs at p[en time, although the implementation may have some minimum or +with the ``blocksize=`` kwargs at open time, although the implementation may have some minimum or maximum sizes that need to be respected. For reading, a number of buffering schemes are available, listed in ``fsspec.caching.caches`` @@ -185,12 +207,59 @@ with fs.open(path, mode='rb', cache_type='readahead') as f: use_for_something(f) +URL chaining +------------ + +Some implementations proxy or otherwise make use of another filesystem implementation, such +as locally caching remote files, i.e., finding out what files exist using the remote implementation, +but actually opening the local copies upon access. Other examples include reading from a Dask worker +which can see file-systems the client cannot, and accessing a zip file which is being read from +another backend. + +In such cases, you can specify the parameters exactly as specified in the implementation docstrings, +for the dask case something like + +.. 
code-block:: python + + of = fsspec.open('dask://bucket/key', target_protocol='s3', target_options={'anon': True}) + +A shorthand, particularly useful where you have multiple hops, is to "chain" the URLs with +the special separator ``"::"``. The arguments to be passed on to each of the implementations referenced +are keyed by the protocol names included in the URL. Here is the equivalent to the line above: + +.. code-block:: python + + of = fsspec.open('dask::s3://bucket/key', s3={'anon': True}) + +A couple of more complicated cases: + +.. code-block:: python + + of = fsspec.open_files('zip://*.csv::simplecache::gcs://bucket/afile.zip', + simplecache={'cache_storage': '/stored/zip/files'}, + gcs={'project': 'my-project'}) + +reads a zip-file from Google, stores it locally, and gives access to the contained CSV files. Conversely, + +.. code-block:: python + + of = fsspec.open_files('simplecache::zip://*.csv::gcs://bucket/afile.zip', + simplecache={'cache_storage': '/stored/csv/files'}, + gcs={'project': 'my-project'}) + +reads the same zip-file, but extracts the CSV files and stores them locally in the cache. + +**For developers**: this "chaining" method works by formatting the arguments passed to ``open_*`` +into ``target_protocol`` (a simple string) and ``target_options`` (a dict) and also optionally +``fo`` (target path, if a specific file is required). In order for an implementation to chain +successfully like this, it must look for exactly those named arguments. + + Caching Files Locally --------------------- ``fsspec`` allows you to access data on remote file systems, that is its purpose. However, such access can often be rather slow compared to local storage, so as well as buffering (see above), the -option exists to cp[y files locally when you first access them, and thereafter to use the local data. +option exists to copy files locally when you first access them, and thereafter to use the local data. This local cache of data might be temporary (i.e., attached to the process and discarded when the process ends) or at some specific location in your local storage. @@ -208,6 +277,15 @@ you can also set policies to have cached files expire after some time, or to check the remote file system on each open, to see if the target file has changed since it was copied. +With the top-level functions ``open``, ``open_local`` and ``open_files``, you can use the +same set of kwargs as the example above, or you can chain the URL - the following would +be the equivalent: + +.. code-block:: python + + of = fsspec.open("filecache::s3://bucket/key", + s3={'anon': True}, filecache={'cache_storage': '/tmp/files'}) + +With the "blockcache" variant, data is downloaded block-wise: only the specific parts of the remote file which are accessed. This means that the local copy of the file might end up being much smaller than the remote one, if only certain parts of it are required. @@ -216,4 +294,63 @@ libraries to use, "blockcache" has restrictions: that you have a storage/OS combination which supports sparse files, that the backend implementation uses files which derive ``from AbstractBufferedFile``, and that the library you pass the resultant object to accepts generic python file-like objects. You -should not mix block- and file-caches in the same directory. +should not mix block- and file-caches in the same directory.
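To make the developer note above concrete, the sketch below shows, under stated assumptions, roughly how a chained caching URL maps onto the explicit ``target_protocol``/``target_options`` form. The bucket and key names are placeholders carried over from the examples above, and the S3 backend requires ``s3fs`` to be installed; this illustrates the argument rewriting rather than adding anything to the API.

.. code-block:: python

    import fsspec

    # chained form: per-protocol kwargs are keyed by the protocol name
    of = fsspec.open(
        "filecache::s3://bucket/key",
        s3={"anon": True},
        filecache={"cache_storage": "/tmp/files"},
    )

    # roughly the explicit form the chaining machinery builds: the caching
    # layer receives the inner protocol and its options as target_protocol
    # and target_options
    fs = fsspec.filesystem(
        "filecache",
        target_protocol="s3",
        target_options={"anon": True},
        cache_storage="/tmp/files",
    )
    with fs.open("bucket/key", "rb") as f:
        data = f.read()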
"simplecache" is the same as "filecache", +except without the options for cache expiry and to check the original source - it can be used where the +target can be considered static, and particularly where a large number of target files are expected +(because no metadata is written to disc). Only "simplecache" is guaranteed thread/process-safe. + +Remote Write Caching +-------------------- + +You can cache files to local files to send to remote using the "simplecache" protocol. +The following example demonstrates how this might look + +.. code-block:: python + + with fsspec.open('simplecache::s3://mybucket/myfile', 'wb', + s3={"profile": "writer"}) as f: + f.write(b"some data") + +This will open a local file for writing, and when this file is closed, it will be uploaded +to the target URL, in this case on S3. The file-like object ``f`` can be passed to any +library expecting to write to a file. Note that we pass parameters to ``S3FileSystem`` using +the key ``"s3"``, the same as the name of the protocol. + +File Selector +------------- + +The module ``fsspec.gui`` contains a graphical file selector interface. It is built +using `panel`_, which must be installed in order to use the GUI. Upon instantiation, +you can provide the initial URL location (which can be returned to with the "🏠" button), +arguments and filters. + +.. _panel: https://panel.holoviz.org/ + +.. image:: img/gui.png + +Clicking on a directory will descend into it, and selecting a file will mark it as +the output of the interface. You can select any of the known protocols, but should +provide any required arguments in the "kwargs" box (as a dictionary) and any +absolute URL location before clicking "⇨" to go to that location. If using file filters, +they will appear as a list of checkboxes; only those file-endings selected will be +shown (or if none are selected, all files are shown). + +The interface provides the following outputs: + +- ``.urlpath``: the currently selected item (if any) +- ``.storage_options``: the value of the kwargs box +- ``.fs``: the current filesystem instance +- ``.open_file()``: produces an ``OpenFile`` instance for the current selection + +Async +===== + +Some implementations, those deriving from ``fsspec.asyn.AsyncFileSystem``, have +async/coroutine implementations of some file operations. The async methods have +names beginning with ``_``, and listed in the ``asyn`` module; synchronous or +blocking functions are automatically generated, which will operate via an +event loop in another thread, by default. + +Async methods allow for concurrent +execution of certain batch operations such as ``get``, ``rm`` and ``cat`` even when +called via the blocking API. Binary files /tmp/tmp5H3Zbi/kCTwYG6tlS/fsspec-0.6.1/docs/source/img/gui.png and /tmp/tmp5H3Zbi/FHSafMsljv/fsspec-0.8.4/docs/source/img/gui.png differ diff -Nru fsspec-0.6.1/docs/source/index.rst fsspec-0.8.4/docs/source/index.rst --- fsspec-0.6.1/docs/source/index.rst 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/docs/source/index.rst 2020-10-14 16:51:19.000000000 +0000 @@ -1,7 +1,7 @@ -fsspec's: python filesystem interfaces +FSSPEC: Filesystem interfaces for Python ====================================== -Filesystem Spec is a project to unify various projects and classes to work with remote filesystems and +Filesystem Spec (FSSPEC) is a project to unify various projects and classes to work with remote filesystems and file-system-like abstractions using a standard pythonic interface. 
@@ -31,6 +31,11 @@ pip install fsspec +Not all included filesystems are usable by default without installing extra +dependencies. For example to be able to access data in S3:: + + pip install fsspec[s3] + or conda install -c conda-forge fsspec @@ -62,6 +67,7 @@ features.rst api.rst changelog.rst + developer.rst Indices and tables diff -Nru fsspec-0.6.1/docs/source/usage.rst fsspec-0.8.4/docs/source/usage.rst --- fsspec-0.6.1/docs/source/usage.rst 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/docs/source/usage.rst 2020-10-14 16:51:19.000000000 +0000 @@ -55,7 +55,9 @@ .. code-block:: python - with fs.open('https://raw.githubusercontent.com/dask/' + import fsspec + import pandas as pd + with fsspec.open('https://raw.githubusercontent.com/dask/' 'fastparquet/master/test-data/nation.csv') as f: df = pd.read_csv(f, sep='|', header=None) diff -Nru fsspec-0.6.1/fsspec/asyn.py fsspec-0.8.4/fsspec/asyn.py --- fsspec-0.6.1/fsspec/asyn.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/asyn.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,319 @@ +import asyncio +import functools +import inspect +import re +import os +import sys +import threading + +from .utils import other_paths, is_exception +from .spec import AbstractFileSystem + +# this global variable holds whether this thread is running async or not +thread_state = threading.local() +private = re.compile("_[^_]") + + +def _run_until_done(coro): + """execute coroutine, when already in the event loop""" + if sys.version_info < (3, 7): # pragma: no cover + raise RuntimeError( + "async file systems do not work completely on py<37. " + "The nested call currently underway cannot be processed. " + "Please downgrade your fsspec or upgrade python." + ) + loop = asyncio.get_event_loop() + task = asyncio.current_task() + asyncio.tasks._unregister_task(task) + del asyncio.tasks._current_tasks[loop] + runner = loop.create_task(coro) + while not runner.done(): + loop._run_once() + asyncio.tasks._current_tasks[loop] = task + return runner.result() + + +def sync(loop, func, *args, callback_timeout=None, **kwargs): + """ + Run coroutine in loop running in separate thread. + """ + + e = threading.Event() + main_tid = threading.get_ident() + result = [None] + error = [False] + + async def f(): + try: + if main_tid == threading.get_ident(): + raise RuntimeError("sync() called from thread of running loop") + await asyncio.sleep(0) + thread_state.asynchronous = True + future = func(*args, **kwargs) + if callback_timeout is not None: + future = asyncio.wait_for(future, callback_timeout) + result[0] = await future + except Exception: + error[0] = sys.exc_info() + finally: + thread_state.asynchronous = False + e.set() + + asyncio.run_coroutine_threadsafe(f(), loop=loop) + if callback_timeout is not None: + if not e.wait(callback_timeout): + raise TimeoutError("timed out after %s s." % (callback_timeout,)) + else: + while not e.is_set(): + e.wait(10) + if error[0]: + typ, exc, tb = error[0] + raise exc.with_traceback(tb) + else: + return result[0] + + +def maybe_sync(func, self, *args, **kwargs): + """Make function call into coroutine or maybe run + + If we are running async, run coroutine on current loop until done; + otherwise runs it on the loop (if is a coroutine already) or directly. Will guess + we are running async if either "self" has an attribute asynchronous which is True, + or thread_state does (this gets set in ``sync()`` itself, to avoid nesting loops). 
+ """ + loop = self.loop + # second condition below triggers if this is running in the thread of the + # event loop *during* the call to sync(), i.e., while running + # asynchronously + if getattr(self, "asynchronous", False) or getattr( + thread_state, "asynchronous", False + ): + if inspect.iscoroutinefunction(func): + # run coroutine while pausing this one (because we are within async) + return _run_until_done(func(*args, **kwargs)) + else: + # make awaitable which then calls the blocking function + return _run_as_coroutine(func, *args, **kwargs) + else: + if inspect.iscoroutinefunction(func): + # run the awaitable on the loop + return sync(loop, func, *args, **kwargs) + else: + # just call the blocking function + return func(*args, **kwargs) + + +async def _run_as_coroutine(func, *args, **kwargs): + # This is not currently used + return func(*args, **kwargs) + + +def sync_wrapper(func, obj=None): + """Given a function, make so can be called in async or blocking contexts + + Leave obj=None if defining within a class. Pass the instance if attaching + as an attribute of the instance. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + self = obj or args[0] + return maybe_sync(func, self, *args, **kwargs) + + return wrapper + + +def async_wrapper(func): + """Run a sync function on the event loop""" + + @functools.wraps(func) + async def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +def get_loop(): + """Create a running loop in another thread""" + loop = asyncio.new_event_loop() + t = threading.Thread(target=loop.run_forever) + t.daemon = True + t.start() + return loop + + +# these methods should be implemented as async by any async-able backend +async_methods = [ + "_ls", + "_cat_file", + "_get_file", + "_put_file", + "_rm_file", + "_cp_file", + "_pipe_file", +] +# these methods could be overridden, but have default sync versions which rely on _ls +# the sync methods below all call expand_path, which in turn may call walk or glob +# (if passed paths with glob characters, or for recursive=True, respectively) +default_async_methods = [ + "_expand_path", + "_info", + "_isfile", + "_isdir", + "_exists", + "_walk", + "_glob", + "_find", + "_du", +] + + +class AsyncFileSystem(AbstractFileSystem): + """Async file operations, default implementations + + Passes bulk operations to asyncio.gather for concurrent operation. + + Implementations that have concurrent batch operations and/or async methods + should inherit from this class instead of AbstractFileSystem. Docstrings are + copied from the un-underscored method in AbstractFileSystem, if not given. + """ + + # note that methods do not have docstring here; they will be copied + # for _* methods and inferred for overridden methods. 
+ + async_impl = True + + def __init__(self, *args, asynchronous=False, loop=None, **kwargs): + self.asynchronous = asynchronous + self.loop = loop or get_loop() + super().__init__(*args, **kwargs) + + async def _rm(self, path, recursive=False, **kwargs): + await asyncio.gather(*[self._rm_file(p, **kwargs) for p in path]) + + def rm(self, path, recursive=False, **kwargs): + path = self.expand_path(path, recursive=recursive) + maybe_sync(self._rm, self, path, **kwargs) + + async def _copy(self, paths, path2, **kwargs): + await asyncio.gather( + *[self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths, path2)] + ) + + def copy(self, path1, path2, recursive=False, **kwargs): + paths = self.expand_path(path1, recursive=recursive) + path2 = other_paths(paths, path2) + maybe_sync(self._copy, self, paths, path2) + + async def _pipe(self, path, value=None, **kwargs): + if isinstance(path, str): + path = {path: value} + await asyncio.gather( + *[self._pipe_file(k, v, **kwargs) for k, v in path.items()] + ) + + async def _cat(self, paths, **kwargs): + return await asyncio.gather( + *[ + asyncio.ensure_future(self._cat_file(path, **kwargs), loop=self.loop) + for path in paths + ], + return_exceptions=True + ) + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + paths = self.expand_path(path, recursive=recursive) + out = maybe_sync(self._cat, self, paths, **kwargs) + if on_error == "raise": + ex = next(filter(is_exception, out), False) + if ex: + raise ex + if ( + len(paths) > 1 + or isinstance(path, list) + or paths[0] != self._strip_protocol(path) + ): + return { + k: v + for k, v in zip(paths, out) + if on_error != "omit" or not is_exception(v) + } + else: + return out[0] + + async def _put(self, lpaths, rpaths, **kwargs): + return await asyncio.gather( + *[ + self._put_file(lpath, rpath, **kwargs) + for lpath, rpath in zip(lpaths, rpaths) + ] + ) + + def put(self, lpath, rpath, recursive=False, **kwargs): + from .implementations.local import make_path_posix, LocalFileSystem + + rpath = self._strip_protocol(rpath) + if isinstance(lpath, str): + lpath = make_path_posix(lpath) + fs = LocalFileSystem() + lpaths = fs.expand_path(lpath, recursive=recursive) + rpaths = other_paths(lpaths, rpath) + maybe_sync(self._put, self, lpaths, rpaths, **kwargs) + + async def _get(self, rpaths, lpaths, **kwargs): + dirs = [os.path.dirname(lp) for lp in lpaths] + [os.makedirs(d, exist_ok=True) for d in dirs] + return await asyncio.gather( + *[ + self._get_file(rpath, lpath, **kwargs) + for lpath, rpath in zip(lpaths, rpaths) + ] + ) + + def get(self, rpath, lpath, recursive=False, **kwargs): + from fsspec.implementations.local import make_path_posix + + rpath = self._strip_protocol(rpath) + lpath = make_path_posix(lpath) + rpaths = self.expand_path(rpath, recursive=recursive) + lpaths = other_paths(rpaths, lpath) + [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths] + return sync(self.loop, self._get, rpaths, lpaths) + + +def mirror_sync_methods(obj): + """Populate sync and async methods for obj + + For each method will create a sync version if the name refers to an async method + (coroutine) and there is no override in the child class; will create an async + method for the corresponding sync method if there is no implementation. 
+ + Uses the methods specified in + - async_methods: the set that an implementation is expected to provide + - default_async_methods: that can be derived from their sync version in + AbstractFileSystem + - AsyncFileSystem: async-specific default coroutines + """ + from fsspec import AbstractFileSystem + + for method in async_methods + default_async_methods + dir(AsyncFileSystem): + if not method.startswith("_"): + continue + smethod = method[1:] + if private.match(method): + isco = inspect.iscoroutinefunction(getattr(obj, method, None)) + unsync = getattr(getattr(obj, smethod, False), "__func__", None) + is_default = unsync is getattr(AbstractFileSystem, smethod, "") + if isco and is_default: + mth = sync_wrapper(getattr(obj, method), obj=obj) + setattr(obj, smethod, mth) + if not mth.__doc__: + mth.__doc__ = getattr( + getattr(AbstractFileSystem, smethod, None), "__doc__", "" + ) + elif ( + hasattr(obj, smethod) + and inspect.ismethod(getattr(obj, smethod)) + and not hasattr(obj, method) + ): + setattr(obj, method, async_wrapper(getattr(obj, smethod))) diff -Nru fsspec-0.6.1/fsspec/caching.py fsspec-0.8.4/fsspec/caching.py --- fsspec-0.6.1/fsspec/caching.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/caching.py 2020-10-14 16:51:19.000000000 +0000 @@ -28,34 +28,14 @@ self.fetcher = fetcher self.size = size - def _fetch(self, start, end): - return self.fetcher(start, end) - - def __getitem__(self, item: slice): - if not isinstance(item, slice): - raise TypeError( - "Cache indices must be a contiguous slice. Got {} instead.".format( - type(item) - ) - ) - if item.step and item.step != 1: - raise ValueError( - "Cache indices must be a contiguous slice. 'item' has step={}".format( - item.step - ) - ) - - # handle endpoints - if item.start is None: - item = slice(0, item.stop) - elif item.start < 0: - item = slice(self.size + item.start, item.stop) - if item.stop is None: - item = slice(item.start, self.size) - elif item.stop < 0: - item = slice(item.start, self.size + item.stop) - - return self._fetch(item.start, item.stop) + def _fetch(self, start, stop): + if start is None: + start = 0 + if stop is None: + stop = self.size + if start >= self.size or start >= stop: + return b"" + return self.fetcher(start, stop) class MMapCache(BaseCache): @@ -96,6 +76,12 @@ return mmap.mmap(fd.fileno(), self.size) def _fetch(self, start, end): + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" start_block = start // self.blocksize end_block = end // self.blocksize need = [i for i in range(start_block, end_block + 1) if i not in self.blocks] @@ -123,7 +109,7 @@ class ReadAheadCache(BaseCache): - """ Cache which reads only when we get beyond a block of data + """Cache which reads only when we get beyond a block of data This is a much simpler version of BytesCache, and does not attempt to fill holes in the cache or keep fragments alive. 
It is best suited to @@ -137,11 +123,14 @@ self.end = 0 def _fetch(self, start, end): - end = min(self.size, end) - l = end - start - if start >= self.size: + if start is None: + start = 0 + if end is None or end > self.size: + end = self.size + if start >= self.size or start >= end: return b"" - elif start >= self.start and end <= self.end: + l = end - start + if start >= self.start and end <= self.end: # cache hit return self.cache[start - self.start : end - self.start] elif self.start <= start < self.end: @@ -216,13 +205,12 @@ ) def _fetch(self, start, end): - if end < start: - raise ValueError( - "'end' ({}) is smaller than 'start' ({}).".format(end, start) - ) - - if end > self.size: - raise ValueError("'end={}' larger than size ('{}')".format(end, self.size)) + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" # byte position -> block numbers start_block_number = start // self.blocksize @@ -230,7 +218,7 @@ # these are cached, so safe to do multiple calls for the same start and end. for block_number in range(start_block_number, end_block_number + 1): - self._fetch_block(block_number) + self._fetch_block_cached(block_number) return self._read_cache( start, @@ -314,6 +302,12 @@ def _fetch(self, start, end): # TODO: only set start/end after fetch, in case it fails? # is this where retry logic might go? + if start is None: + start = 0 + if end is None: + end = self.size + if start >= self.size or start >= end: + return b"" if ( self.start is not None and start >= self.start @@ -370,6 +364,16 @@ return len(self.cache) +class AllBytes(object): + """Cache entire contents of the file""" + + def __init__(self, data): + self.data = data + + def _fetch(self, start, end): + return self.data[start:end] + + caches = { "none": BaseCache, "mmap": MMapCache, diff -Nru fsspec-0.6.1/fsspec/compression.py fsspec-0.8.4/fsspec/compression.py --- fsspec-0.6.1/fsspec/compression.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/compression.py 2020-10-14 16:51:19.000000000 +0000 @@ -11,6 +11,7 @@ return file +# TODO: files should also be available as contexts # should be functions of the form func(infile, mode=, **kwargs) -> file-like compr = {None: noop_file} @@ -72,10 +73,10 @@ register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz") try: - import lzma + from lzma import LZMAFile - register_compression("lzma", lzma.LZMAFile, "xz") - register_compression("xz", lzma.LZMAFile, "xz", force=True) + register_compression("lzma", LZMAFile, "xz") + register_compression("xz", LZMAFile, "xz", force=True) except ImportError: pass diff -Nru fsspec-0.6.1/fsspec/core.py fsspec-0.8.4/fsspec/core.py --- fsspec-0.6.1/fsspec/core.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/core.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,8 +1,10 @@ from __future__ import print_function, division, absolute_import import io -import os +from glob import has_magic import logging +import os +import re from .compression import compr from .utils import ( infer_compression, @@ -10,7 +12,7 @@ update_storage_options, stringify_path, ) -from .registry import get_filesystem_class +from .registry import get_filesystem_class, filesystem # for backwards compat, we export cache things from here too from .caching import ( # noqa: F401 @@ -83,6 +85,7 @@ self.compression, self.encoding, self.errors, + self.newline, ), ) @@ -90,7 +93,8 @@ return "".format(self.path) def __fspath__(self): - return self.path + # may raise if cannot be 
resolved to local file + return self.open().__fspath__() def __enter__(self): mode = self.mode.replace("t", "").replace("b", "") + "b" @@ -117,23 +121,93 @@ self.close() def __del__(self): - self.close() + self.fobjects.clear() # may cause cleanup of objects and close files def open(self): """Materialise this as a real open file without context - The file should be explicitly closed to avoid enclosed open file - instances persisting + The file should be explicitly closed to avoid enclosed file + instances persisting. This code-path monkey-patches the file-like + objects, so they can close even if the parent OpenFile object has already + been deleted; but a with-context is better style. """ - return self.__enter__() + out = self.__enter__() + closer = out.close + fobjects = self.fobjects.copy()[:-1] + mode = self.mode + + def close(): + # this func has no reference to + closer() # original close bound method of the final file-like + _close(fobjects, mode) # call close on other dependent file-likes + + out.close = close + return out def close(self): """Close all encapsulated file objects""" - for f in reversed(self.fobjects): - if "r" not in self.mode and not f.closed: - f.flush() - f.close() - self.fobjects = [] + _close(self.fobjects, self.mode) + + +class OpenFiles(list): + """List of OpenFile instances + + Can be used in a single context, which opens and closes all of the + contained files. Normal list access to get the elements works as + normal. + + A special case is made for caching filesystems - the files will + be down/uploaded together at the start or end of the context, and + this may happen concurrently, if the target filesystem supports it. + """ + + def __init__(self, *args, mode="rb", fs=None): + self.mode = mode + self.fs = fs + self.files = [] + super().__init__(*args) + + def __enter__(self): + if self.fs is None: + raise ValueError("Context has already been used") + + fs = self.fs + while True: + if hasattr(fs, "open_many"): + # check for concurrent cache download; or set up for upload + self.files = fs.open_many(self) + return self.files + if hasattr(fs, "fs") and fs.fs is not None: + fs = fs.fs + else: + break + return [s.__enter__() for s in self] + + def __exit__(self, *args): + fs = self.fs + if "r" not in self.mode: + while True: + if hasattr(fs, "open_many"): + # check for concurrent cache upload + fs.commit_many(self.files) + self.files.clear() + return + if hasattr(fs, "fs") and fs.fs is not None: + fs = fs.fs + else: + break + [s.__exit__(*args) for s in self] + + def __repr__(self): + return "" % len(self) + + +def _close(fobjects, mode): + for f in reversed(fobjects): + if "r" not in mode and not f.closed: + f.flush() + f.close() + fobjects.clear() def open_files( @@ -146,9 +220,11 @@ num=1, protocol=None, newline=None, + auto_mkdir=True, + expand=True, **kwargs ): - """ Given a path or paths, return a list of ``OpenFile`` objects. + """Given a path or paths, return a list of ``OpenFile`` objects. For writing, a str path must contain the "*" character, which will be filled in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2. @@ -181,6 +257,10 @@ newline: bytes or None Used for line terminator in text mode. If None, uses system default; if blank, uses no translation. + auto_mkdir: bool (True) + If in write mode, this will ensure the target directory exists before + writing, by calling ``fs.mkdirs(exist_ok=True)``. + expand: bool **kwargs: dict Extra options that make sense to a particular storage connection, e.g. 
host, port, username, password, etc. @@ -194,7 +274,8 @@ Returns ------- - List of ``OpenFile`` objects. + An ``OpenFiles`` instance, which is a ist of ``OpenFile`` objects that can + be used as a single context """ fs, fs_token, paths = get_fs_token_paths( urlpath, @@ -203,19 +284,98 @@ name_function=name_function, storage_options=kwargs, protocol=protocol, + expand=expand, ) - return [ - OpenFile( - fs, - path, - mode=mode, - compression=compression, - encoding=encoding, - errors=errors, - newline=newline, - ) - for path in paths - ] + if "r" not in mode and auto_mkdir: + parents = {fs._parent(path) for path in paths} + [fs.makedirs(parent, exist_ok=True) for parent in parents] + return OpenFiles( + [ + OpenFile( + fs, + path, + mode=mode, + compression=compression, + encoding=encoding, + errors=errors, + newline=newline, + ) + for path in paths + ], + mode=mode, + fs=fs, + ) + + +def _un_chain(path, kwargs): + if isinstance(path, (tuple, list)): + bits = [_un_chain(p, kwargs) for p in path] + out = [] + for pbit in zip(*bits): + paths, protocols, kwargs = zip(*pbit) + if len(set(protocols)) > 1: + raise ValueError("Protocol mismatch in URL chain") + if len(set(paths)) == 1: + paths = paths[0] + else: + paths = list(paths) + out.append([paths, protocols[0], kwargs[0]]) + return out + x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word + bits = ( + [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")] + if "::" in path + else [path] + ) + if len(bits) < 2: + return [] + # [[url, protocol, kwargs], ...] + out = [] + previous_bit = None + previous_protocol = None + for bit in reversed(bits): + protocol = split_protocol(bit)[0] or "file" + cls = get_filesystem_class(protocol) + extra_kwargs = cls._get_kwargs_from_urls(bit) + kws = kwargs.get(split_protocol(bit)[0] or "file", {}) + kw = dict(**extra_kwargs, **kws) + if ( + protocol in {"blockcache", "filecache", "simplecache"} + and "target_protocol" not in kw + ): + bit = previous_bit.replace(previous_protocol, protocol, 1) + out.append((bit, protocol, kw)) + previous_bit = bit + previous_protocol = protocol + out = list(reversed(out)) + # We should only do the url rewrite if the cache is in the middle of the chain + if out[0][1] in {"blockcache", "filecache", "simplecache"}: + out[0] = (f"{out[0][1]}://", out[0][1], out[0][2]) + return out + + +def url_to_fs(url, **kwargs): + """Turn fully-qualified and potentially chained URL into filesystem instance""" + chain = _un_chain(url, kwargs) + if len(chain) > 1: + kwargs = chain[0][2] + inkwargs = kwargs + for i, ch in enumerate(chain): + urls, protocol, kw = ch + if i == 0: + continue + inkwargs["target_protocol"] = protocol + inkwargs["target_options"] = kw.copy() + inkwargs["fo"] = urls + inkwargs = inkwargs["target_options"] + protocol = chain[0][1] + urlpath = chain[-1][1] + "://" + split_protocol(urls)[1] + fs = filesystem(protocol, **kwargs) + else: + protocol, urlpath = split_protocol(url) + fs = filesystem(protocol, **kwargs) + urlpath = fs._strip_protocol(url) + return fs, urlpath def open( @@ -228,7 +388,7 @@ newline=None, **kwargs ): - """ Given a path or paths, return one ``OpenFile`` object. + """Given a path or paths, return one ``OpenFile`` object. 
Parameters ---------- @@ -274,10 +434,40 @@ errors, protocol, newline=newline, + expand=False, **kwargs )[0] +def open_local(url, mode="rb", **storage_options): + """Open file(s) which can be resolved to local + + For files which either are local, or get downloaded upon open + (e.g., by file caching) + + Parameters + ---------- + url: str or list(str) + mode: str + Must be read mode + storage_options: + passed on to FS for or used by open_files (e.g., compression) + """ + if "r" not in mode: + raise ValueError("Can only ensure local files when reading") + of = open_files(url, mode=mode, **storage_options) + if not getattr(of[0].fs, "local_file", False): + raise ValueError( + "open_local can only be used on a filesystem which" + " has attribute local_file=True" + ) + with of as files: + paths = [f.name for f in files] + if isinstance(url, str) and not has_magic(url): + return paths[0] + return paths + + def get_compression(urlpath, compression): if compression == "infer": compression = infer_compression(urlpath) @@ -342,7 +532,13 @@ def get_fs_token_paths( - urlpath, mode="rb", num=1, name_function=None, storage_options=None, protocol=None + urlpath, + mode="rb", + num=1, + name_function=None, + storage_options=None, + protocol=None, + expand=True, ): """Filesystem, deterministic token, and paths from a urlpath and options. @@ -363,17 +559,44 @@ Additional keywords to pass to the filesystem class. protocol: str or None To override the protocol specifier in the URL + expand: bool + Expand string paths for writing, assuming the path is a directory """ + if isinstance(urlpath, (list, tuple, set)): + urlpath = [stringify_path(u) for u in urlpath] + else: + urlpath = stringify_path(urlpath) + chain = _un_chain(urlpath, storage_options or {}) + if len(chain) > 1: + storage_options = chain[0][2] + inkwargs = storage_options + urlpath = False + for i, ch in enumerate(chain): + urls, protocol, kw = ch + if isinstance(urls, str): + if not urlpath and split_protocol(urls)[1]: + urlpath = protocol + "://" + split_protocol(urls)[1] + else: + if not urlpath and any(split_protocol(u)[1] for u in urls): + urlpath = [protocol + "://" + split_protocol(u)[1] for u in urls] + if i == 0: + continue + inkwargs["target_protocol"] = protocol + inkwargs["target_options"] = kw.copy() + inkwargs["fo"] = urls + inkwargs = inkwargs["target_options"] + protocol = chain[0][1] if isinstance(urlpath, (list, tuple)): if not urlpath: raise ValueError("empty urlpath sequence") protocols, paths = zip(*map(split_protocol, urlpath)) - protocol = protocol or protocols[0] - if not all(p == protocol for p in protocols): - raise ValueError( - "When specifying a list of paths, all paths must " - "share the same protocol" - ) + if protocol is None: + protocol = protocols[0] + if not all(p == protocol for p in protocols): + raise ValueError( + "When specifying a list of paths, all paths must " + "share the same protocol" + ) cls = get_filesystem_class(protocol) optionss = list(map(cls._get_kwargs_from_urls, urlpath)) paths = [cls._strip_protocol(u) for u in urlpath] @@ -397,10 +620,10 @@ update_storage_options(options, storage_options) fs = cls(**options) - if "w" in mode: + if "w" in mode and expand: paths = _expand_paths(path, name_function, num) elif "*" in path: - paths = sorted(fs.glob(path)) + paths = [f for f in sorted(fs.glob(path)) if not fs.isdir(f)] else: paths = [path] diff -Nru fsspec-0.6.1/fsspec/dircache.py fsspec-0.8.4/fsspec/dircache.py --- fsspec-0.6.1/fsspec/dircache.py 1970-01-01 00:00:00.000000000 +0000 +++ 
fsspec-0.8.4/fsspec/dircache.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,96 @@ +from functools import lru_cache +import time +from collections.abc import MutableMapping + + +class DirCache(MutableMapping): + """ + Caching of directory listings, in a structure like + + {"path0": [ + {"name": "path0/file0", + "size": 123, + "type": "file", + ... + }, + {"name": "path0/file1", + }, + ... + ], + "path1": [...] + } + + Parameters to this class control listing expiry or indeed turn + caching off + """ + + def __init__( + self, + use_listings_cache=True, + listings_expiry_time=None, + max_paths=None, + **kwargs + ): + """ + + Parameters + ---------- + use_listings_cache: bool + If False, this cache never returns items, but always reports KeyError, + and setting items has no effect + listings_expiry_time: int (optional) + Time in seconds that a listing is considered valid. If None, + listings do not expire. + max_paths: int (optional) + The number of most recent listings that are considered valid; 'recent' + refers to when the entry was set. + """ + self._cache = {} + self._times = {} + if max_paths: + self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None)) + self.use_listings_cache = use_listings_cache + self.listings_expiry_time = listings_expiry_time + self.max_paths = max_paths + + def __getitem__(self, item): + if self.listings_expiry_time: + if self._times.get(item, 0) - time.time() < -self.listings_expiry_time: + del self._cache[item] + if self.max_paths: + self._q(item) + return self._cache[item] # maybe raises KeyError + + def clear(self): + self._cache.clear() + + def __len__(self): + return len(self._cache) + + def __contains__(self, item): + try: + self[item] + return True + except KeyError: + return False + + def __setitem__(self, key, value): + if not self.use_listings_cache: + return + if self.max_paths: + self._q(key) + self._cache[key] = value + if self.listings_expiry_time: + self._times[key] = time.time() + + def __delitem__(self, key): + del self._cache[key] + + def __iter__(self): + return (k for k in self._cache if k in self) + + def __reduce__(self): + return ( + DirCache, + (self.use_listings_cache, self.listings_expiry_time, self.max_paths), + ) diff -Nru fsspec-0.6.1/fsspec/fuse.py fsspec-0.8.4/fsspec/fuse.py --- fsspec-0.6.1/fsspec/fuse.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/fuse.py 2020-10-14 16:51:19.000000000 +0000 @@ -113,7 +113,7 @@ def run(fs, path, mount_point, foreground=True, threads=False): - """ Mount stuff in a local directory + """Mount stuff in a local directory This uses fusepy to make it appear as if a given path on an fsspec instance is in fact resident within the local file-system. 
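A usage sketch for the FUSE integration documented here, matching the ``run(fs, path, mount_point, ...)`` signature shown above: it assumes ``fusepy`` is installed and that an empty local directory ``./mnt`` already exists, and uses the in-memory filesystem purely as a stand-in for any fsspec implementation.

.. code-block:: python

    import fsspec
    from fsspec.fuse import run

    fs = fsspec.filesystem("memory")
    with fs.open("/demo.txt", "wb") as f:
        f.write(b"hello from fsspec")

    # foreground=False runs the FUSE loop in a daemon thread, so the Python
    # session stays usable; ./mnt/demo.txt then appears as an ordinary file
    run(fs, "/", "./mnt", foreground=False)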
@@ -143,7 +143,7 @@ """ func = lambda: FUSE( - FUSEr(fs, path), mount_point, nothreads=not threads, foreground=True + FUSEr(fs, path), mount_point, nothreads=not threads, foreground=foreground ) if foreground is False: th = threading.Thread(target=func) diff -Nru fsspec-0.6.1/fsspec/gui.py fsspec-0.8.4/fsspec/gui.py --- fsspec-0.6.1/fsspec/gui.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/gui.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,406 @@ +import contextlib +import panel as pn +import os +import ast +import logging +import re +from .registry import known_implementations +from .core import split_protocol, get_filesystem_class, OpenFile + +pn.extension() +logger = logging.getLogger("fsspec.gui") + + +class SigSlot(object): + """Signal-slot mixin, for Panel event passing + + Include this class in a widget manager's superclasses to be able to + register events and callbacks on Panel widgets managed by that class. + + The method ``_register`` should be called as widgets are added, and external + code should call ``connect`` to associate callbacks. + + By default, all signals emit a DEBUG logging statement. + """ + + signals = [] # names of signals that this class may emit + # each of which must be set by _register for any new instance + slots = [] # names of actions that this class may respond to + + # each of which must be a method name + + def __init__(self): + self._ignoring_events = False + self._sigs = {} + self._map = {} + self._setup() + + def _setup(self): + """Create GUI elements and register signals""" + self.panel = pn.pane.PaneBase() + # no signals to set up in the base class + + def _register( + self, widget, name, thing="value", log_level=logging.DEBUG, auto=False + ): + """Watch the given attribute of a widget and assign it a named event + + This is normally called at the time a widget is instantiated, in the + class which owns it. + + Parameters + ---------- + widget : pn.layout.Panel or None + Widget to watch. If None, an anonymous signal not associated with + any widget. + name : str + Name of this event + thing : str + Attribute of the given widget to watch + log_level : int + When the signal is triggered, a logging event of the given level + will be fired in the dfviz logger. + auto : bool + If True, automatically connects with a method in this class of the + same name. + """ + if name not in self.signals: + raise ValueError("Attempt to assign an undeclared signal: %s" % name) + self._sigs[name] = { + "widget": widget, + "callbacks": [], + "thing": thing, + "log": log_level, + } + wn = "-".join( + [ + getattr(widget, "name", str(widget)) if widget is not None else "none", + thing, + ] + ) + self._map[wn] = name + if widget is not None: + widget.param.watch(self._signal, thing, onlychanged=True) + if auto and hasattr(self, name): + self.connect(name, getattr(self, name)) + + def _repr_mimebundle_(self, *args, **kwargs): + """Display in a notebook or a server""" + try: + return self.panel._repr_mimebundle_(*args, **kwargs) + except (ValueError, AttributeError): + raise NotImplementedError("Panel does not seem to be set " "up properly") + + def connect(self, signal, slot): + """Associate call back with given event + + The callback must be a function which takes the "new" value of the + watched attribute as the only parameter. If the callback return False, + this cancels any further processing of the given event. 
+ + Alternatively, the callback can be a string, in which case it means + emitting the correspondingly-named event (i.e., connect to self) + """ + self._sigs[signal]["callbacks"].append(slot) + + def _signal(self, event): + """This is called by a an action on a widget + + Within an self.ignore_events context, nothing happens. + + Tests can execute this method by directly changing the values of + widget components. + """ + if not self._ignoring_events: + wn = "-".join([event.obj.name, event.name]) + if wn in self._map and self._map[wn] in self._sigs: + self._emit(self._map[wn], event.new) + + @contextlib.contextmanager + def ignore_events(self): + """Temporarily turn off events processing in this instance + + (does not propagate to children) + """ + self._ignoring_events = True + try: + yield + finally: + self._ignoring_events = False + + def _emit(self, sig, value=None): + """An event happened, call its callbacks + + This method can be used in tests to simulate message passing without + directly changing visual elements. + + Calling of callbacks will halt whenever one returns False. + """ + logger.log(self._sigs[sig]["log"], "{}: {}".format(sig, value)) + for callback in self._sigs[sig]["callbacks"]: + if isinstance(callback, str): + self._emit(callback) + else: + try: + # running callbacks should not break the interface + ret = callback(value) + if ret is False: + break + except Exception as e: + logger.exception( + "Exception (%s) while executing callback for signal: %s" + "" % (e, sig) + ) + + def show(self, threads=False): + """Open a new browser tab and display this instance's interface""" + self.panel.show(threads=threads, verbose=False) + return self + + +class SingleSelect(SigSlot): + """A multiselect which only allows you to select one item for an event""" + + signals = ["_selected", "selected"] # the first is internal + slots = ["set_options", "set_selection", "add", "clear", "select"] + + def __init__(self, **kwargs): + self.kwargs = kwargs + super().__init__() + + def _setup(self): + self.panel = pn.widgets.MultiSelect(**self.kwargs) + self._register(self.panel, "_selected", "value") + self._register(None, "selected") + self.connect("_selected", self.select_one) + + def _signal(self, *args, **kwargs): + super()._signal(*args, **kwargs) + + def select_one(self, *_): + with self.ignore_events(): + val = [self.panel.value[-1]] if self.panel.value else [] + self.panel.value = val + self._emit("selected", self.panel.value) + + def set_options(self, options): + self.panel.options = options + + def clear(self): + self.panel.options = [] + + @property + def value(self): + return self.panel.value + + def set_selection(self, selection): + self.panel.value = [selection] + + +class FileSelector(SigSlot): + """Panel-based graphical file selector widget + + Instances of this widget are interactive and can be displayed in jupyter by having + them as the output of a cell, or in a separate browser tab using ``.show()``. + """ + + signals = [ + "protocol_changed", + "selection_changed", + "directory_entered", + "home_clicked", + "up_clicked", + "go_clicked", + "filters_changed", + ] + slots = ["set_filters", "go_home"] + + def __init__(self, url=None, filters=None, ignore=None, kwargs=None): + """ + + Parameters + ---------- + url : str (optional) + Initial value of the URL to populate the dialog; should include protocol + filters : list(str) (optional) + File endings to include in the listings. If not included, all files are + allowed. Does not affect directories. 
+ If given, the endings will appear as checkboxes in the interface + ignore : list(str) (optional) + Regex(s) of file basename patterns to ignore, e.g., "\\." for typical + hidden files on posix + kwargs : dict (optional) + To pass to file system instance + """ + if url: + self.init_protocol, url = split_protocol(url) + else: + self.init_protocol, url = "file", os.getcwd() + self.init_url = url + self.init_kwargs = kwargs or "{}" + self.filters = filters + self.ignore = [re.compile(i) for i in ignore or []] + self._fs = None + super().__init__() + + def _setup(self): + self.url = pn.widgets.TextInput( + name="url", + value=self.init_url, + align="end", + sizing_mode="stretch_width", + width_policy="max", + ) + self.protocol = pn.widgets.Select( + options=list(sorted(known_implementations)), + value=self.init_protocol, + name="protocol", + align="center", + ) + self.kwargs = pn.widgets.TextInput(name="kwargs", value="{}", align="center") + self.go = pn.widgets.Button(name="⇨", align="end", width=45) + self.main = SingleSelect(size=10) + self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end") + self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end") + + self._register(self.protocol, "protocol_changed", auto=True) + self._register(self.go, "go_clicked", "clicks", auto=True) + self._register(self.up, "up_clicked", "clicks", auto=True) + self._register(self.home, "home_clicked", "clicks", auto=True) + self._register(None, "selection_changed") + self.main.connect("selected", self.selection_changed) + self._register(None, "directory_entered") + self.prev_protocol = self.protocol.value + self.prev_kwargs = self.storage_options + + self.filter_sel = pn.widgets.CheckBoxGroup( + value=[], options=[], inline=False, align="end", width_policy="min" + ) + self._register(self.filter_sel, "filters_changed", auto=True) + + self.panel = pn.Column( + pn.Row(self.protocol, self.kwargs), + pn.Row(self.home, self.up, self.url, self.go, self.filter_sel), + self.main.panel, + ) + self.set_filters(self.filters) + self.go_clicked() + + def set_filters(self, filters=None): + self.filters = filters + if filters: + self.filter_sel.options = filters + self.filter_sel.value = filters + else: + self.filter_sel.options = [] + self.filter_sel.value = [] + + @property + def storage_options(self): + """Value of the kwargs box as a dictionary""" + return ast.literal_eval(self.kwargs.value) or {} + + @property + def fs(self): + """Current filesystem instance""" + if self._fs is None: + cls = get_filesystem_class(self.protocol.value) + self._fs = cls(**self.storage_options) + return self._fs + + @property + def urlpath(self): + """URL of currently selected item""" + return ( + (self.protocol.value + "://" + self.main.value[0]) + if self.main.value + else None + ) + + def open_file(self, mode="rb", compression=None, encoding=None): + """Create OpenFile instance for the currently selected item + + For example, in a notebook you might do something like + + .. code-block:: + + [ ]: sel = FileSelector(); sel + + # user selects their file + + [ ]: with sel.open_file('rb') as f: + ... out = f.read() + + Parameters + ---------- + mode: str (optional) + Open mode for the file. + compression: str (optional) + The interact with the file as compressed. Set to 'infer' to guess + compression from the file ending + encoding: str (optional) + If using text mode, use this encoding; defaults to UTF8. 
+ """ + if self.urlpath is None: + raise ValueError("No file selected") + return OpenFile(self.fs, self.urlpath, mode, compression, encoding) + + def filters_changed(self, values): + self.filters = values + self.go_clicked() + + def selection_changed(self, *_): + if self.urlpath is None: + return + if self.fs.isdir(self.urlpath): + self.url.value = self.fs._strip_protocol(self.urlpath) + self.go_clicked() + + def go_clicked(self, *_): + if ( + self.prev_protocol != self.protocol.value + or self.prev_kwargs != self.storage_options + ): + self._fs = None # causes fs to be recreated + self.prev_protocol = self.protocol.value + self.prev_kwargs = self.storage_options + listing = sorted( + self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"] + ) + listing = [ + l + for l in listing + if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore) + ] + folders = { + "📁 " + o["name"].rsplit("/", 1)[-1]: o["name"] + for o in listing + if o["type"] == "directory" + } + files = { + "📄 " + o["name"].rsplit("/", 1)[-1]: o["name"] + for o in listing + if o["type"] == "file" + } + if self.filters: + files = { + k: v + for k, v in files.items() + if any(v.endswith(ext) for ext in self.filters) + } + self.main.set_options(dict(**folders, **files)) + + def protocol_changed(self, *_): + self._fs = None + self.main.options = [] + self.url.value = "" + + def home_clicked(self, *_): + self.protocol.value = self.init_protocol + self.kwargs.value = self.init_kwargs + self.url.value = self.init_url + self.go_clicked() + + def up_clicked(self, *_): + self.url.value = self.fs._parent(self.url.value) + self.go_clicked() diff -Nru fsspec-0.6.1/fsspec/implementations/cached.py fsspec-0.8.4/fsspec/implementations/cached.py --- fsspec-0.6.1/fsspec/implementations/cached.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/cached.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,13 +1,17 @@ -import time import pickle import logging import os import hashlib +from shutil import move, rmtree import tempfile +import time import inspect + from fsspec import AbstractFileSystem, filesystem from fsspec.spec import AbstractBufferedFile from fsspec.core import MMapCache, BaseCache +from fsspec.utils import infer_compression +from fsspec.compression import compr logger = logging.getLogger("fsspec") @@ -41,14 +45,17 @@ check_files=False, expiry_time=604800, target_options=None, + fs=None, + same_names=False, + compression=None, **kwargs ): """ Parameters ---------- - target_protocol: str - Target fielsystem protocol + target_protocol: str (optional) + Target filesystem protocol. Provide either this or ``fs``. cache_storage: str or list(str) Location to store files. If "TMP", this is a temporary directory, and will be cleaned up by the OS when this process ends (or later). @@ -67,10 +74,23 @@ week. target_options: dict or None Passed to the instantiation of the FS, if fs is None. + fs: filesystem instance + The target filesystem to run against. Provide this or ``protocol``. + same_names: bool (optional) + By default, target URLs are hashed, so that files from different backends + with the same basename do not conflict. If this is true, the original + basename is used. + compression: str (optional) + To decompress on download. Can be 'infer' (guess from the URL name), + one of the entries in ``fsspec.compression.compr``, or None for no + decompression. 
""" - if self._cached: - return super().__init__(**kwargs) + if not (fs is None) ^ (target_protocol is None): + raise ValueError( + "Please provide one of filesystem instance (fs) or" + " remote_protocol, not both" + ) if cache_storage == "TMP": storage = [tempfile.mkdtemp()] else: @@ -84,26 +104,26 @@ self.cache_check = cache_check self.check_files = check_files self.expiry = expiry_time + self.compression = compression + # TODO: same_names should allow for variable prefix, not only + # to keep the basename + self.same_names = same_names + self.target_protocol = ( + target_protocol + if isinstance(target_protocol, str) + else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]) + ) self.load_cache() - if isinstance(target_protocol, AbstractFileSystem): - self.fs = target_protocol - self.protocol = self.fs.protocol - else: - self.protocol = target_protocol - self.fs = filesystem(target_protocol, **self.kwargs) + self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs) - def __reduce_ex__(self, *_): - return ( - self.__class__, - ( - self.protocol, - self.storage, - self.cache_check, - self.check_files, - self.expiry, - self.kwargs or None, - ), - ) + def _strip_protocol(path): + # acts as a method, since each instance has a difference target + return self.fs._strip_protocol(type(self)._strip_protocol(path)) + + self._strip_protocol = _strip_protocol + + def _mkcache(self): + os.makedirs(self.storage[-1], exist_ok=True) def load_cache(self): """Read set of stored blocks from file""" @@ -113,10 +133,14 @@ if os.path.exists(fn): with open(fn, "rb") as f: # TODO: consolidate blocks here - cached_files.append(pickle.load(f)) + loaded_cached_files = pickle.load(f) + for c in loaded_cached_files.values(): + if isinstance(c["blocks"], list): + c["blocks"] = set(c["blocks"]) + cached_files.append(loaded_cached_files) else: - os.makedirs(storage, exist_ok=True) cached_files.append({}) + self._mkcache() self.cached_files = cached_files or [{}] self.last_cache = time.time() @@ -135,20 +159,26 @@ c["blocks"] = True else: c["blocks"] = set(c["blocks"]).union(cache[k]["blocks"]) + + # Files can be added to cache after it was written once + for k, c in cache.items(): + if k not in cached_files: + cached_files[k] = c else: cached_files = cache cache = {k: v.copy() for k, v in cached_files.items()} for c in cache.values(): if isinstance(c["blocks"], set): c["blocks"] = list(c["blocks"]) - with open(fn + ".temp", "wb") as f: + fn2 = tempfile.mktemp() + with open(fn2, "wb") as f: pickle.dump(cache, f) - if os.path.exists(fn): - os.remove(fn) - os.rename(fn + ".temp", fn) + self._mkcache() + move(fn2, fn) def _check_cache(self): """Reload caches if time elapsed or any disappeared""" + self._mkcache() if not self.cache_check: # explicitly told not to bother checking return @@ -159,6 +189,8 @@ def _check_file(self, path): """Is path in cache and still valid""" + path = self._strip_protocol(path) + self._check_cache() for storage, cache in zip(self.storage, self.cached_files): if path not in cache: @@ -168,12 +200,42 @@ if detail["uid"] != self.fs.ukey(path): continue if self.expiry: - if detail["time"] - time.time() > self.expiry: + if time.time() - detail["time"] > self.expiry: continue fn = os.path.join(storage, detail["fn"]) if os.path.exists(fn): return detail, fn - return False, None + return False + + def clear_cache(self): + """Remove all files and metadat from the cache + + In the case of multiple cache locations, this clears only the last one, + which is assumed to be 
the read/write one. + """ + rmtree(self.storage[-1]) + self.load_cache() + + def pop_from_cache(self, path): + """Remove cached version of given file + + Deletes local copy of the given (remote) path. If it is found in a cache + location which is not the last, it is assumed to be read-only, and + raises PermissionError + """ + path = self._strip_protocol(path) + _, fn = self._check_file(path) + if fn is None: + return + if fn.startswith(self.storage[-1]): + # is in in writable cache + os.remove(fn) + self.cached_files[-1].pop(path) + self.save_cache() + else: + raise PermissionError( + "Can only delete cached file in last, writable cache location" + ) def _open( self, @@ -197,9 +259,9 @@ ``close_and_update`` to save the state of the blocks. """ path = self._strip_protocol(path) - if not path.startswith(self.protocol): - path = self.protocol + "://" + path - if mode != "rb": + + path = self.fs._strip_protocol(path) + if "r" not in mode: return self.fs._open( path, mode=mode, @@ -208,18 +270,19 @@ cache_options=cache_options, **kwargs ) - detail, fn = self._check_file(path) + detail = self._check_file(path) if detail: # file is in cache + detail, fn = detail hash, blocks = detail["fn"], detail["blocks"] if blocks is True: # stored file is complete logger.debug("Opening local copy of %s" % path) - return open(fn, "rb") + return open(fn, mode) # TODO: action where partial file exists in read-only cache logger.debug("Opening partially cached copy of %s" % path) else: - hash = hashlib.sha256(path.encode()).hexdigest() + hash = self.hash_name(path, self.same_names) fn = os.path.join(self.storage[-1], hash) blocks = set() detail = { @@ -232,6 +295,7 @@ logger.debug("Creating local sparse file for %s" % path) # call target filesystems open + self._mkcache() f = self.fs._open( path, mode=mode, @@ -241,6 +305,13 @@ cache_type=None, **kwargs ) + if self.compression: + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") if "blocksize" in detail: if detail["blocksize"] != f.blocksize: raise ValueError( @@ -253,14 +324,18 @@ f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks) close = f.close f.close = lambda: self.close_and_update(f, close) + self.save_cache() return f + def hash_name(self, path, same_name): + return hash_name(path, same_name=same_name) + def close_and_update(self, f, close): """Called when a file is closing, so store the set of blocks""" - if f.path.startswith(self.protocol): - path = f.path - else: - path = self.protocol + "://" + f.path + if f.closed: + return + path = self._strip_protocol(f.path) + c = self.cached_files[-1][path] if c["blocks"] is not True and len(["blocks"]) * f.blocksize >= f.size: c["blocks"] = True @@ -275,19 +350,35 @@ "close_and_update", "__init__", "__getattribute__", - "__reduce_ex__", + "__reduce__", + "_make_local_details", "open", "cat", + "cat_file", "get", "read_block", "tail", "head", "_check_file", "_check_cache", + "_mkcache", + "clear_cache", + "pop_from_cache", + "_mkcache", + "local_file", + "_paths_from_path", + "open_many", + "commit_many", + "hash_name", ]: # all the methods defined in this class. 
Note `open` here, since # it calls `_open`, but is actually in superclass return lambda *args, **kw: getattr(type(self), item)(self, *args, **kw) + if item in ["__reduce_ex__"]: + raise AttributeError + if item in ["_cache"]: + # class attributes + return getattr(type(self), item) if item == "__class__": return type(self) d = object.__getattribute__(self, "__dict__") @@ -327,53 +418,266 @@ """ protocol = "filecache" + local_file = True + + def open_many(self, open_files): + paths = [of.path for of in open_files] + if "r" in open_files.mode: + self._mkcache() + else: + return [ + LocalTempFile(self.fs, path, mode=open_files.mode, autocommit=False) + for path in paths + ] + + if self.compression: + raise NotImplementedError + details = [self._check_file(sp) for sp in paths] + downpath = [p for p, d in zip(paths, details) if not d] + downfn0 = [ + os.path.join(self.storage[-1], self.hash_name(p, self.same_names)) + for p, d in zip(paths, details) + ] # keep these path names for opening later + downfn = [fn for fn, d in zip(downfn0, details) if not d] + if downpath: + # skip if all files are already cached and up to date + self.fs.get(downpath, downfn) + + # update metadata - only happens when downloads are successful + newdetail = [ + { + "fn": self.hash_name(path, self.same_names), + "blocks": True, + "time": time.time(), + "uid": self.fs.ukey(path), + } + for path in downpath + ] + self.cached_files[-1].update( + {path: detail for path, detail in zip(downpath, newdetail)} + ) + self.save_cache() + + def firstpart(fn): + # helper to adapt both whole-file and simple-cache + return fn[1] if isinstance(fn, tuple) else fn + + return [ + open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode) + for fn0, fn1 in zip(details, downfn0) + ] + + def commit_many(self, open_files): + self.fs.put([f.fn for f in open_files], [f.path for f in open_files]) + + def _make_local_details(self, path): + hash = self.hash_name(path, self.same_names) + fn = os.path.join(self.storage[-1], hash) + detail = { + "fn": hash, + "blocks": True, + "time": time.time(), + "uid": self.fs.ukey(path), + } + self.cached_files[-1][path] = detail + logger.debug("Copying %s to local cache" % path) + return fn + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + paths = self.expand_path( + path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None) + ) + getpaths = [] + storepaths = [] + fns = [] + for p in paths: + detail = self._check_file(p) + if not detail: + fn = self._make_local_details(p) + getpaths.append(p) + storepaths.append(fn) + else: + detail, fn = detail if isinstance(detail, tuple) else (None, detail) + fns.append(fn) + if getpaths: + self.fs.get(getpaths, storepaths) + self.save_cache() + out = {path: open(fn, "rb").read() for path, fn in zip(paths, fns)} + if isinstance(path, str) and len(paths) == 1 and recursive is False: + out = out[paths[0]] + return out def _open(self, path, mode="rb", **kwargs): path = self._strip_protocol(path) - if not path.startswith(self.protocol): - path = self.protocol + "://" + path - if mode != "rb": + if "r" not in mode: return self.fs._open(path, mode=mode, **kwargs) - detail, fn = self._check_file(path) + detail = self._check_file(path) if detail: - hash, blocks = detail["fn"], detail["blocks"] + detail, fn = detail + _, blocks = detail["fn"], detail["blocks"] if blocks is True: logger.debug("Opening local copy of %s" % path) - return open(fn, "rb") + return open(fn, mode) else: raise ValueError( "Attempt to open partially cached file %s" "as a wholly 
cached file" % path ) else: - hash = hashlib.sha256(path.encode()).hexdigest() - fn = os.path.join(self.storage[-1], hash) - blocks = True - detail = { - "fn": hash, - "blocks": blocks, - "time": time.time(), - "uid": self.fs.ukey(path), - } - self.cached_files[-1][path] = detail - logger.debug("Copying %s to local cache" % path) + fn = self._make_local_details(path) kwargs["mode"] = mode # call target filesystems open - # TODO: why not just use fs.get ?? - f = self.fs._open(path, **kwargs) - with open(fn, "wb") as f2: - if isinstance(f, AbstractBufferedFile): - # want no type of caching if just downloading whole thing - f.cache = BaseCache(0, f.cache.fetcher, f.size) - if getattr(f, "blocksize", 0) and f.size: - # opportunity to parallelise here + self._mkcache() + if self.compression: + with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2: + if isinstance(f, AbstractBufferedFile): + # want no type of caching if just downloading whole thing + f.cache = BaseCache(0, f.cache.fetcher, f.size) + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") data = True while data: - data = f.read(f.blocksize) + block = getattr(f, "blocksize", 5 * 2 ** 20) + data = f.read(block) f2.write(data) - else: - # this only applies to HTTP, should instead use streaming - f2.write(f.read()) + else: + self.fs.get(path, fn) self.save_cache() return self._open(path, mode) + + +class SimpleCacheFileSystem(WholeFileCacheFileSystem): + """Caches whole remote files on first access + + This class is intended as a layer over any other file system, and + will make a local copy of each file accessed, so that all subsequent + reads are local. This implementation only copies whole files, and + does not keep any metadata about the download time or file details. + It is therefore safer to use in multi-threaded/concurrent situations. + + This is the only of the caching filesystems that supports write: you will + be given a real local open file, and upon close and commit, it will be + uploaded to the target filesystem; the writability or the target URL is + not checked until that time. 
+ + """ + + protocol = "simplecache" + local_file = True + + def __init__(self, **kwargs): + kw = kwargs.copy() + for key in ["cache_check", "expiry_time", "check_files"]: + kw[key] = False + super().__init__(**kw) + for storage in self.storage: + if not os.path.exists(storage): + os.makedirs(storage, exist_ok=True) + self.cached_files = [{}] + + def _check_file(self, path): + self._check_cache() + sha = self.hash_name(path, self.same_names) + for storage in self.storage: + fn = os.path.join(storage, sha) + if os.path.exists(fn): + return fn + + def save_cache(self): + pass + + def load_cache(self): + pass + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + + if "r" not in mode: + return LocalTempFile(self, path, mode=mode) + fn = self._check_file(path) + if fn: + return open(fn, mode) + + sha = self.hash_name(path, self.same_names) + fn = os.path.join(self.storage[-1], sha) + logger.debug("Copying %s to local cache" % path) + kwargs["mode"] = mode + + self._mkcache() + if self.compression: + with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2: + if isinstance(f, AbstractBufferedFile): + # want no type of caching if just downloading whole thing + f.cache = BaseCache(0, f.cache.fetcher, f.size) + comp = ( + infer_compression(path) + if self.compression == "infer" + else self.compression + ) + f = compr[comp](f, mode="rb") + data = True + while data: + block = getattr(f, "blocksize", 5 * 2 ** 20) + data = f.read(block) + f2.write(data) + else: + self.fs.get(path, fn) + return self._open(path, mode) + + +class LocalTempFile: + """A temporary local file, which will be uploaded on commit""" + + def __init__(self, fs, path, fn=None, mode="wb", autocommit=True, seek=0): + fn = fn or tempfile.mktemp() + self.mode = mode + self.fn = fn + self.fh = open(fn, mode) + if seek: + self.fh.seek(seek) + self.path = path + self.fs = fs + self.closed = False + self.autocommit = autocommit + + def __reduce__(self): + # always open in rb+ to allow continuing writing at a location + return ( + LocalTempFile, + (self.fs, self.path, self.fn, "rb+", self.autocommit, self.tell()), + ) + + def __enter__(self): + return self.fh + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def close(self): + self.fh.close() + self.closed = True + if self.autocommit: + self.commit() + + def discard(self): + self.fh.close() + os.remove(self.fn) + + def commit(self): + self.fs.put(self.fn, self.path) + + def __getattr__(self, item): + return getattr(self.fh, item) + + +def hash_name(path, same_name): + if same_name: + hash = os.path.basename(path) + else: + hash = hashlib.sha256(path.encode()).hexdigest() + return hash diff -Nru fsspec-0.6.1/fsspec/implementations/dask.py fsspec-0.8.4/fsspec/implementations/dask.py --- fsspec-0.6.1/fsspec/implementations/dask.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/dask.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,14 +1,19 @@ from distributed.worker import get_worker -from distributed.client import _get_global_client +from distributed.client import _get_global_client, Client import dask from fsspec.spec import AbstractFileSystem, AbstractBufferedFile from fsspec import filesystem +from fsspec.utils import infer_storage_options -def make_instance(cls, args, kwargs): - inst = cls(*args, **kwargs) - inst._determine_worker() - return inst +def _get_client(client): + if client is None: + return _get_global_client() + elif isinstance(client, Client): + return client + else: + # e.g., connection string + 
return Client(client) class DaskWorkerFileSystem(AbstractFileSystem): @@ -20,28 +25,43 @@ **Warning** this implementation is experimental, and read-only for now. """ - def __init__(self, remote_protocol, remote_options=None, **kwargs): + def __init__( + self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs + ): super().__init__(**kwargs) - self.protocol = remote_protocol - self.remote_options = remote_options + if not (fs is None) ^ (target_protocol is None): + raise ValueError( + "Please provide one of filesystem instance (fs) or" + " target_protocol, not both" + ) + self.target_protocol = target_protocol + self.target_options = target_options self.worker = None - self.client = None - self.fs = None # What is the type here? + self.client = client + self.fs = fs self._determine_worker() + @staticmethod + def _get_kwargs_from_urls(path): + so = infer_storage_options(path) + if "host" in so and "port" in so: + return {"client": f"{so['host']}:{so['port']}"} + else: + return {} + def _determine_worker(self): try: get_worker() self.worker = True - self.fs = filesystem(self.protocol, **(self.remote_options or {})) + if self.fs is None: + self.fs = filesystem( + self.target_protocol, **(self.target_options or {}) + ) except ValueError: self.worker = False - self.client = _get_global_client() + self.client = _get_client(self.client) self.rfs = dask.delayed(self) - def __reduce__(self): - return make_instance, (type(self), self.storage_args, self.storage_options) - def mkdir(self, *args, **kwargs): if self.worker: self.fs.mkdir(*args, **kwargs) @@ -92,9 +112,9 @@ ) else: return DaskFile( - self, - path, - mode, + fs=self, + path=path, + mode=mode, block_size=block_size, autocommit=autocommit, cache_options=cache_options, @@ -111,6 +131,11 @@ class DaskFile(AbstractBufferedFile): + def __init__(self, mode="rb", **kwargs): + if mode != "rb": + raise ValueError('Remote dask files can only be opened in "rb" mode') + super().__init__(**kwargs) + def _upload_chunk(self, final=False): pass diff -Nru fsspec-0.6.1/fsspec/implementations/dvc.py fsspec-0.8.4/fsspec/implementations/dvc.py --- fsspec-0.6.1/fsspec/implementations/dvc.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/dvc.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,74 @@ +import os +from fsspec.spec import AbstractFileSystem +from fsspec.implementations.local import LocalFileSystem +import dvc.repo + +lfs = LocalFileSystem() + + +class DVCFileSystem(AbstractFileSystem): + """DVC backend (experimental) + + Load data files that are versioned using the `Data Version Control`_ system + + .. _Data Version Control: https://dvc.org/ + + This interface is incomplete and experimental. + """ + + root_marker = "" + + def __init__(self, path=None, **kwargs): + """ + + Parameters + ---------- + path: str (optional) + Location of the repo to access; defaults to the current directory. 
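A hedged illustration of constructing this experimental backend directly; the repo location and the tracked file name are placeholders, and the ``dvc`` package must be installed:

    from fsspec.implementations.dvc import DVCFileSystem

    fs = DVCFileSystem(path="/path/to/dvc/repo")   # placeholder repo location
    print(fs.ls(""))                               # entries tracked at the repo root
    with fs.open("data/file.csv") as f:            # placeholder tracked file
        payload = f.read()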
+ """ + super().__init__(**kwargs) + self.repo = dvc.repo.Repo(path) + self.path = self.repo.find_root() + + @classmethod + def _strip_protocol(cls, path): + return super()._strip_protocol(path).lstrip("/") + + def ls(self, path, detail=False, **kwargs): + path = self._strip_protocol(path) + allfiles = self.repo.tree.walk(os.path.join(self.repo.root_dir, path)) + dirname, dirs, files = next(allfiles) + out = [os.path.join(path, f) for f in dirs + files] + details = [] + + for f in out: + full = os.path.join(self.repo.root_dir, f) + file_info = lfs.info(full) + if lfs.isdir(full): + details.append(file_info) + else: + try: + extra = self.repo.find_out_by_relpath(full).dumpd() + except dvc.exceptions.OutputNotFoundError: + continue + details.append(dict(**extra, **file_info)) + details[-1]["name"] = f + if detail: + return details + return [d["name"] for d in details] + + def ukey(self, path): + return self.info(path)["md5"] + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs + ): + # returns a context file object (i.e., needs to be used with ``with`` + path = self._strip_protocol(path) + return self.repo.open_by_relpath(path) diff -Nru fsspec-0.6.1/fsspec/implementations/ftp.py fsspec-0.8.4/fsspec/implementations/ftp.py --- fsspec-0.6.1/fsspec/implementations/ftp.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/ftp.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,5 +1,4 @@ from ftplib import FTP, Error, error_perm -from socket import timeout import uuid from ..spec import AbstractBufferedFile, AbstractFileSystem from ..utils import infer_storage_options @@ -10,6 +9,7 @@ root_marker = "/" cachable = False + protocol = "ftp" def __init__( self, @@ -75,13 +75,7 @@ out.pop("protocol", None) return out - def invalidate_cache(self, path=None): - if path is not None: - self.dircache.pop(path, None) - else: - self.dircache.clear() - - def ls(self, path, detail=True): + def ls(self, path, detail=True, **kwargs): path = self._strip_protocol(path) out = [] if path not in self.dircache: @@ -103,6 +97,8 @@ details["size"] = int(details["size"]) else: details["size"] = 0 + if details["type"] == "dir": + details["type"] = "directory" self.dircache[path] = out except Error: try: @@ -150,15 +146,17 @@ def _rm(self, path): path = self._strip_protocol(path) self.ftp.delete(path) - self.invalidate_cache(path.rsplit("/", 1)[0]) + self.invalidate_cache(self._parent(path)) def mkdir(self, path, **kwargs): path = self._strip_protocol(path) self.ftp.mkd(path) + self.invalidate_cache(self._parent(path)) def rmdir(self, path): path = self._strip_protocol(path) self.ftp.rmd(path) + self.invalidate_cache(self._parent(path)) def mv(self, path1, path2, **kwargs): path1 = self._strip_protocol(path1) @@ -170,6 +168,13 @@ def __del__(self): self.ftp.close() + def invalidate_cache(self, path=None): + if path is None: + self.dircache.clear() + else: + self.dircache.pop(path, None) + super(FTPFileSystem, self).invalidate_cache(path) + class TransferDone(Exception): """Internal exception to break out of transfer""" @@ -227,11 +232,12 @@ total[0] += len(x) if total[0] > end - start: out.append(x[: (end - start) - total[0]]) - raise TransferDone + if end < self.size: + raise TransferDone else: out.append(x) - if total[0] == end - start: + if total[0] == end - start and end < self.size: raise TransferDone try: @@ -243,10 +249,12 @@ ) except TransferDone: try: + # stop transfer, we got enough bytes for this block self.fs.ftp.abort() - 
self.fs.ftp.voidresp() - except timeout: - self.fs._connect() + self.fs.ftp.getmultiline() + except Error: + self.fs.ftp._connect() + return b"".join(out) def _upload_chunk(self, final=False): diff -Nru fsspec-0.6.1/fsspec/implementations/github.py fsspec-0.8.4/fsspec/implementations/github.py --- fsspec-0.6.1/fsspec/implementations/github.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/github.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,10 +1,11 @@ -import io import requests from ..spec import AbstractFileSystem +from ..utils import infer_storage_options +from .memory import MemoryFile class GithubFileSystem(AbstractFileSystem): - """[Experimental] interface to files in github + """Interface to files in github An instance of this class provides the files residing within a remote github repository. You may specify a point in the repos history, by SHA, branch @@ -12,39 +13,140 @@ Given that code files tend to be small, and that github does not support retrieving partial content, we always fetch whole files. + + When using fsspec.open, allows URIs of the form: + + - "github://path/file", in which case you must specify org, repo and + may specify sha in the extra args + - 'github://org:repo@/precip/catalog.yml', where the org and repo are + part of the URI + - 'github://org:repo@sha/precip/catalog.yml', where tha sha is also included + + ``sha`` can be the full or abbreviated hex of the commit you want to fetch + from, or a branch or tag name (so long as it doesn't contain special characters + like "/", "?", which would have to be HTTP-encoded). + + For authorised access, you must provide username and token, which can be made + at https://github.com/settings/tokens """ url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" protocol = "github" - def __init__(self, org, repo, sha="master", **kwargs): + def __init__(self, org, repo, sha="master", username=None, token=None, **kwargs): super().__init__(**kwargs) self.org = org self.repo = repo self.root = sha + if (username is None) ^ (token is None): + raise ValueError("Auth required both username and token") + self.username = username + self.token = token self.ls("") - def ls(self, path, detail=False, sha=None, **kwargs): + @property + def kw(self): + if self.username: + return {"auth": (self.username, self.token)} + return {} + + @classmethod + def repos(cls, org_or_user, is_org=True): + """List repo names for given org or user + + This may become the top level of the FS + + Parameters + ---------- + org_or_user: str + Nmae of the github org or user to query + is_org: bool (default True) + Whether the name is an organisation (True) or user (False) + + Returns + ------- + List of string + """ + r = requests.get( + "https://api.github.com/{part}/{org}/repos".format( + part=["users", "orgs"][is_org], org=org_or_user + ) + ) + r.raise_for_status() + return [repo["name"] for repo in r.json()] + + @property + def tags(self): + """Names of tags in the repo""" + r = requests.get( + "https://api.github.com/repos/{org}/{repo}/tags" + "".format(org=self.org, repo=self.repo), + **self.kw + ) + r.raise_for_status() + return [t["name"] for t in r.json()] + + @property + def branches(self): + """Names of branches in the repo""" + r = requests.get( + "https://api.github.com/repos/{org}/{repo}/branches" + "".format(org=self.org, repo=self.repo), + **self.kw + ) + r.raise_for_status() + return [t["name"] for t in r.json()] + + @property + def 
refs(self): + """Named references, tags and branches""" + return {"tags": self.tags, "branches": self.branches} + + def ls(self, path, detail=False, sha=None, _sha=None, **kwargs): + """List files at given path + + Parameters + ---------- + path: str + Location to list, relative to repo root + detail: bool + If True, returns list of dicts, one per file; if False, returns + list of full filenames only + sha: str (optional) + List at the given point in the repo history, branch or tag name or commit + SHA + _sha: str (optional) + List this specific tree object (used internally to descend into trees) + """ + path = self._strip_protocol(path) if path == "": - sha = self.root - if sha is None: + _sha = sha or self.root + if _sha is None: parts = path.rstrip("/").split("/") so_far = "" - sha = self.root + _sha = sha or self.root for part in parts: - out = self.ls(so_far, True, sha=sha) + out = self.ls(so_far, True, sha=sha, _sha=_sha) so_far += "/" + part if so_far else part - out = [o for o in out if o["name"] == so_far][0] + out = [o for o in out if o["name"] == so_far] + if not out: + raise FileNotFoundError(path) + out = out[0] if out["type"] == "file": if detail: return [out] else: return path - sha = out["sha"] - if path not in self.dircache: - r = requests.get(self.url.format(org=self.org, repo=self.repo, sha=sha)) - self.dircache[path] = [ + _sha = out["sha"] + if path not in self.dircache or sha not in [self.root, None]: + r = requests.get( + self.url.format(org=self.org, repo=self.repo, sha=_sha), **self.kw + ) + if r.status_code == 404: + raise FileNotFoundError(path) + r.raise_for_status() + out = [ { "name": path + "/" + f["path"] if path else f["path"], "mode": f["mode"], @@ -54,10 +156,34 @@ } for f in r.json()["tree"] ] + if sha in [self.root, None]: + self.dircache[path] = out + else: + out = self.dircache[path] if detail: - return self.dircache[path] + return out else: - return sorted([f["name"] for f in self.dircache[path]]) + return sorted([f["name"] for f in out]) + + def invalidate_cache(self, path=None): + self.dircache.clear() + + @classmethod + def _strip_protocol(cls, path): + opts = infer_storage_options(path) + if "username" not in opts: + return super()._strip_protocol(path) + return opts["path"].lstrip("/") + + @staticmethod + def _get_kwargs_from_urls(path): + opts = infer_storage_options(path) + if "username" not in opts: + return {} + out = {"org": opts["username"], "repo": opts["password"]} + if opts["host"]: + out["sha"] = opts["host"] + return out def _open( self, @@ -66,10 +192,16 @@ block_size=None, autocommit=True, cache_options=None, + sha=None, **kwargs ): if mode != "rb": raise NotImplementedError - url = self.rurl.format(org=self.org, repo=self.repo, path=path, sha=self.root) - r = requests.get(url) - return io.BytesIO(r.content) + url = self.rurl.format( + org=self.org, repo=self.repo, path=path, sha=sha or self.root + ) + r = requests.get(url, **self.kw) + if r.status_code == 404: + raise FileNotFoundError(path) + r.raise_for_status() + return MemoryFile(None, None, r.content) diff -Nru fsspec-0.6.1/fsspec/implementations/git.py fsspec-0.8.4/fsspec/implementations/git.py --- fsspec-0.6.1/fsspec/implementations/git.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/git.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,100 @@ +import pygit2 +from fsspec.spec import AbstractFileSystem +from .memory import MemoryFile +import os + + +class GitFileSystem(AbstractFileSystem): + """Browse the files of a local git repo at any 
hash/tag/branch + + (experimental backend) + """ + + root_marker = "" + + def __init__(self, path=None, ref=None, **kwargs): + """ + + Parameters + ---------- + path: str (optional) + Local location of the repo (uses current directory if not given) + ref: str (optional) + Reference to work with, could be a hash, tag or branch name. Defaults + to current working tree. Note that ``ls`` and ``open`` also take hash, + so this becomes the default for those operations + kwargs + """ + super().__init__(**kwargs) + self.repo = pygit2.Repository(path or os.getcwd()) + self.ref = ref or "master" + + @classmethod + def _strip_protocol(cls, path): + return super()._strip_protocol(path).lstrip("/") + + def _path_to_object(self, path, ref): + comm, ref = self.repo.resolve_refish(ref or self.ref) + parts = path.split("/") + tree = comm.tree + for part in parts: + if part and isinstance(tree, pygit2.Tree): + tree = tree[part] + return tree + + def ls(self, path, detail=True, ref=None, **kwargs): + path = self._strip_protocol(path) + tree = self._path_to_object(path, ref) + if isinstance(tree, pygit2.Tree): + out = [] + for obj in tree: + if isinstance(obj, pygit2.Tree): + out.append( + { + "type": "directory", + "name": "/".join([path, obj.name]).lstrip("/"), + "hex": obj.hex, + "mode": "%o" % obj.filemode, + "size": 0, + } + ) + else: + out.append( + { + "type": "file", + "name": "/".join([path, obj.name]).lstrip("/"), + "hex": obj.hex, + "mode": "%o" % obj.filemode, + "size": obj.size, + } + ) + else: + obj = tree + out = [ + { + "type": "file", + "name": obj.name, + "hex": obj.hex, + "mode": "%o" % obj.filemode, + "size": obj.size, + } + ] + if detail: + return out + return [o["name"] for o in out] + + def ukey(self, path, ref=None): + return self.info(path, ref=ref)["hex"] + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + ref=None, + **kwargs + ): + obj = self._path_to_object(path, ref or self.ref) + return MemoryFile(data=obj.data) diff -Nru fsspec-0.6.1/fsspec/implementations/hdfs.py fsspec-0.8.4/fsspec/implementations/hdfs.py --- fsspec-0.6.1/fsspec/implementations/hdfs.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/hdfs.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,3 +1,4 @@ +import weakref from ..spec import AbstractFileSystem from ..utils import infer_storage_options from pyarrow.hdfs import HadoopFileSystem @@ -10,6 +11,8 @@ passes on all calls to the underlying class. 
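Because this release adds ``protocol = "hdfs"`` to the class, the wrapper is reachable through the usual fsspec entry point. A minimal sketch, assuming pyarrow is installed and a cluster is reachable; host, port and paths are placeholders:

    import fsspec

    fs = fsspec.filesystem("hdfs", host="namenode.example.com", port=8020)
    print(fs.ls("/user/someone"))      # forwarded to pyarrow's HadoopFileSystem
    with fs.open("/user/someone/data.csv", "rb") as f:
        head = f.read(1024)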
""" + protocol = "hdfs" + def __init__( self, host="default", @@ -41,7 +44,7 @@ return AbstractFileSystem.__init__(self, **kwargs) self.pars = (host, port, user, kerb_ticket, driver, extra_conf) - self.pahdfs = HadoopFileSystem( + pahdfs = HadoopFileSystem( host=host, port=port, user=user, @@ -49,6 +52,8 @@ driver=driver, extra_conf=extra_conf, ) + weakref.finalize(self, lambda: pahdfs.close()) + self.pahdfs = pahdfs def _open( self, @@ -102,8 +107,8 @@ return out @staticmethod - def _get_kwargs_from_urls(paths): - ops = infer_storage_options(paths) + def _get_kwargs_from_urls(path): + ops = infer_storage_options(path) out = {} if ops.get("host", None): out["host"] = ops["host"] @@ -113,6 +118,9 @@ out["port"] = ops["port"] return out + def close(self): + self.pahdfs.close() + @classmethod def _strip_protocol(cls, path): ops = infer_storage_options(path) @@ -121,6 +129,7 @@ def __getattribute__(self, item): if item in [ "_open", + "close", "__init__", "__getattribute__", "__reduce_ex__", diff -Nru fsspec-0.6.1/fsspec/implementations/http.py fsspec-0.8.4/fsspec/implementations/http.py --- fsspec-0.6.1/fsspec/implementations/http.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/http.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,18 +1,28 @@ from __future__ import print_function, division, absolute_import +import aiohttp +import asyncio +import logging import re import requests +import weakref from urllib.parse import urlparse -from fsspec import AbstractFileSystem from fsspec.spec import AbstractBufferedFile from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE +from fsspec.asyn import sync_wrapper, sync, AsyncFileSystem +from ..caching import AllBytes # https://stackoverflow.com/a/15926317/3821154 ex = re.compile(r"""]*?\s+)?href=(["'])(.*?)\1""") ex2 = re.compile(r"""(http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""") +logger = logging.getLogger("fsspec.http") -class HTTPFileSystem(AbstractFileSystem): +async def get_client(**kwargs): + return aiohttp.ClientSession(**kwargs) + + +class HTTPFileSystem(AsyncFileSystem): """ Simple File-System for fetching data via HTTP(S) @@ -30,9 +40,16 @@ block_size=None, same_scheme=True, size_policy=None, + cache_type="bytes", + cache_options=None, + asynchronous=False, + loop=None, + client_kwargs=None, **storage_options ): """ + NB: if this is called async, you must await set_client + Parameters ---------- block_size: int @@ -45,54 +62,76 @@ When doing ls/glob, if this is True, only consider paths that have http/https matching the input URLs. 
size_policy: this argument is deprecated + client_kwargs: dict + Passed to aiohttp.ClientSession, see + https://docs.aiohttp.org/en/stable/client_reference.html + For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}`` storage_options: key-value - May be credentials, e.g., `{'auth': ('username', 'pword')}` or any - other parameters passed on to requests + Any other parameters passed on to requests + cache_type, cache_options: defaults used in open """ - AbstractFileSystem.__init__(self) + super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options) self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE self.simple_links = simple_links self.same_schema = same_scheme + self.cache_type = cache_type + self.cache_options = cache_options + self.client_kwargs = client_kwargs or {} self.kwargs = storage_options - self.session = requests.Session() + if not asynchronous: + self._session = sync(self.loop, get_client, **self.client_kwargs) + weakref.finalize(self, sync, self.loop, self.session.close) + else: + self._session = None + + @property + def session(self): + if self._session is None: + raise RuntimeError("please await ``.set_session`` before anything else") + return self._session + + async def set_session(self): + self._session = await get_client(**self.client_kwargs) @classmethod def _strip_protocol(cls, path): - """ For HTTP, we always want to keep the full URL - """ + """For HTTP, we always want to keep the full URL""" return path - # TODO: override get - - def ls(self, url, detail=True): + async def _ls(self, url, detail=True, **kwargs): # ignoring URL-encoded arguments - r = self.session.get(url, **self.kwargs) + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(url) + async with self.session.get(url, **self.kwargs) as r: + r.raise_for_status() + text = await r.text() if self.simple_links: - links = ex2.findall(r.text) + ex.findall(r.text) + links = ex2.findall(text) + ex.findall(text) else: - links = ex.findall(r.text) + links = ex.findall(text) out = set() parts = urlparse(url) for l in links: if isinstance(l, tuple): l = l[1] + if l.startswith("/") and len(l) > 1: + # absolute URL on this server + l = parts.scheme + "://" + parts.netloc + l if l.startswith("http"): - if self.same_schema: - if l.split(":", 1)[0] == url.split(":", 1)[0]: - out.add(l) + if self.same_schema and l.startswith(url.rstrip("/") + "/"): + out.add(l) elif l.replace("https", "http").startswith( - url.replace("https", "http") + url.replace("https", "http").rstrip("/") + "/" ): # allowed to cross http <-> https out.add(l) - elif l.startswith("/") and len(l) > 1: - out.add(parts.scheme + "://" + parts.netloc + l) else: if l not in ["..", "../"]: # Ignore FTP-like "parent" out.add("/".join([url.rstrip("/"), l.lstrip("/")])) if not out and url.endswith("/"): - return self.ls(url.rstrip("/"), detail=True) + return await self._ls(url.rstrip("/"), detail=True) if detail: return [ { @@ -105,31 +144,52 @@ else: return list(sorted(out)) - def cat(self, url): - r = requests.get(url, **self.kwargs) - r.raise_for_status() - return r.content - - def mkdirs(self, url): - """Make any intermediate directories to make path writable""" - raise NotImplementedError + async def _cat_file(self, url, **kwargs): + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(url) + async with self.session.get(url, **kw) as r: + if r.status == 404: + raise FileNotFoundError(url) + r.raise_for_status() + out = await r.read() + return out - def exists(self, path): - kwargs = 
self.kwargs.copy() - kwargs["stream"] = True + async def _get_file(self, rpath, lpath, chunk_size=5 * 2 ** 20, **kwargs): + kw = self.kwargs.copy() + kw.update(kwargs) + logger.debug(rpath) + async with self.session.get(rpath, **self.kwargs) as r: + if r.status == 404: + raise FileNotFoundError(rpath) + r.raise_for_status() + with open(lpath, "wb") as fd: + chunk = True + while chunk: + chunk = await r.content.read(chunk_size) + fd.write(chunk) + + async def _exists(self, path, **kwargs): + kw = self.kwargs.copy() + kw.update(kwargs) try: - r = self.session.get(path, **kwargs) - r.close() - return r.ok - except requests.HTTPError: + logger.debug(path) + r = await self.session.get(path, **kw) + async with r: + return r.status < 400 + except (requests.HTTPError, aiohttp.client_exceptions.ClientError): return False + async def _isfile(self, path, **kwargs): + return await self._exists(path, **kwargs) + def _open( self, path, mode="rb", block_size=None, autocommit=None, # XXX: This differs from the base class. + cache_type=None, cache_options=None, **kwargs ): @@ -151,29 +211,32 @@ raise NotImplementedError block_size = block_size if block_size is not None else self.block_size kw = self.kwargs.copy() - kw.update(kwargs) # this does nothing? - if block_size: + kw["asynchronous"] = self.asynchronous + kw.update(kwargs) + size = self.size(path) + if block_size and size: return HTTPFile( self, path, - self.session, - block_size, + session=self.session, + block_size=block_size, mode=mode, - cache_options=cache_options, + size=size, + cache_type=cache_type or self.cache_type, + cache_options=cache_options or self.cache_options, + loop=self.loop, **kw ) else: - kw["stream"] = True - r = self.session.get(path, **kw) - r.raise_for_status() - r.raw.decode_content = True - return r.raw + return HTTPStreamFile( + self, path, mode=mode, loop=self.loop, session=self.session, **kw + ) def ukey(self, url): """Unique identifier; assume HTTP files are static, unchanging""" return tokenize(url, self.kwargs, self.protocol) - def info(self, url, **kwargs): + async def _info(self, url, **kwargs): """Get info of URL Tries to access location via HEAD, and then GET methods, but does @@ -186,7 +249,9 @@ size = False for policy in ["head", "get"]: try: - size = file_size(url, self.session, policy, **self.kwargs) + size = await _file_size( + url, size_policy=policy, session=self.session, **self.kwargs + ) if size: break except Exception: @@ -197,6 +262,10 @@ raise FileNotFoundError(url) return {"name": url, "size": size or None, "type": "file"} + def isdir(self, path): + # override, since all URLs are (also) files + return bool(self.ls(path)) + class HTTPFile(AbstractBufferedFile): """ @@ -233,14 +302,16 @@ cache_type="bytes", cache_options=None, size=None, + loop=None, + asynchronous=False, **kwargs ): if mode != "rb": raise NotImplementedError("File mode not supported") + self.asynchronous = asynchronous self.url = url - self.session = session if session is not None else requests.Session() - if size is not None: - self.details = {"name": url, "size": size, "type": "file"} + self.session = session + self.details = {"name": url, "size": size, "type": "file"} super().__init__( fs=fs, path=url, @@ -250,7 +321,7 @@ cache_options=cache_options, **kwargs ) - self.cache.size = self.size or self.blocksize + self.loop = loop def read(self, length=-1): """Read bytes from file @@ -277,20 +348,23 @@ length = min(self.size - self.loc, length) return super().read(length) - def _fetch_all(self): + async def async_fetch_all(self): 
"""Read whole file in one shot, without caching This is only called when position is still at zero, and read() is called without a byte-count. """ if not isinstance(self.cache, AllBytes): - r = self.session.get(self.url, **self.kwargs) - r.raise_for_status() - out = r.content - self.cache = AllBytes(out) - self.size = len(out) + r = await self.session.get(self.url, **self.kwargs) + async with r: + r.raise_for_status() + out = await r.read() + self.cache = AllBytes(out) + self.size = len(out) + + _fetch_all = sync_wrapper(async_fetch_all) - def _fetch_range(self, start, end): + async def async_fetch_range(self, start, end): """Download a block of data The expectation is that the server returns only the requested bytes, @@ -299,45 +373,103 @@ requested, an exception is raised. """ kwargs = self.kwargs.copy() - headers = kwargs.pop("headers", {}) + headers = kwargs.pop("headers", {}).copy() headers["Range"] = "bytes=%i-%i" % (start, end - 1) - r = self.session.get(self.url, headers=headers, stream=True, **kwargs) - if r.status_code == 416: - # range request outside file - return b"" - r.raise_for_status() - if r.status_code == 206: - # partial content, as expected - out = r.content - elif "Content-Length" in r.headers: - cl = int(r.headers["Content-Length"]) - if cl <= end - start: - # data size OK - out = r.content - else: - raise ValueError( - "Got more bytes (%i) than requested (%i)" % (cl, end - start) - ) - else: - cl = 0 - out = [] - for chunk in r.iter_content(chunk_size=2 ** 20): - # data size unknown, let's see if it goes too big - if chunk: - out.append(chunk) - cl += len(chunk) - if cl > end - start: - raise ValueError( - "Got more bytes so far (>%i) than requested (%i)" - % (cl, end - start) - ) + logger.debug(self.url + " : " + headers["Range"]) + r = await self.session.get(self.url, headers=headers, **kwargs) + async with r: + if r.status == 416: + # range request outside file + return b"" + r.raise_for_status() + if r.status == 206: + # partial content, as expected + out = await r.read() + elif "Content-Length" in r.headers: + cl = int(r.headers["Content-Length"]) + if cl <= end - start: + # data size OK + out = await r.read() else: - break - out = b"".join(out) + raise ValueError( + "Got more bytes (%i) than requested (%i)" % (cl, end - start) + ) + else: + cl = 0 + out = [] + while True: + chunk = await r.content.read(2 ** 20) + # data size unknown, let's see if it goes too big + if chunk: + out.append(chunk) + cl += len(chunk) + if cl > end - start: + raise ValueError( + "Got more bytes so far (>%i) than requested (%i)" + % (cl, end - start) + ) + else: + break + out = b"".join(out) + return out + + _fetch_range = sync_wrapper(async_fetch_range) + + def close(self): + pass + + +async def get(session, url, **kwargs): + return await session.get(url, **kwargs) + + +class HTTPStreamFile(AbstractBufferedFile): + def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs): + self.asynchronous = kwargs.pop("asynchronous", False) + self.url = url + self.loop = loop + self.session = session + if mode != "rb": + raise ValueError + self.details = {"name": url, "size": None} + super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs) + self.r = sync(self.loop, get, self.session, url, **kwargs) + + def seek(self, *args, **kwargs): + raise ValueError("Cannot seek strteaming HTTP file") + + async def _read(self, num=-1): + out = await self.r.content.read(num) + self.loc += len(out) return out + read = sync_wrapper(_read) + + async def _close(self): + 
self.r.close() + + def close(self): + asyncio.run_coroutine_threadsafe(self._close(), self.loop) -def file_size(url, session=None, size_policy="head", **kwargs): + +async def get_range(session, url, start, end, file=None, **kwargs): + # explicit get a range when we know it must be safe + kwargs = kwargs.copy() + headers = kwargs.pop("headers", {}).copy() + headers["Range"] = "bytes=%i-%i" % (start, end - 1) + r = await session.get(url, headers=headers, **kwargs) + r.raise_for_status() + async with r: + out = await r.read() + if file: + with open(file, "rb+") as f: + f.seek(start) + f.write(out) + else: + return out + + +async def _file_size(url, session=None, size_policy="head", **kwargs): """Call HEAD on the server to get file size Default operation is to explicitly allow redirects and use encoding @@ -347,25 +479,18 @@ ar = kwargs.pop("allow_redirects", True) head = kwargs.get("headers", {}).copy() head["Accept-Encoding"] = "identity" - session = session or requests.Session() + session = session or await get_client() if size_policy == "head": - r = session.head(url, allow_redirects=ar, **kwargs) + r = await session.head(url, allow_redirects=ar, **kwargs) elif size_policy == "get": - kwargs["stream"] = True - r = session.get(url, allow_redirects=ar, **kwargs) + r = await session.get(url, allow_redirects=ar, **kwargs) else: raise TypeError('size_policy must be "head" or "get", got %s' "" % size_policy) - if "Content-Length" in r.headers: - return int(r.headers["Content-Length"]) - elif "Content-Range" in r.headers: - return int(r.headers["Content-Range"].split("/")[1]) - - -class AllBytes(object): - """Cache entire contents of a remote URL""" + async with r: + if "Content-Length" in r.headers: + return int(r.headers["Content-Length"]) + elif "Content-Range" in r.headers: + return int(r.headers["Content-Range"].split("/")[1]) - def __init__(self, data): - self.data = data - def _fetch(self, start, end): - return self.data[start:end] +file_size = sync_wrapper(_file_size) diff -Nru fsspec-0.6.1/fsspec/implementations/jupyter.py fsspec-0.8.4/fsspec/implementations/jupyter.py --- fsspec-0.6.1/fsspec/implementations/jupyter.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/jupyter.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,121 @@ +import base64 +import io +import fsspec +import re +import requests + + +class JupyterFileSystem(fsspec.AbstractFileSystem): + """View of the files as seen by a Jupyter server (notebook or lab)""" + + protocol = ("jupyter", "jlab") + + def __init__(self, url, tok=None, **kwargs): + """ + + Parameters + ---------- + url : str + Base URL of the server, like "http://127.0.0.1:8888". May include + token in the string, which is given by the process when starting up + tok : str + If the token is obtained separately, can be given here + kwargs + """ + if "?" 
in url: + if tok is None: + try: + tok = re.findall("token=([a-f0-9]+)", url)[0] + except IndexError as e: + raise ValueError("Could not determine token") from e + url = url.split("?", 1)[0] + self.url = url.rstrip("/") + "/api/contents" + self.session = requests.Session() + if tok: + self.session.headers["Authorization"] = f"token {tok}" + + super().__init__(**kwargs) + + def ls(self, path, detail=True, **kwargs): + path = self._strip_protocol(path) + r = self.session.get(self.url + "/" + path) + if r.status_code == 404: + return FileNotFoundError(path) + r.raise_for_status() + out = r.json() + + if out["type"] == "directory": + out = out["content"] + else: + out = [out] + for o in out: + o["name"] = o.pop("path") + o.pop("content") + if o["type"] == "notebook": + o["type"] = "file" + if detail: + return out + return [o["name"] for o in out] + + def cat_file(self, path): + path = self._strip_protocol(path) + r = self.session.get(self.url + "/" + path) + if r.status_code == 404: + return FileNotFoundError(path) + r.raise_for_status() + out = r.json() + if out["format"] == "text": + # data should be binary + return out["content"].encode() + else: + return base64.b64decode(out["content"]) + + def pipe_file(self, path, value, **_): + path = self._strip_protocol(path) + json = { + "name": path.rsplit("/", 1)[-1], + "path": path, + "size": len(value), + "content": base64.b64encode(value), + "format": "base64", + "type": "file", + } + self.session.put(self.url + "/" + path, json=json) + + def mkdir(self, path, create_parents=True, **kwargs): + path = self._strip_protocol(path) + if create_parents and "/" in path: + self.mkdir(path.rsplit("/", 1)[0], True) + json = { + "name": path.rsplit("/", 1)[-1], + "path": path, + "size": None, + "content": None, + "type": "directory", + } + self.session.put(self.url + "/" + path, json=json) + + def _rm(self, path): + path = self._strip_protocol(path) + self.session.delete(self.url + "/" + path) + + def _open(self, path, mode="rb", **kwargs): + path = self._strip_protocol(path) + if mode == "rb": + data = self.cat_file(path) + return io.BytesIO(data) + else: + return SimpleFileWriter(self, path, mode="wb") + + +class SimpleFileWriter(fsspec.spec.AbstractBufferedFile): + def _upload_chunk(self, final=False): + """Never uploads a chunk until file is done + + Not suitable for large files + """ + if final is False: + return False + self.buffer.seek(0) + data = self.buffer.read() + self.fs.pipe_file(self.path, data) diff -Nru fsspec-0.6.1/fsspec/implementations/local.py fsspec-0.8.4/fsspec/implementations/local.py --- fsspec-0.6.1/fsspec/implementations/local.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/local.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,3 +1,4 @@ +import datetime import io import os import shutil @@ -20,8 +21,10 @@ """ root_marker = "/" + protocol = "file" + local_file = True - def __init__(self, auto_mkdir=True, **kwargs): + def __init__(self, auto_mkdir=False, **kwargs): super().__init__(**kwargs) self.auto_mkdir = auto_mkdir @@ -37,9 +40,10 @@ os.makedirs(path, exist_ok=exist_ok) def rmdir(self, path): + path = self._strip_protocol(path) os.rmdir(path) - def ls(self, path, detail=False): + def ls(self, path, detail=False, **kwargs): path = self._strip_protocol(path) paths = [posixpath.join(path, f) for f in os.listdir(path)] if detail: @@ -47,9 +51,9 @@ else: return paths - def glob(self, path, **kargs): + def glob(self, path, **kwargs): path = self._strip_protocol(path) - return super().glob(path) + return 
super().glob(path, **kwargs) def info(self, path, **kwargs): path = self._strip_protocol(path) @@ -76,25 +80,29 @@ result["size"] = 0 return result - def copy(self, path1, path2, **kwargs): - shutil.copyfile(path1, path2) - - def get(self, path1, path2, **kwargs): - if kwargs.get("recursive"): - return super(LocalFileSystem, self).get(path1, path2, **kwargs) + def cp_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1).rstrip("/") + path2 = self._strip_protocol(path2).rstrip("/") + if self.auto_mkdir: + self.makedirs(self._parent(path2), exist_ok=True) + if self.isfile(path1): + shutil.copyfile(path1, path2) else: - return self.copy(path1, path2, **kwargs) + self.mkdirs(path2, exist_ok=True) - def put(self, path1, path2, **kwargs): - if kwargs.get("recursive"): - return super(LocalFileSystem, self).put(path1, path2, **kwargs) - else: - return self.copy(path1, path2, **kwargs) + def get_file(self, path1, path2, **kwargs): + return self.cp_file(path1, path2, **kwargs) - def mv(self, path1, path2, **kwargs): + def put_file(self, path1, path2, **kwargs): + return self.cp_file(path1, path2, **kwargs) + + def mv_file(self, path1, path2, **kwargs): + path1 = self._strip_protocol(path1).rstrip("/") + path2 = self._strip_protocol(path2).rstrip("/") os.rename(path1, path2) def rm(self, path, recursive=False, maxdepth=None): + path = self._strip_protocol(path).rstrip("/") if recursive and self.isdir(path): shutil.rmtree(path) else: @@ -102,17 +110,27 @@ def _open(self, path, mode="rb", block_size=None, **kwargs): path = self._strip_protocol(path) - if self.auto_mkdir: + if self.auto_mkdir and "w" in mode: self.makedirs(self._parent(path), exist_ok=True) return LocalFileOpener(path, mode, fs=self, **kwargs) def touch(self, path, **kwargs): path = self._strip_protocol(path) + if self.auto_mkdir: + self.makedirs(self._parent(path), exist_ok=True) if self.exists(path): os.utime(path, None) else: open(path, "a").close() + def created(self, path): + info = self.info(path=path) + return datetime.datetime.utcfromtimestamp(info["created"]) + + def modified(self, path): + info = self.info(path=path) + return datetime.datetime.utcfromtimestamp(info["mtime"]) + @classmethod def _parent(cls, path): path = cls._strip_protocol(path).rstrip("/") @@ -126,6 +144,7 @@ path = stringify_path(path) if path.startswith("file://"): path = path[7:] + path = os.path.expanduser(path) return make_path_posix(path) def _isfilestore(self): @@ -137,17 +156,22 @@ def make_path_posix(path, sep=os.sep): """ Make path generic """ + if isinstance(path, (list, set, tuple)): + return type(path)(make_path_posix(p) for p in path) if re.match("/[A-Za-z]:", path): # for windows file URI like "file:///C:/folder/file" # or "file:///C:\\dir\\file" path = path[1:] if path.startswith("\\\\"): # special case for windows UNC/DFS-style paths, do nothing, - # jsut flip the slashes around (case below does not work!) + # just flip the slashes around (case below does not work!) return path.replace("\\", "/") - if path.startswith("\\") or re.match("[\\\\]*[A-Za-z]:", path): - # windows full path "\\server\\path" or "C:\\local\\path" + if re.match("[A-Za-z]:", path): + # windows full path like "C:\\local\\path" return path.lstrip("\\").replace("\\", "/").replace("//", "/") + if path.startswith("\\"): + # windows network path like "\\server\\path" + return "/" + path.lstrip("\\").replace("\\", "/").replace("//", "/") if ( sep not in path and "/" not in path @@ -179,6 +203,7 @@ else: # TODO: check if path is writable? 
i, name = tempfile.mkstemp() + os.close(i) # we want normal open and normal buffered file self.temp = name self.f = open(name, mode=self.mode) if "w" not in self.mode: @@ -195,13 +220,13 @@ return self.f.read(end - start) def __setstate__(self, state): + self.f = None + loc = state.pop("loc", None) + self.__dict__.update(state) if "r" in state["mode"]: - loc = self.state.pop("loc") + self.f = None self._open() self.f.seek(loc) - else: - self.f = None - self.__dict__.update(state) def __getstate__(self): d = self.__dict__.copy() @@ -216,7 +241,7 @@ def commit(self): if self.autocommit: raise RuntimeError("Can only commit if not already set to autocommit") - os.rename(self.temp, self.path) + os.replace(self.temp, self.path) def discard(self): if self.autocommit: @@ -224,9 +249,12 @@ os.remove(self.temp) def __fspath__(self): - # uniquely for fsspec implementations, this is a real path + # uniquely among fsspec implementations, this is a real, local path return self.path + def __iter__(self): + return self.f.__iter__() + def __getattr__(self, item): return getattr(self.f, item) diff -Nru fsspec-0.6.1/fsspec/implementations/memory.py fsspec-0.8.4/fsspec/implementations/memory.py --- fsspec-0.6.1/fsspec/implementations/memory.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/memory.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,6 +1,8 @@ from __future__ import print_function, division, absolute_import from io import BytesIO +from datetime import datetime +from errno import ENOTEMPTY from fsspec import AbstractFileSystem import logging @@ -8,14 +10,18 @@ class MemoryFileSystem(AbstractFileSystem): - """A filesystem based on a dict of BytesIO objects""" + """A filesystem based on a dict of BytesIO objects + + This is a global filesystem so instances of this class all point to the same + in memory filesystem. 
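In practice, data written through one instance is immediately visible to every other instance in the same process, because the backing ``store`` dict lives on the class. A small sketch with an arbitrary path:

    from fsspec.implementations.memory import MemoryFileSystem

    m = MemoryFileSystem()
    with m.open("/example.txt", "wb") as f:   # arbitrary in-memory path
        f.write(b"shared")

    # ``store`` is class-level, so another instance sees the same file
    assert MemoryFileSystem().cat("/example.txt") == b"shared"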
+ """ store = {} # global pseudo_dirs = [] protocol = "memory" root_marker = "" - def ls(self, path, detail=False): + def ls(self, path, detail=False, **kwargs): if path in self.store: # there is a key with this exact name, but could also be directory out = [ @@ -23,6 +29,7 @@ "name": path, "size": self.store[path].getbuffer().nbytes, "type": "file", + "created": self.store[path].created, } ] else: @@ -42,10 +49,15 @@ "name": has_slash + p, "size": self.store[p2].getbuffer().nbytes, "type": "file", + "created": self.store[p2].created, } ) - elif path and all( - (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) + elif ( + path + and len(path) < len(p.strip("/")) + and all( + (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) + ) ): # implicit directory ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) @@ -74,29 +86,33 @@ ) paths.add(ppath) for p2 in self.pseudo_dirs: - if self._parent(p2).strip("/").rstrip("/") == path: + if self._parent(p2).strip("/") == path and p2.strip("/") not in paths: out.append({"name": p2 + "/", "size": 0, "type": "directory"}) if detail: return out return sorted([f["name"] for f in out]) - def mkdir(self, path): + def mkdir(self, path, create_parents=True, **kwargs): path = path.rstrip("/") - if path not in self.pseudo_dirs: + if create_parents and self._parent(path): + self.mkdir(self._parent(path), create_parents, **kwargs) + if self._parent(path) and not self.isdir(self._parent(path)): + raise NotADirectoryError(self._parent(path)) + if path and path not in self.pseudo_dirs: self.pseudo_dirs.append(path) def rmdir(self, path): path = path.rstrip("/") if path in self.pseudo_dirs: - if self.ls(path) == []: + if not self.ls(path): self.pseudo_dirs.remove(path) else: - raise OSError("Directory %s not empty" % path) + raise OSError(ENOTEMPTY, "Directory not empty", path) else: raise FileNotFoundError(path) def exists(self, path): - return path in self.store + return path in self.store or path in self.pseudo_dirs def _open( self, @@ -110,10 +126,12 @@ if mode in ["rb", "ab", "rb+"]: if path in self.store: f = self.store[path] - if mode == "rb": - f.seek(0) - else: + if mode == "ab": + # position at the end of file f.seek(0, 2) + else: + # position at the beginning of file + f.seek(0) return f else: raise FileNotFoundError(path) @@ -123,14 +141,28 @@ m.commit() return m - def copy(self, path1, path2, **kwargs): - self.store[path2] = MemoryFile(self, path2, self.store[path1].getbuffer()) + def cp_file(self, path1, path2, **kwargs): + if self.isfile(path1): + self.store[path2] = MemoryFile(self, path2, self.store[path1].getbuffer()) + elif self.isdir(path1): + if path2 not in self.pseudo_dirs: + self.pseudo_dirs.append(path2) + else: + raise FileNotFoundError - def cat(self, path): - return self.store[path].getvalue() + def cat_file(self, path): + try: + return self.store[path].getvalue() + except KeyError: + raise FileNotFoundError(path) def _rm(self, path): - del self.store[path] + if self.isfile(path): + del self.store[path] + elif self.isdir(path): + self.rmdir(path) + else: + raise FileNotFoundError def size(self, path): """Size in bytes of the file at path""" @@ -142,14 +174,15 @@ class MemoryFile(BytesIO): """A BytesIO which can't close and works as a context manager - Can initialise with data + Can initialise with data. Each path should only be active once at any moment. 
No need to provide fs, path if auto-committing (default) """ - def __init__(self, fs, path, data=None): + def __init__(self, fs=None, path=None, data=None): self.fs = fs self.path = path + self.created = datetime.utcnow().timestamp() if data: self.write(data) self.size = len(data) @@ -159,7 +192,9 @@ return self def close(self): + position = self.tell() self.size = self.seek(0, 2) + self.seek(position) def discard(self): pass diff -Nru fsspec-0.6.1/fsspec/implementations/sftp.py fsspec-0.8.4/fsspec/implementations/sftp.py --- fsspec-0.6.1/fsspec/implementations/sftp.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/sftp.py 2020-10-14 16:51:19.000000000 +0000 @@ -10,6 +10,10 @@ """Files over SFTP/SSH Peer-to-peer filesystem over SSH using paramiko. + + Note: if using this with the ``open`` or ``open_files``, with full URLs, + there is no way to tell if a path is relative, so all paths are assumed + to be absolute. """ protocol = "sftp", "ssh" @@ -84,7 +88,7 @@ "size": s.st_size, "type": t, "uid": s.st_uid, - "gui": s.st_gid, + "gid": s.st_gid, "time": s.st_atime, "mtime": s.st_mtime, } diff -Nru fsspec-0.6.1/fsspec/implementations/smb.py fsspec-0.8.4/fsspec/implementations/smb.py --- fsspec-0.6.1/fsspec/implementations/smb.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/smb.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,287 @@ +# -*- coding: utf-8 -*- +""" +This module contains SMBFileSystem class responsible for handling access to +Windows Samba network shares by using package smbprotocol +""" + +from stat import S_ISDIR, S_ISLNK +import datetime +import uuid + +import smbclient + +from .. import AbstractFileSystem +from ..utils import infer_storage_options + +# ! pylint: disable=bad-continuation + + +class SMBFileSystem(AbstractFileSystem): + """Allow reading and writing to Windows and Samba network shares. + + When using `fsspec.open()` for getting a file-like object the URI + should be specified as this format: + `smb://workgroup;user:password@server:port/share/folder/file.csv`. + + Example:: + >>> import fsspec + >>> with fsspec.open('smb://myuser:mypassword@myserver.com/' + ... 'share/folder/file.csv') as smbfile: + ... df = pd.read_csv(smbfile, sep='|', header=None) + + Note that you need to pass in a valid hostname or IP address for the host + component of the URL. Do not use the Windows/NetBIOS machine name for the + host component. + + The first component of the path in the URL points to the name of the shared + folder. Subsequent path components will point to the directory/folder/file. + + The URL components `workgroup` , `user`, `password` and `port` may be + optional. + + .. note:: + + For working this source require `smbprotocol`_ to be installed, e.g.:: + + $ pip install smbprotocol + # or + # pip install smbprotocol[kerberos] + + .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements + + Note: if using this with the ``open`` or ``open_files``, with full URLs, + there is no way to tell if a path is relative, so all paths are assumed + to be absolute. + """ + + protocol = "smb" + + # pylint: disable=too-many-arguments + def __init__( + self, + host, + port=None, + username=None, + password=None, + timeout=60, + encrypt=None, + **kwargs + ): + """ + You can use _get_kwargs_from_urls to get some kwargs from + a reasonable SMB url. + + Authentication will be anonymous or integrated if username/password are not + given. 
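Besides the URL form shown in the class docstring, the filesystem can be constructed directly; a brief sketch with placeholder server, share and credential values, covering both the credentialed and the anonymous case:

    from fsspec.implementations.smb import SMBFileSystem

    # explicit credentials
    fs = SMBFileSystem("fileserver.example.com", username="alice", password="secret")
    fs.ls("/share/folder")

    # with username/password omitted, authentication is anonymous or integrated
    fs_anon = SMBFileSystem("fileserver.example.com")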
+ + Parameters + ---------- + host: str + The remote server name/ip to connect to + port: int + Port to connect with. Usually 445, sometimes 139. + username: str or None + Username to connect with. Required if Kerberos auth is not being used. + password: str of None + User's password on the server, if using username + timeout: int + Connection timeout in seconds + encrypt: bool + Whether to force encryption or not, once this has been set to True + the session cannot be changed back to False. + """ + super(SMBFileSystem, self).__init__(**kwargs) + self.host = host + self.port = port + self.username = username + self.password = password + self.timeout = timeout + self.encrypt = encrypt + self.temppath = kwargs.pop("temppath", "") + self._connect() + + def _connect(self): + smbclient.register_session( + self.host, + username=self.username, + password=self.password, + port=self.port, + encrypt=self.encrypt, + connection_timeout=self.timeout, + ) + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(path): + # smb://workgroup;user:password@host:port/share/folder/file.csv + out = infer_storage_options(path) + out.pop("path", None) + out.pop("protocol", None) + return out + + def mkdir(self, path, create_parents=True, **kwargs): + wpath = _as_unc_path(self.host, path) + if create_parents: + smbclient.makedirs(wpath, exist_ok=False, **kwargs) + else: + smbclient.mkdir(wpath, **kwargs) + + def makedirs(self, path, exist_ok=False): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + smbclient.makedirs(wpath, exist_ok=exist_ok) + + def rmdir(self, path): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + smbclient.rmdir(wpath) + + def info(self, path, **kwargs): + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath, **kwargs) + if S_ISDIR(stats.st_mode): + stype = "directory" + elif S_ISLNK(stats.st_mode): + stype = "link" + else: + stype = "file" + res = { + "name": path + "/" if stype == "directory" else path, + "size": stats.st_size, + "type": stype, + "uid": stats.st_uid, + "gid": stats.st_gid, + "time": stats.st_atime, + "mtime": stats.st_mtime, + } + return res + + def created(self, path): + """Return the created timestamp of a file as a datetime.datetime""" + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath) + return datetime.datetime.utcfromtimestamp(stats.st_ctime) + + def modified(self, path): + """Return the modified timestamp of a file as a datetime.datetime""" + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath) + return datetime.datetime.utcfromtimestamp(stats.st_mtime) + + def ls(self, path, detail=True, **kwargs): + unc = _as_unc_path(self.host, path) + listed = smbclient.listdir(unc, **kwargs) + dirs = ["/".join([path.rstrip("/"), p]) for p in listed] + if detail: + dirs = [self.info(d) for d in dirs] + return dirs + + # pylint: disable=too-many-arguments + def _open( + self, + path, + mode="rb", + block_size=-1, + autocommit=True, + cache_options=None, + **kwargs + ): + """ + block_size: int or None + If 0, no buffering, 1, line buffering, >1, buffer that many bytes + """ + bls = block_size if block_size is not None and block_size >= 0 else -1 + wpath = _as_unc_path(self.host, path) + if "w" in mode and autocommit is False: + temp = _as_temp_path(self.host, path, self.temppath) + return SMBFileOpener(wpath, temp, mode, block_size=bls, **kwargs) + return smbclient.open_file(wpath, mode, buffering=bls, 
**kwargs) + + def copy(self, path1, path2, **kwargs): + """ Copy within two locations in the same filesystem""" + wpath1 = _as_unc_path(self.host, path1) + wpath2 = _as_unc_path(self.host, path2) + smbclient.copyfile(wpath1, wpath2, **kwargs) + + def _rm(self, path): + if _share_has_path(path): + wpath = _as_unc_path(self.host, path) + stats = smbclient.stat(wpath) + if S_ISDIR(stats.st_mode): + smbclient.rmdir(wpath) + else: + smbclient.remove(wpath) + + def mv(self, path1, path2, **kwargs): + wpath1 = _as_unc_path(self.host, path1) + wpath2 = _as_unc_path(self.host, path2) + smbclient.rename(wpath1, wpath2, **kwargs) + + +def _as_unc_path(host, path): + rpath = path.replace("/", "\\") + unc = "\\\\{}{}".format(host, rpath) + return unc + + +def _as_temp_path(host, path, temppath): + share = path.split("/")[1] + temp_file = "/{}{}/{}".format(share, temppath, uuid.uuid4()) + unc = _as_unc_path(host, temp_file) + return unc + + +def _share_has_path(path): + parts = path.count("/") + if path.endswith("/"): + return parts > 2 + return parts > 1 + + +class SMBFileOpener(object): + """writes to remote temporary file, move on commit""" + + def __init__(self, path, temp, mode, block_size=-1, **kwargs): + self.path = path + self.temp = temp + self.mode = mode + self.block_size = block_size + self.kwargs = kwargs + self.smbfile = None + self._incontext = False + self._open() + + def _open(self): + if self.smbfile is None or self.smbfile.closed: + self.smbfile = smbclient.open_file( + self.temp, self.mode, buffering=self.block_size, **self.kwargs + ) + + def commit(self): + """Move temp file to definitive on success.""" + # TODO: use transaction support in SMB protocol + smbclient.replace(self.temp, self.path) + + def discard(self): + """Remove the temp file on failure.""" + smbclient.remove(self.temp) + + def __fspath__(self): + return self.path + + def __iter__(self): + return self.smbfile.__iter__() + + def __getattr__(self, item): + return getattr(self.smbfile, item) + + def __enter__(self): + self._incontext = True + return self.smbfile.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + self._incontext = False + self.smbfile.__exit__(exc_type, exc_value, traceback) diff -Nru fsspec-0.6.1/fsspec/implementations/tests/conftest.py fsspec-0.8.4/fsspec/implementations/tests/conftest.py --- fsspec-0.6.1/fsspec/implementations/tests/conftest.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/conftest.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,27 @@ +import tempfile + +import pytest + +from fsspec.implementations.local import LocalFileSystem + + +# A dummy filesystem that has a list of protocols +class MultiProtocolFileSystem(LocalFileSystem): + protocol = ["file", "other"] + + +FILESYSTEMS = {"local": LocalFileSystem, "multi": MultiProtocolFileSystem} + +READ_ONLY_FILESYSTEMS = [] + + +@pytest.fixture(scope="function") +def fs(request): + cls = FILESYSTEMS[request.param] + return cls() + + +@pytest.fixture(scope="function") +def temp_file(): + with tempfile.TemporaryDirectory() as temp_dir: + return temp_dir + "test-file" diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_cached.py fsspec-0.8.4/fsspec/implementations/tests/test_cached.py --- fsspec-0.6.1/fsspec/implementations/tests/test_cached.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_cached.py 2020-10-14 16:51:19.000000000 +0000 @@ -2,9 +2,11 @@ import shutil import pickle import pytest +import tempfile import fsspec from 
fsspec.implementations.cached import CachingFileSystem +from fsspec.compression import compr from .test_ftp import FTPFileSystem @@ -24,7 +26,7 @@ "filecache", target_protocol="file", cache_storage=cache_location ) - return (data, original_file, cache_location, fs) + return data, original_file, cache_location, fs def test_idempotent(): @@ -35,13 +37,50 @@ assert fs3.storage == fs.storage -def test_workflow(ftp_writable): +def test_blockcache_workflow(ftp_writable, tmp_path): + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host, port, user, pw) + with fs.open("/out", "wb") as f: + f.write(b"test\n" * 4096) + + fs_kwargs = dict( + skip_instance_cache=True, + cache_storage=str(tmp_path), + target_protocol="ftp", + target_options={"host": host, "port": port, "username": user, "password": pw}, + ) + + # Open the blockcache and read a little bit of the data + fs = fsspec.filesystem("blockcache", **fs_kwargs) + with fs.open("/out", "rb", block_size=5) as f: + assert f.read(5) == b"test\n" + + # Save the cache/close it + fs.save_cache() + del fs + + # Check that cache file only has the first two blocks + with open(tmp_path / "cache", "rb") as f: + cache = pickle.load(f) + assert "/out" in cache + assert cache["/out"]["blocks"] == [0, 1] + + # Reopen the same cache and read some more... + fs = fsspec.filesystem("blockcache", **fs_kwargs) + with fs.open("/out", block_size=5) as f: + assert f.read(5) == b"test\n" + f.seek(30) + assert f.read(5) == b"test\n" + + +@pytest.mark.parametrize("impl", ["filecache", "blockcache"]) +def test_workflow(ftp_writable, impl): host, port, user, pw = ftp_writable fs = FTPFileSystem(host, port, user, pw) with fs.open("/out", "wb") as f: f.write(b"test") fs = fsspec.filesystem( - "cached", + impl, target_protocol="ftp", target_options={"host": host, "port": port, "username": user, "password": pw}, ) @@ -49,9 +88,9 @@ with fs.open("/out") as f: assert os.listdir(fs.storage[-1]) assert f.read() == b"test" - assert fs.cached_files[-1]["ftp:///out"]["blocks"] + assert fs.cached_files[-1]["/out"]["blocks"] assert fs.cat("/out") == b"test" - assert fs.cached_files[-1]["ftp:///out"]["blocks"] is True + assert fs.cached_files[-1]["/out"]["blocks"] is True with fs.open("/out", "wb") as f: f.write(b"changed") @@ -59,6 +98,104 @@ assert fs.cat("/out") == b"test" # old value +@pytest.mark.parametrize("impl", ["simplecache", "blockcache"]) +def test_glob(ftp_writable, impl): + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host, port, user, pw) + with fs.open("/out", "wb") as f: + f.write(b"test") + with fs.open("/out2", "wb") as f: + f.write(b"test2") + fs = fsspec.filesystem( + impl, + target_protocol="ftp", + target_options={"host": host, "port": port, "username": user, "password": pw}, + ) + assert fs.glob("/wrong*") == [] + assert fs.glob("/ou*") == ["/out", "/out2"] + + +def test_write(): + tmp = str(tempfile.mkdtemp()) + fn = tmp + "afile" + url = "simplecache::file://" + fn + with fsspec.open(url, "wb") as f: + f.write(b"hello") + assert fn not in f.name + assert not os.listdir(tmp) + + assert open(fn, "rb").read() == b"hello" + + +def test_clear(): + import tempfile + + origin = tempfile.mkdtemp() + cache1 = tempfile.mkdtemp() + data = b"test data" + f1 = os.path.join(origin, "afile") + with open(f1, "wb") as f: + f.write(data) + + # populates first cache + fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1) + assert fs.cat(f1) == data + + assert "cache" in os.listdir(cache1) + assert len(os.listdir(cache1)) == 2 + assert 
fs._check_file(f1) + + fs.clear_cache() + assert not fs._check_file(f1) + assert len(os.listdir(cache1)) < 2 + + +def test_pop(): + import tempfile + + origin = tempfile.mkdtemp() + cache1 = tempfile.mkdtemp() + cache2 = tempfile.mkdtemp() + data = b"test data" + f1 = os.path.join(origin, "afile") + f2 = os.path.join(origin, "bfile") + with open(f1, "wb") as f: + f.write(data) + with open(f2, "wb") as f: + f.write(data) + + # populates first cache + fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1) + fs.cat(f1) + + # populates last cache if file not found in first cache + fs = fsspec.filesystem( + "filecache", target_protocol="file", cache_storage=[cache1, cache2] + ) + assert fs.cat(f2) == data + assert len(os.listdir(cache2)) == 2 + assert fs._check_file(f1) + with pytest.raises(PermissionError): + fs.pop_from_cache(f1) + fs.pop_from_cache(f2) + assert len(os.listdir(cache2)) == 1 + assert not fs._check_file(f2) + assert fs._check_file(f1) + + +def test_write_pickle_context(): + tmp = str(tempfile.mkdtemp()) + fn = tmp + "afile" + url = "simplecache::file://" + fn + f = fsspec.open(url, "wb").open() + f.write(b"hello ") + f.flush() + with pickle.loads(pickle.dumps(f)) as f2: + f2.write(b"world") + + assert open(fn, "rb").read() == b"hello world" + + def test_blocksize(ftp_writable): host, port, user, pw = ftp_writable fs = FTPFileSystem(host, port, user, pw) @@ -77,31 +214,27 @@ fs.open("/out_block", block_size=30) -def test_local_filecache_creates_dir_if_needed(): +@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"]) +def test_local_filecache_creates_dir_if_needed(impl): import tempfile original_location = tempfile.mkdtemp() - cache_location = "foofoobarbar" + cache_location = tempfile.mkdtemp() + os.rmdir(cache_location) assert not os.path.exists(cache_location) - try: - original_file = os.path.join(original_location, "afile") - data = b"test data" - with open(original_file, "wb") as f: - f.write(data) - - # we can access the file and read it - fs = fsspec.filesystem( - "filecache", target_protocol="file", cache_storage=cache_location - ) + original_file = os.path.join(original_location, "afile") + data = b"test data" + with open(original_file, "wb") as f: + f.write(data) - with fs.open(original_file, "rb") as f: - data_in_cache = f.read() + # we can access the file and read it + fs = fsspec.filesystem(impl, target_protocol="file", cache_storage=cache_location) - assert os.path.exists(cache_location) + with fs.open(original_file, "rb") as f: + data_in_cache = f.read() - finally: - shutil.rmtree(cache_location) + assert os.path.exists(cache_location) assert data_in_cache == data @@ -236,7 +369,8 @@ assert f.read() == data * 2 -def test_filecache_multicache_with_same_file_different_data_reads_from_first(): +@pytest.mark.parametrize("impl", ["filecache", "simplecache"]) +def test_filecache_multicache_with_same_file_different_data_reads_from_first(impl): import tempfile origin = tempfile.mkdtemp() @@ -248,29 +382,26 @@ f.write(data) # populate first cache - fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1) - assert fs.cat(f1) == data + fs1 = fsspec.filesystem(impl, target_protocol="file", cache_storage=cache1) + assert fs1.cat(f1) == data with open(f1, "wb") as f: f.write(data * 2) # populate second cache - fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache2) + fs2 = fsspec.filesystem(impl, target_protocol="file", cache_storage=cache2) - assert fs.cat(f1) == data * 2 + 
assert fs2.cat(f1) == data * 2 # the filenames in each cache are the same, but the data is different - assert os.listdir(cache1) == os.listdir(cache2) + assert sorted(os.listdir(cache1)) == sorted(os.listdir(cache2)) - fs = fsspec.filesystem( - "filecache", target_protocol="file", cache_storage=[cache1, cache2] - ) + fs = fsspec.filesystem(impl, target_protocol="file", cache_storage=[cache1, cache2]) assert fs.cat(f1) == data def test_filecache_with_checks(): - import tempfile import time origin = tempfile.mkdtemp() @@ -299,16 +430,219 @@ assert fs.cat(f1) == data * 2 # changed, since origin changed -def test_takes_fs_instance(): - import tempfile - +@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"]) +@pytest.mark.parametrize("fs", ["local", "multi"], indirect=["fs"]) +def test_takes_fs_instance(impl, fs): origin = tempfile.mkdtemp() data = b"test data" f1 = os.path.join(origin, "afile") with open(f1, "wb") as f: f.write(data) - fs = fsspec.filesystem("file") - fs2 = fsspec.filesystem("filecache", target_protocol=fs) + fs2 = fsspec.filesystem(impl, fs=fs) assert fs2.cat(f1) == data + + +def test_add_file_to_cache_after_save(local_filecache): + (data, original_file, cache_location, fs) = local_filecache + + fs.save_cache() + + fs.cat(original_file) + assert len(fs.cached_files[-1]) == 1 + + fs.save_cache() + + fs2 = fsspec.filesystem( + "filecache", + target_protocol="file", + cache_storage=cache_location, + do_not_use_cache_for_this_instance=True, # cache is masking the issue + ) + assert len(fs2.cached_files[-1]) == 1 + + +@pytest.mark.parametrize("impl", ["filecache", "simplecache"]) +@pytest.mark.parametrize("compression", ["gzip", "bz2"]) +def test_with_compression(impl, compression): + data = b"123456789" + tempdir = tempfile.mkdtemp() + cachedir = tempfile.mkdtemp() + fn = os.path.join(tempdir, "data") + f = compr[compression](open(fn, mode="wb"), mode="w") + f.write(data) + f.close() + + with fsspec.open( + "%s::%s" % (impl, fn), + "rb", + compression=compression, + **{impl: dict(same_names=True, cache_storage=cachedir)} + ) as f: + # stores original compressed file, uncompress on read + assert f.read() == data + assert "data" in os.listdir(cachedir) + assert open(os.path.join(cachedir, "data"), "rb").read() != data + + cachedir = tempfile.mkdtemp() + + with fsspec.open( + "%s::%s" % (impl, fn), + "rb", + **{impl: dict(same_names=True, compression=compression, cache_storage=cachedir)} + ) as f: + # stores uncompressed data + assert f.read() == data + assert "data" in os.listdir(cachedir) + assert open(os.path.join(cachedir, "data"), "rb").read() == data + + +@pytest.mark.parametrize("protocol", ["simplecache", "filecache"]) +def test_again(protocol): + fn = "memory://afile" + with fsspec.open(fn, "wb") as f: + f.write(b"hello") + d2 = tempfile.mkdtemp() + lurl = fsspec.open_local(f"{protocol}::{fn}", **{protocol: {"cache_storage": d2}}) + assert os.path.exists(lurl) + assert d2 in lurl + assert open(lurl, "rb").read() == b"hello" + + # remove cache dir + shutil.rmtree(d2) + assert not os.path.exists(lurl) + + # gets recreated + lurl = fsspec.open_local(f"{protocol}::{fn}", **{protocol: {"cache_storage": d2}}) + assert open(lurl, "rb").read() == b"hello" + + +@pytest.mark.parametrize("protocol", ["simplecache", "filecache"]) +def test_multi_cache(protocol): + with fsspec.open_files("memory://file*", "wb", num=2) as files: + for f in files: + f.write(b"hello") + + d2 = tempfile.mkdtemp() + lurl = fsspec.open_local( + f"{protocol}::memory://file*", + 
mode="rb", + **{protocol: {"cache_storage": d2, "same_names": True}} + ) + assert all(d2 in u for u in lurl) + assert all(os.path.basename(f) in ["file0", "file1"] for f in lurl) + assert all(open(u, "rb").read() == b"hello" for u in lurl) + + d2 = tempfile.mkdtemp() + lurl = fsspec.open_files( + f"{protocol}::memory://file*", + mode="rb", + **{protocol: {"cache_storage": d2, "same_names": True}} + ) + with lurl as files: + for f in files: + assert os.path.basename(f.name) in ["file0", "file1"] + assert f.read() == b"hello" + fs = fsspec.filesystem("memory") + fs.store.clear() + with lurl as files: + for f in files: + assert os.path.basename(f.name) in ["file0", "file1"] + assert f.read() == b"hello" + + +@pytest.mark.parametrize("protocol", ["simplecache", "filecache", "blockcache"]) +def test_multi_cat(protocol, ftp_writable): + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host, port, user, pw) + for fn in {"/file0", "/file1"}: + with fs.open(fn, "wb") as f: + f.write(b"hello") + + d2 = tempfile.mkdtemp() + fs = fsspec.filesystem(protocol, storage=d2, fs=fs) + assert fs.cat("file*") == {"/file0": b"hello", "/file1": b"hello"} + + +@pytest.mark.parametrize("protocol", ["simplecache", "filecache"]) +def test_multi_cache_chain(protocol): + import zipfile + + d = tempfile.mkdtemp() + fn = os.path.join(d, "test.zip") + zipfile.ZipFile(fn, mode="w").open("test", "w").write(b"hello") + + with fsspec.open_files(f"zip://test::{protocol}::file://{fn}") as files: + assert d not in files[0]._fileobj._file.name + assert files[0].read() == b"hello" + + # special test contains "file:" string + fn = os.path.join(d, "file.zip") + zipfile.ZipFile(fn, mode="w").open("file", "w").write(b"hello") + with fsspec.open_files(f"zip://file::{protocol}::file://{fn}") as files: + assert d not in files[0]._fileobj._file.name + assert files[0].read() == b"hello" + + +@pytest.mark.parametrize("protocol", ["blockcache", "simplecache", "filecache"]) +def test_strip(protocol): + fs = fsspec.filesystem(protocol, target_protocol="memory") + url1 = "memory://afile" + assert fs._strip_protocol(url1) == "afile" + assert fs._strip_protocol(protocol + "://afile") == "afile" + assert fs._strip_protocol(protocol + "::memory://afile") == "afile" + + +@pytest.mark.parametrize("protocol", ["simplecache", "filecache"]) +def test_cached_write(protocol): + d = tempfile.mkdtemp() + with fsspec.open_files(f"{protocol}::file://{d}/*.out", mode="wb", num=2) as files: + for f in files: + f.write(b"data") + + assert sorted(os.listdir(d)) == ["0.out", "1.out"] + + +def test_expiry(): + import time + + d = tempfile.mkdtemp() + fs = fsspec.filesystem("memory") + fn = "afile" + fn0 = "memory://afile" + data = b"hello" + with fs.open(fn0, "wb") as f: + f.write(data) + + fs = fsspec.filesystem( + "filecache", + fs=fs, + cache_storage=d, + check_files=False, + expiry_time=0.1, + same_names=True, + ) + + # get file + assert fs._check_file(fn0) is False + assert fs.open(fn0, mode="rb").read() == data + start_time = fs.cached_files[-1][fn]["time"] + + # cache time.. + assert fs.last_cache - start_time < 0.19 + + # cache should have refreshed + time.sleep(0.01) + + # file should still be valid... re-read + assert fs.open(fn0, mode="rb").read() == data + detail, _ = fs._check_file(fn0) + assert detail["time"] == start_time + + time.sleep(0.11) + # file should still be invalid... 
re-read + assert fs._check_file(fn0) is False + assert fs.open(fn0, mode="rb").read() == data + detail, _ = fs._check_file(fn0) + assert detail["time"] - start_time > 0.09 diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_common.py fsspec-0.8.4/fsspec/implementations/tests/test_common.py --- fsspec-0.6.1/fsspec/implementations/tests/test_common.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_common.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,31 @@ +import datetime +import time +import pytest + +from fsspec import AbstractFileSystem +from fsspec.implementations.tests.conftest import READ_ONLY_FILESYSTEMS + + +@pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) +def test_created(fs: AbstractFileSystem, temp_file): + try: + fs.touch(temp_file) + created = fs.created(path=temp_file) + assert isinstance(created, datetime.datetime) + finally: + if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)): + fs.rm(temp_file) + + +@pytest.mark.parametrize("fs", ["local"], indirect=["fs"]) +def test_modified(fs: AbstractFileSystem, temp_file): + try: + fs.touch(temp_file) + created = fs.created(path=temp_file) + time.sleep(0.05) + fs.touch(temp_file) + modified = fs.modified(path=temp_file) + assert isinstance(modified, datetime.datetime) + assert modified > created + finally: + fs.rm(temp_file) diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_dask.py fsspec-0.8.4/fsspec/implementations/tests/test_dask.py --- fsspec-0.6.1/fsspec/implementations/tests/test_dask.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_dask.py 2020-10-14 16:51:19.000000000 +0000 @@ -24,6 +24,6 @@ def test_basic(cli): - fs = fsspec.filesystem("dask", remote_protocol="memory") + fs = fsspec.filesystem("dask", target_protocol="memory") assert fs.ls("") == ["afile"] assert fs.cat("afile") == b"data" diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_ftp.py fsspec-0.8.4/fsspec/implementations/tests/test_ftp.py --- fsspec-0.6.1/fsspec/implementations/tests/test_ftp.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_ftp.py 2020-10-14 16:51:19.000000000 +0000 @@ -115,3 +115,17 @@ fs.rm(fn) assert not fs.exists(fn) + + +def test_transaction_with_cache(ftp_writable): + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host, port, user, pw) + fs.mkdir("/tmp") + fs.mkdir("/tmp/dir") + assert "dir" in fs.ls("/tmp", detail=False) + + with fs.transaction: + fs.rmdir("/tmp/dir") + + assert "dir" not in fs.ls("/tmp", detail=False) + assert not fs.exists("/tmp/dir") diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_git.py fsspec-0.8.4/fsspec/implementations/tests/test_git.py --- fsspec-0.6.1/fsspec/implementations/tests/test_git.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_git.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,55 @@ +import fsspec +import os +import pytest +import shutil +import tempfile +import subprocess + +pygit2 = pytest.importorskip("pygit2") + + +@pytest.fixture() +def repo(): + d = tempfile.mkdtemp() + os.chdir(d) + subprocess.call("git init", shell=True, cwd=d) + subprocess.call("git init", shell=True, cwd=d) + subprocess.call('git config user.email "you@example.com"', shell=True, cwd=d) + subprocess.call('git config user.name "Your Name"', shell=True, cwd=d) + open(os.path.join(d, "file1"), "wb").write(b"data0") + subprocess.call("git add file1", shell=True, cwd=d) + subprocess.call('git commit -m "init"', 
shell=True, cwd=d) + sha = open(os.path.join(d, ".git/refs/heads/master"), "r").read().strip() + open(os.path.join(d, "file1"), "wb").write(b"data00") + subprocess.check_output('git commit -a -m "tagger"', shell=True, cwd=d) + subprocess.call('git tag -a thetag -m "make tag"', shell=True, cwd=d) + open(os.path.join(d, "file2"), "wb").write(b"data000") + subprocess.call("git add file2", shell=True) + subprocess.call('git commit -m "master tip"', shell=True, cwd=d) + subprocess.call("git checkout -b abranch", shell=True, cwd=d) + os.mkdir("inner") + open(os.path.join(d, "inner", "file1"), "wb").write(b"data3") + subprocess.call("git add inner/file1", shell=True, cwd=d) + subprocess.call('git commit -m "branch tip"', shell=True, cwd=d) + try: + yield d, sha + finally: + shutil.rmtree(d) + + +def test_refs(repo): + d, sha = repo + with fsspec.open("git://file1", path=d, ref=sha) as f: + assert f.read() == b"data0" + + with fsspec.open("git://file1", path=d, ref="thetag") as f: + assert f.read() == b"data00" + + with fsspec.open("git://file2", path=d, ref="master") as f: + assert f.read() == b"data000" + + with fsspec.open("git://file2", path=d, ref=None) as f: + assert f.read() == b"data000" + + with fsspec.open("git://inner/file1", path=d, ref="abranch") as f: + assert f.read() == b"data3" diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_hdfs.py fsspec-0.8.4/fsspec/implementations/tests/test_hdfs.py --- fsspec-0.6.1/fsspec/implementations/tests/test_hdfs.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_hdfs.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,62 @@ +import pytest +import fsspec + +pyarrow = pytest.importorskip("pyarrow") + +basedir = "/tmp/test-fsspec" +data = b"\n".join([b"some test data"] * 1000) + + +@pytest.fixture +def hdfs(request): + try: + hdfs = pyarrow.hdfs.connect() + except IOError: + pytest.skip("No HDFS configured") + + if hdfs.exists(basedir): + hdfs.rm(basedir, recursive=True) + + hdfs.mkdir(basedir) + + with hdfs.open(basedir + "/file", "wb") as f: + f.write(data) + + yield hdfs + + if hdfs.exists(basedir): + hdfs.rm(basedir, recursive=True) + + +def test_ls(hdfs): + h = fsspec.filesystem("hdfs") + out = [f["name"] for f in h.ls(basedir)] + assert out == hdfs.ls(basedir) + + +def test_walk(hdfs): + h = fsspec.filesystem("hdfs") + out = h.walk(basedir) + assert list(out) == list(hdfs.walk(basedir)) + + +def test_isdir(hdfs): + h = fsspec.filesystem("hdfs") + assert h.isdir(basedir) + assert not h.isdir(basedir + "/file") + + +def test_exists(hdfs): + h = fsspec.filesystem("hdfs") + assert not h.exists(basedir + "/notafile") + + +def test_read(hdfs): + h = fsspec.filesystem("hdfs") + out = basedir + "/file" + with h.open(out, "rb") as f: + assert f.read() == data + with h.open(out, "rb", block_size=0) as f: + assert f.read() == data + with h.open(out, "rb") as f: + assert f.read(100) + f.read() == data diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_http.py fsspec-0.8.4/fsspec/implementations/tests/test_http.py --- fsspec-0.6.1/fsspec/implementations/tests/test_http.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_http.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,5 +1,9 @@ +import contextlib +import asyncio +import os import pytest from http.server import BaseHTTPRequestHandler, HTTPServer +import sys import threading import fsspec @@ -8,6 +12,7 @@ data = b"\n".join([b"some test data"] * 1000) realfile = "http://localhost:%i/index/realfile" % port index = b'Link' % 
realfile.encode() +win = os.name == "nt" class HTTPTestHandler(BaseHTTPRequestHandler): @@ -22,17 +27,19 @@ self.wfile.write(data) def do_GET(self): - if self.path not in ["/index/realfile", "/index"]: + if self.path.rstrip("/") not in [ + "/index/realfile", + "/index/otherfile", + "/index", + ]: self._respond(404) return - d = data if self.path == "/index/realfile" else index + d = data if self.path in ["/index/realfile", "/index/otherfile"] else index if "Range" in self.headers: ran = self.headers["Range"] b, ran = ran.split("=") start, end = ran.split("-") - print(start) - print(end) d = d[int(start) : int(end) + 1] if "give_length" in self.headers: response_headers = {"Content-Length": len(d)} @@ -47,7 +54,7 @@ self._respond(405) return d = data if self.path == "/index/realfile" else index - if self.path not in ["/index/realfile", "/index"]: + if self.path.rstrip("/") not in ["/index/realfile", "/index"]: self._respond(404) elif "give_length" in self.headers: response_headers = {"Content-Length": len(d)} @@ -61,8 +68,8 @@ self._respond(200) # OK response, but no useful info -@pytest.fixture(scope="module") -def server(): +@contextlib.contextmanager +def serve(): server_address = ("", port) httpd = HTTPServer(server_address, HTTPTestHandler) th = threading.Thread(target=httpd.serve_forever) @@ -76,12 +83,24 @@ th.join() +@pytest.fixture(scope="module") +def server(): + with serve() as s: + yield s + + def test_list(server): h = fsspec.filesystem("http") out = h.glob(server + "/index/*") assert out == [server + "/index/realfile"] +def test_isdir(server): + h = fsspec.filesystem("http") + assert h.isdir(server + "/index/") + assert not h.isdir(server + "/index/realfile") + + def test_policy_arg(server): h = fsspec.filesystem("http", size_policy="get") out = h.glob(server + "/index/*") @@ -91,6 +110,8 @@ def test_exists(server): h = fsspec.filesystem("http") assert not h.exists(server + "/notafile") + with pytest.raises(FileNotFoundError): + h.cat(server + "/notafile") def test_read(server): @@ -127,10 +148,13 @@ if headers: assert f.size == len(data) assert f.read(5) == data[:5] - # python server does not respect bytes range request - # we actually get all the data - f.seek(5, 1) - assert f.read(5) == data[10:15] + + if headers: + f.seek(5, 1) + assert f.read(5) == data[10:15] + else: + with pytest.raises(ValueError): + f.seek(5, 1) def test_mapper_url(server): @@ -152,3 +176,116 @@ with h.open(url, "rb") as f: assert f.read() == data + + +def test_download(server, tmpdir): + h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) + url = server + "/index/realfile" + fn = os.path.join(tmpdir, "afile") + h.get(url, fn) + assert open(fn, "rb").read() == data + + +def test_multi_download(server, tmpdir): + h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) + urla = server + "/index/realfile" + urlb = server + "/index/otherfile" + fna = os.path.join(tmpdir, "afile") + fnb = os.path.join(tmpdir, "bfile") + h.get([urla, urlb], [fna, fnb]) + assert open(fna, "rb").read() == data + assert open(fnb, "rb").read() == data + + +def test_mcat(server): + h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) + urla = server + "/index/realfile" + urlb = server + "/index/otherfile" + out = h.cat([urla, urlb]) + assert out == {urla: data, urlb: data} + + +def test_mcat_cache(server): + urla = server + "/index/realfile" + urlb = server + "/index/otherfile" + fs = fsspec.filesystem("simplecache", target_protocol="http") + 
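The download and cat tests above exercise list-of-URLs calls on the HTTP filesystem; a condensed sketch of that usage (the URLs and local paths are placeholders)::

    import fsspec

    # hypothetical URLs; any HTTP server answering GET (and ideally HEAD) will do
    urls = ["http://example.com/data/a.bin", "http://example.com/data/b.bin"]

    fs = fsspec.filesystem("http")
    contents = fs.cat(urls)                      # dict mapping each URL to its bytes
    fs.get(urls, ["/tmp/a.bin", "/tmp/b.bin"])   # batched download to local paths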
assert fs.cat([urla, urlb]) == {urla: data, urlb: data} + + +def test_mcat_expand(server): + h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) + out = h.cat(server + "/index/*") + assert out == {server + "/index/realfile": data} + + +@pytest.mark.xfail( + condition=sys.flags.optimize > 1, reason="no docstrings when optimised" +) +def test_docstring(): + h = fsspec.filesystem("http") + # most methods have empty docstrings and draw from base class, but this one + # is generated + assert h.pipe.__doc__ + + +def test_async_other_thread(server): + import threading + + loop = asyncio.get_event_loop() + th = threading.Thread(target=loop.run_forever) + + th.daemon = True + th.start() + fs = fsspec.filesystem("http", asynchronous=False, loop=loop) + cor = fs._cat([server + "/index/realfile"]) + fut = asyncio.run_coroutine_threadsafe(cor, loop=loop) + assert fut.result() == [data] + + +@pytest.mark.skipif(sys.version_info < (3, 7), reason="no asyncio.run in py36") +def test_async_this_thread(server): + async def _(): + loop = asyncio.get_event_loop() + fs = fsspec.filesystem("http", asynchronous=True, loop=loop) + + # fails because client creation has not yet been awaited + assert isinstance( + (await fs._cat([server + "/index/realfile"]))[0], RuntimeError + ) + with pytest.raises(RuntimeError): + fs.cat([server + "/index/realfile"]) + + await fs.set_session() # creates client + + out = await fs._cat([server + "/index/realfile"]) + del fs + assert out == [data] + + asyncio.run(_()) + + +def _inner_pass(fs, q, fn): + # pass the s3 instance, but don't use it; in new process, the instance + # cache should be skipped to make a new instance + fs = fsspec.filesystem("http") + q.put(fs.cat(fn)) + + +@pytest.mark.skipif( + bool(os.environ.get("TRAVIS", "")), reason="Travis is weird in many ways" +) +@pytest.mark.parametrize("method", ["spawn", "forkserver", "fork"]) +def test_processes(server, method): + import multiprocessing as mp + + if win and method != "spawn": + pytest.skip("Windows can only spawn") + ctx = mp.get_context(method) + fn = server + "/index/realfile" + fs = fsspec.filesystem("http") + + q = ctx.Queue() + p = ctx.Process(target=_inner_pass, args=(fs, q, fn)) + p.start() + assert q.get() == fs.cat(fn) + p.join() diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_jupyter.py fsspec-0.8.4/fsspec/implementations/tests/test_jupyter.py --- fsspec-0.6.1/fsspec/implementations/tests/test_jupyter.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_jupyter.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,68 @@ +import os +import re +import shlex +import subprocess +import time +import pytest + +import fsspec + +pytest.importorskip("notebook") +requests = pytest.importorskip("requests") + + +@pytest.fixture() +def jupyter(tmpdir): + import requests + + tmpdir = str(tmpdir) + try: + P = subprocess.Popen( + shlex.split( + f"jupyter notebook --notebook-dir={tmpdir}" f" --no-browser --port=5566" + ), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + ) + except FileNotFoundError: + pytest.skip("notebook not installed correctly") + try: + timeout = 5 + while True: + try: + r = requests.get("http://127.0.0.1:5566/") + r.raise_for_status() + break + except (requests.exceptions.BaseHTTPError, IOError): + time.sleep(0.1) + timeout -= 0.1 + pytest.skip("Timed out for jupyter") + txt = P.stdout.read(600).decode() + try: + url = re.findall("(http[s]*://[^\\n]+)", txt)[0] + except IndexError: + 
pytest.skip("No notebook URL: " + txt) # debug on fail + yield url, tmpdir + finally: + P.terminate() + + +def test_simple(jupyter): + url, d = jupyter + fs = fsspec.filesystem("jupyter", url=url) + assert fs.ls("") == [] + + fs.pipe("afile", b"data") + assert fs.cat("afile") == b"data" + assert "afile" in os.listdir(d) + + with fs.open("bfile", "wb") as f: + f.write(b"more") + with fs.open("bfile", "rb") as f: + assert f.read() == b"more" + + assert fs.info("bfile")["size"] == 4 + fs.rm("afile") + + assert "afile" not in os.listdir(d) diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_local.py fsspec-0.8.4/fsspec/implementations/tests/test_local.py --- fsspec-0.6.1/fsspec/implementations/tests/test_local.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_local.py 2020-10-14 16:51:19.000000000 +0000 @@ -3,8 +3,11 @@ import gzip import os import os.path +import pickle import sys from contextlib import contextmanager +from distutils.version import LooseVersion +import posixpath import tempfile import pytest @@ -12,6 +15,7 @@ from fsspec.core import open_files, get_fs_token_paths, OpenFile from fsspec.implementations.local import LocalFileSystem, make_path_posix from fsspec import compression +from fsspec.tests.test_utils import WIN files = { ".test.accounts.1.json": ( @@ -33,11 +37,12 @@ ".test.fakedata.1.csv": (b"a,b\n" b"1,2\n"), ".test.fakedata.2.csv": (b"a,b\n" b"3,4\n"), } +odir = os.getcwd() @contextmanager def filetexts(d, open=open, mode="t"): - """ Dumps a number of textfiles to disk + """Dumps a number of textfiles to disk d - dict a mapping from filename to text like {'a.csv': '1,1\n2,2'} @@ -46,7 +51,6 @@ automatically switch to a temporary current directory, to avoid race conditions when running tests in parallel. 
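The Jupyter test above reduces to the following round trip through the contents API (the notebook server URL and token are placeholders)::

    import fsspec

    # the server URL, including its token, is an invented example value
    fs = fsspec.filesystem("jupyter", url="http://127.0.0.1:8888/?token=abc123")
    fs.pipe("afile", b"data")            # write bytes through the contents API
    assert fs.cat("afile") == b"data"    # read them back
    fs.rm("afile")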
""" - odir = os.getcwd() dirname = tempfile.mkdtemp() try: os.chdir(dirname) @@ -73,8 +77,8 @@ def test_urlpath_inference_strips_protocol(tmpdir): - tmpdir = str(tmpdir) - paths = [os.path.join(tmpdir, "test.%02d.csv" % i) for i in range(20)] + tmpdir = make_path_posix(str(tmpdir)) + paths = ["/".join([tmpdir, "test.%02d.csv" % i]) for i in range(20)] for path in paths: with open(path, "wb") as f: @@ -100,13 +104,7 @@ # Protocols differ with pytest.raises(ValueError) as err: get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"]) - assert "same protocol" in str(err.value) - - # Unknown type - with pytest.raises(TypeError): - get_fs_token_paths( - {"sets/are.csv", "unordered/so/they.csv", "should/not/be.csvallowed.csv"} - ) + assert "protocol" in str(err.value) def test_urlpath_expand_read(): @@ -204,7 +202,9 @@ with filetexts(files, mode="b"): for f in files.keys(): assert fs.isfile(f) + assert fs.isfile("file://" + f) assert not fs.isfile("not-a-file") + assert not fs.isfile("file://not-a-file") def test_isdir(): @@ -258,7 +258,7 @@ f.write("hi") out = LocalFileSystem().glob("./*") assert len(out) == 1 - assert os.sep in out[0] + assert "/" in out[0] assert "tmp" in out[0] # I don't know what this was testing - but should avoid local paths anyway @@ -272,35 +272,65 @@ @pytest.mark.parametrize("sep", ["/", "\\"]) @pytest.mark.parametrize("chars", ["+", "++", "(", ")", "|", "\\"]) def test_glob_weird_characters(tmpdir, sep, chars): - tmpdir = str(tmpdir) + tmpdir = make_path_posix(str(tmpdir)) subdir = tmpdir + sep + "test" + chars + "x" - os.mkdir(subdir) + try: + os.makedirs(subdir, exist_ok=True) + except OSError as e: + if WIN and "label syntax" in str(e): + pytest.xfail("Illegal windows directory name") + else: + raise with open(subdir + sep + "tmp", "w") as f: f.write("hi") out = LocalFileSystem().glob(subdir + sep + "*") assert len(out) == 1 - assert os.sep in out[0] + assert "/" in out[0] assert "tmp" in out[0] def test_globfind_dirs(tmpdir): - tmpdir = str(tmpdir) + tmpdir = make_path_posix(str(tmpdir)) fs = fsspec.filesystem("file") fs.mkdir(tmpdir + "/dir") fs.touch(tmpdir + "/dir/afile") assert [tmpdir + "/dir"] == fs.glob(tmpdir + "/*") + assert fs.glob(tmpdir + "/*", detail=True)[tmpdir + "/dir"]["type"] == "directory" + assert ( + fs.glob(tmpdir + "/dir/*", detail=True)[tmpdir + "/dir/afile"]["type"] == "file" + ) assert [tmpdir + "/dir/afile"] == fs.find(tmpdir) assert [tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(tmpdir, withdirs=True) +def test_touch(tmpdir): + import time + + fn = tmpdir + "/in/file" + fs = fsspec.filesystem("file", auto_mkdir=False) + with pytest.raises(OSError): + fs.touch(fn) + fs = fsspec.filesystem("file", auto_mkdir=True) + fs.touch(fn) + info = fs.info(fn) + time.sleep(0.2) + fs.touch(fn) + info2 = fs.info(fn) + if not WIN: + assert info2["mtime"] > info["mtime"] + + def test_get_pyarrow_filesystem(): pa = pytest.importorskip("pyarrow") fs = LocalFileSystem() - assert isinstance(fs, pa.filesystem.FileSystem) - assert fs._get_pyarrow_filesystem() is fs + if LooseVersion(pa.__version__) < LooseVersion("2.0"): + assert isinstance(fs, pa.filesystem.FileSystem) + assert fs._get_pyarrow_filesystem() is fs + else: + assert not isinstance(fs, pa.filesystem.FileSystem) class UnknownFileSystem(object): pass @@ -309,7 +339,7 @@ def test_directories(tmpdir): - tmpdir = str(tmpdir) + tmpdir = make_path_posix(str(tmpdir)) fs = LocalFileSystem() fs.mkdir(tmpdir + "/dir") assert tmpdir + "/dir" in fs.ls(tmpdir) @@ -319,8 +349,8 @@ def 
test_file_ops(tmpdir): - tmpdir = str(tmpdir) - fs = LocalFileSystem() + tmpdir = make_path_posix(str(tmpdir)) + fs = LocalFileSystem(auto_mkdir=True) with pytest.raises(FileNotFoundError): fs.info(tmpdir + "/nofile") fs.touch(tmpdir + "/afile") @@ -339,6 +369,9 @@ fs.move(tmpdir + "/afile", tmpdir + "/afile3") assert not fs.exists(tmpdir + "/afile") + fs.cp(tmpdir + "/afile3", tmpdir + "/deeply/nested/file") + assert fs.exists(tmpdir + "/deeply/nested/file") + fs.rm(tmpdir + "/afile3", recursive=True) assert not fs.exists(tmpdir + "/afile3") @@ -347,8 +380,8 @@ def test_recursive_get_put(tmpdir): - tmpdir = str(tmpdir) - fs = LocalFileSystem() + tmpdir = make_path_posix(str(tmpdir)) + fs = LocalFileSystem(auto_mkdir=True) fs.mkdir(tmpdir + "/a1/a2/a3") fs.touch(tmpdir + "/a1/a2/a3/afile") @@ -386,10 +419,19 @@ def test_make_path_posix(): cwd = os.getcwd() - assert make_path_posix("/a/posix/path") == "/a/posix/path" - assert make_path_posix("/posix") == "/posix" - assert make_path_posix("relpath", sep="/") == os.path.join(cwd, "relpath") - assert make_path_posix("rel/path", sep="/") == os.path.join(cwd, "rel/path") + if WIN: + drive = cwd[0] + assert make_path_posix("/a/posix/path") == f"{drive}:/a/posix/path" + assert make_path_posix("/posix") == f"{drive}:/posix" + else: + assert make_path_posix("/a/posix/path") == "/a/posix/path" + assert make_path_posix("/posix") == "/posix" + assert make_path_posix("relpath", sep="/") == posixpath.join( + make_path_posix(cwd), "relpath" + ) + assert make_path_posix("rel/path", sep="/") == posixpath.join( + make_path_posix(cwd), "rel/path" + ) assert make_path_posix("C:\\path", sep="\\") == "C:/path" assert ( make_path_posix( @@ -397,6 +439,12 @@ ) == "//windows-server/someshare/path/more/path/dir/foo.parquet" ) + assert ( + make_path_posix( + r"\\SERVER\UserHomeFolder$\me\My Documents\project1\data\filen.csv" + ) + == "//SERVER/UserHomeFolder$/me/My Documents/project1/data/filen.csv" + ) assert "/" in make_path_posix("rel\\path", sep="\\") @@ -408,8 +456,14 @@ data = b"my target data" with open(fn0, "wb") as f: f.write(data) - os.symlink(fn0, fn1) - os.symlink(fn0, fn2) + try: + os.symlink(fn0, fn1) + os.symlink(fn0, fn2) + except OSError: + if WIN: + pytest.xfail("Ran on win without admin permissions") + else: + raise fs = LocalFileSystem() assert fs.info(fn0)["type"] == "file" @@ -432,3 +486,67 @@ def test_isfilestore(): fs = LocalFileSystem(auto_mkdir=False) assert fs._isfilestore() + + +def test_pickle(tmpdir): + fs = LocalFileSystem() + tmpdir = str(tmpdir) + fn0 = os.path.join(tmpdir, "target") + + with open(fn0, "wb") as f: + f.write(b"data") + + f = fs.open(fn0, "rb") + f.seek(1) + f2 = pickle.loads(pickle.dumps(f)) + assert f2.read() == f.read() + + f = fs.open(fn0, "wb") + with pytest.raises(ValueError): + pickle.dumps(f) + + +def test_strip_protocol_expanduser(): + path = "file://~\\foo\\bar" if sys.platform == "win32" else "file://~/foo/bar" + stripped = LocalFileSystem._strip_protocol(path) + assert path != stripped + assert "file://" not in stripped + assert stripped.startswith(os.path.expanduser("~").replace("\\", "/")) + + +def test_iterable(tmpdir): + data = b"a\nhello\noi" + fn = os.path.join(tmpdir, "test") + with open(fn, "wb") as f: + f.write(data) + of = fsspec.open("file://%s" % fn, "rb") + with of as f: + out = list(f) + assert b"".join(out) == data + + +def test_mv_empty(tmpdir): + localfs = fsspec.filesystem("file") + src = os.path.join(str(tmpdir), "src") + dest = os.path.join(str(tmpdir), "dest") + assert localfs.isdir(src) 
is False + localfs.mkdir(src) + assert localfs.isdir(src) + localfs.move(src, dest, recursive=True) + assert localfs.isdir(src) is False + assert localfs.isdir(dest) + assert localfs.info(dest) + + +def test_mv_recursive(tmpdir): + localfs = fsspec.filesystem("file") + src = os.path.join(str(tmpdir), "src") + dest = os.path.join(str(tmpdir), "dest") + assert localfs.isdir(src) is False + localfs.mkdir(src) + assert localfs.isdir(src) + localfs.touch(os.path.join(src, "afile")) + localfs.move(src, dest, recursive=True) + assert localfs.isdir(src) is False + assert localfs.isdir(dest) + assert localfs.info(os.path.join(dest, "afile")) diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_memory.py fsspec-0.8.4/fsspec/implementations/tests/test_memory.py --- fsspec-0.6.1/fsspec/implementations/tests/test_memory.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_memory.py 2020-10-14 16:51:19.000000000 +0000 @@ -5,8 +5,17 @@ def test_1(m): m.touch("/somefile") # NB: is found with or without initial / m.touch("afiles/and/anothers") - assert m.find("") == ["afiles/and/anothers", "somefile"] - assert list(m.get_mapper("")) == ["afiles/and/anothers", "somefile"] + files = m.find("") + if "somefile" in files: + assert files == ["afiles/and/anothers", "somefile"] + else: + assert files == ["/somefile", "afiles/and/anothers"] + + files = sorted(m.get_mapper("")) + if "somefile" in files: + assert files == ["afiles/and/anothers", "somefile"] + else: + assert files == ["/somefile", "afiles/and/anothers"] @pytest.mark.xfail( @@ -14,6 +23,9 @@ reason="py35 error, see https://github.com/intake/filesystem_spec/issues/148", ) def test_ls(m): + m.mkdir("/dir") + m.mkdir("/dir/dir1") + m.touch("/dir/afile") m.touch("/dir/dir1/bfile") m.touch("/dir/dir1/cfile") @@ -24,3 +36,53 @@ assert m.ls("/dir", True)[1]["type"] == "directory" assert len(m.ls("/dir/dir1")) == 2 + + +def test_directories(m): + with pytest.raises(NotADirectoryError): + m.mkdir("outer/inner", create_parents=False) + m.mkdir("outer/inner") + + assert m.ls("outer") + assert m.ls("outer/inner") == [] + + with pytest.raises(OSError): + m.rmdir("outer") + + m.rmdir("outer/inner") + m.rmdir("outer") + + assert not m.store + + +def test_mv_recursive(m): + m.mkdir("src") + m.touch("src/file.txt") + m.mv("src", "dest", recursive=True) + assert m.exists("dest/file.txt") + assert not m.exists("src") + + +def test_rewind(m): + # https://github.com/intake/filesystem_spec/issues/349 + with m.open("src/file.txt", "w") as f: + f.write("content") + with m.open("src/file.txt") as f: + assert f.tell() == 0 + + +def test_no_rewind_append_mode(m): + # https://github.com/intake/filesystem_spec/issues/349 + with m.open("src/file.txt", "w") as f: + f.write("content") + with m.open("src/file.txt", "a") as f: + assert f.tell() == 7 + + +def test_moves(m): + m.touch("source.txt") + m.mv("source.txt", "target.txt") + + m.touch("source2.txt") + m.mv("source2.txt", "target2.txt", recursive=True) + assert m.find("") == ["target.txt", "target2.txt"] diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_sftp.py fsspec-0.8.4/fsspec/implementations/tests/test_sftp.py --- fsspec-0.6.1/fsspec/implementations/tests/test_sftp.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_sftp.py 2020-10-14 16:51:19.000000000 +0000 @@ -18,7 +18,7 @@ def ssh(): try: subprocess.check_call(["docker", "run", "hello-world"]) - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, 
FileNotFoundError): pytest.skip("docker run not available") return diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_smb.py fsspec-0.8.4/fsspec/implementations/tests/test_smb.py --- fsspec-0.6.1/fsspec/implementations/tests/test_smb.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_smb.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +Test SMBFileSystem class using a docker container +""" + +import logging +import shlex +import subprocess +import time +import pytest +import fsspec + +pytest.importorskip("smbprotocol") + +# ! pylint: disable=redefined-outer-name,missing-function-docstring + + +def stop_docker(container): + cmd = shlex.split('docker ps -a -q --filter "name=%s"' % container) + cid = subprocess.check_output(cmd).strip().decode() + if cid: + subprocess.call(["docker", "rm", "-f", "-v", cid]) + + +@pytest.fixture(scope="module") +def smb_params(): + try: + pchk = ["docker", "run", "--name", "fsspec_test_smb", "hello-world"] + subprocess.check_call(pchk) + except (subprocess.CalledProcessError, FileNotFoundError): + pytest.skip("docker run not available") + return + stop_docker("fsspec_test_smb") + + # requires docker + container = "fsspec_smb" + stop_docker(container) + img = "docker run --name {} --detach -p 139:139 -p 445:445 dperson/samba" + cfg = " -p -u 'testuser;testpass' -s 'home;/share;no;no;no;testuser'" + cmd = img.format(container) + cfg + cid = subprocess.check_output(shlex.split(cmd)).strip().decode() + logger = logging.getLogger("fsspec") + logger.debug("Container: %s", cid) + try: + time.sleep(1) + yield dict(host="localhost", port=445, username="testuser", password="testpass") + finally: + import smbclient # pylint: disable=import-outside-toplevel + + smbclient.reset_connection_cache() + stop_docker(container) + + +def test_simple(smb_params): + adir = "/home/adir" + adir2 = "/home/adir/otherdir/" + afile = "/home/adir/otherdir/afile" + fsmb = fsspec.get_filesystem_class("smb")(**smb_params) + fsmb.mkdirs(adir2) + fsmb.touch(afile) + assert fsmb.find(adir) == [afile] + assert fsmb.ls(adir2, detail=False) == [afile] + assert fsmb.info(afile)["type"] == "file" + assert fsmb.info(afile)["size"] == 0 + assert fsmb.exists(adir) + fsmb.rm(adir, recursive=True) + assert not fsmb.exists(adir) + + +def test_with_url(smb_params): + smb_url = "smb://{username}:{password}@{host}:{port}/home/someuser.txt" + fwo = fsspec.open(smb_url.format(**smb_params), "wb") + with fwo as fwr: + fwr.write(b"hello") + fro = fsspec.open(smb_url.format(**smb_params), "rb") + with fro as frd: + read_result = frd.read() + assert read_result == b"hello" + + +def test_transaction(smb_params): + afile = "/home/afolder/otherdir/afile" + afile2 = "/home/afolder/otherdir/afile2" + adir = "/home/afolder" + adir2 = "/home/afolder/otherdir" + fsmb = fsspec.get_filesystem_class("smb")(**smb_params) + fsmb.mkdirs(adir2) + fsmb.start_transaction() + fsmb.touch(afile) + assert fsmb.find(adir) == [] + fsmb.end_transaction() + assert fsmb.find(adir) == [afile] + + with fsmb.transaction: + assert fsmb._intrans + fsmb.touch(afile2) + assert fsmb.find(adir) == [afile] + assert fsmb.find(adir) == [afile, afile2] + + +def test_makedirs_exist_ok(smb_params): + fsmb = fsspec.get_filesystem_class("smb")(**smb_params) + fsmb.makedirs("/home/a/b/c") + fsmb.makedirs("/home/a/b/c", exist_ok=True) diff -Nru fsspec-0.6.1/fsspec/implementations/tests/test_zip.py fsspec-0.8.4/fsspec/implementations/tests/test_zip.py --- 
fsspec-0.6.1/fsspec/implementations/tests/test_zip.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/tests/test_zip.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,5 +1,6 @@ import zipfile from contextlib import contextmanager + import os import pickle import pytest @@ -28,14 +29,18 @@ def test_empty(): with tempzip() as z: - fs = fsspec.get_filesystem_class("zip")(fo=z) + fs = fsspec.filesystem("zip", fo=z) assert fs.find("") == [] + assert fs.find("", withdirs=True) == [] + with pytest.raises(FileNotFoundError): + fs.info("") + assert fs.ls("") == [] @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip-info odd on py35") def test_mapping(): with tempzip(data) as z: - fs = fsspec.get_filesystem_class("zip")(fo=z) + fs = fsspec.filesystem("zip", fo=z) m = fs.get_mapper("") assert list(m) == ["a", "b", "deeply/nested/path"] assert m["b"] == data["b"] @@ -44,6 +49,112 @@ @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip not supported on py35") def test_pickle(): with tempzip(data) as z: - fs = fsspec.get_filesystem_class("zip")(fo=z) + fs = fsspec.filesystem("zip", fo=z) fs2 = pickle.loads(pickle.dumps(fs)) assert fs2.cat("b") == b"hello" + + +def test_all_dirnames(): + with tempzip() as z: + fs = fsspec.filesystem("zip", fo=z) + + # fx are files, dx are a directories + assert fs._all_dirnames([]) == set() + assert fs._all_dirnames(["f1"]) == set() + assert fs._all_dirnames(["f1", "f2"]) == set() + assert fs._all_dirnames(["f1", "f2", "d1/f1"]) == {"d1"} + assert fs._all_dirnames(["f1", "d1/f1", "d1/f2"]) == {"d1"} + assert fs._all_dirnames(["f1", "d1/f1", "d2/f1"]) == {"d1", "d2"} + assert fs._all_dirnames(["d1/d1/d1/f1"]) == {"d1", "d1/d1", "d1/d1/d1"} + + +def test_ls(): + with tempzip(data) as z: + lhs = fsspec.filesystem("zip", fo=z) + + assert lhs.ls("") == ["a", "b", "deeply/"] + assert lhs.ls("/") == lhs.ls("") + + assert lhs.ls("deeply") == ["deeply/nested/"] + assert lhs.ls("deeply/") == lhs.ls("deeply") + + assert lhs.ls("deeply/nested") == ["deeply/nested/path"] + assert lhs.ls("deeply/nested/") == lhs.ls("deeply/nested") + + +def test_find(): + with tempzip(data) as z: + lhs = fsspec.filesystem("zip", fo=z) + + assert lhs.find("") == ["a", "b", "deeply/nested/path"] + assert lhs.find("", withdirs=True) == [ + "a", + "b", + "deeply/", + "deeply/nested/", + "deeply/nested/path", + ] + + assert lhs.find("deeply") == ["deeply/nested/path"] + assert lhs.find("deeply/") == lhs.find("deeply") + + +def test_walk(): + with tempzip(data) as z: + fs = fsspec.filesystem("zip", fo=z) + expected = [ + # (dirname, list of subdirs, list of files) + ("", ["deeply"], ["a", "b"]), + ("deeply", ["nested"], []), + ("deeply/nested", [], ["path"]), + ] + assert list(fs.walk("")) == expected + + +def test_info(): + with tempzip(data) as z: + fs_cache = fsspec.filesystem("zip", fo=z) + + with pytest.raises(FileNotFoundError): + fs_cache.info("i-do-not-exist") + + # Iterate over all directories + # The ZipFile does not include additional information about the directories, + for d in fs_cache._all_dirnames(data.keys()): + lhs = fs_cache.info(d) + expected = {"name": f"{d}/", "size": 0, "type": "directory"} + assert lhs == expected + + # Iterate over all files + for f, v in data.items(): + lhs = fs_cache.info(f) + assert lhs["name"] == f + assert lhs["size"] == len(v) + assert lhs["type"] == "file" + + # There are many flags specific to Zip Files. 
+ # These are two we can use to check we are getting some of them + assert "CRC" in lhs + assert "compress_size" in lhs + + +@pytest.mark.parametrize("scale", [128, 512, 4096]) +def test_isdir_isfile(scale): + def make_nested_dir(i): + x = f"{i}" + table = x.maketrans("0123456789", "ABCDEFGHIJ") + return "/".join(x.translate(table)) + + scaled_data = {f"{make_nested_dir(i)}/{i}": b"" for i in range(1, scale + 1)} + with tempzip(scaled_data) as z: + fs = fsspec.filesystem("zip", fo=z) + + lhs_dirs, lhs_files = fs._all_dirnames(scaled_data.keys()), scaled_data.keys() + + # Warm-up the Cache, this is done in both cases anyways... + fs._get_dirs() + + entries = lhs_files | lhs_dirs + + assert lhs_dirs == {e for e in entries if fs.isdir(e)} + assert lhs_files == {e for e in entries if fs.isfile(e)} diff -Nru fsspec-0.6.1/fsspec/implementations/webhdfs.py fsspec-0.8.4/fsspec/implementations/webhdfs.py --- fsspec-0.6.1/fsspec/implementations/webhdfs.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/webhdfs.py 2020-10-14 16:51:19.000000000 +0000 @@ -12,7 +12,7 @@ class WebHDFS(AbstractFileSystem): """ - Interface to HDFS over HTTP + Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways. Three auth mechanisms are supported: @@ -43,6 +43,7 @@ proxy_to=None, kerb_kwargs=None, data_proxy=None, + use_https=False, **kwargs ): """ @@ -74,12 +75,16 @@ maps host names `host->data_proxy[host]`; if a callable, full URLs are passed, and function must conform to `url->data_proxy(url)`. + use_https: bool + Whether to connect to the Name-node using HTTPS instead of HTTP kwargs """ if self._cached: return super().__init__(**kwargs) - self.url = "http://{host}:{port}/webhdfs/v1".format(host=host, port=port) + self.url = "{protocol}://{host}:{port}/webhdfs/v1".format( + protocol="https" if use_https else "http", host=host, port=port + ) self.kerb = kerberos self.kerb_kwargs = kerb_kwargs or {} self.pars = {} @@ -123,12 +128,22 @@ data=data, allow_redirects=redirect, ) - if out.status_code == 404: - raise FileNotFoundError(path) - if out.status_code == 403: - raise PermissionError(path or "") - if out.status_code == 401: - raise PermissionError # not specific to path + if out.status_code in [400, 401, 403, 404, 500]: + try: + err = out.json() + msg = err["RemoteException"]["message"] + exp = err["RemoteException"]["exception"] + except (ValueError, KeyError): + pass + else: + if exp in ["IllegalArgumentException", "UnsupportedOperationException"]: + raise ValueError(msg) + elif exp in ["SecurityException", "AccessControlException"]: + raise PermissionError(msg) + elif exp in ["FileNotFoundException"]: + raise FileNotFoundError(msg) + else: + raise RuntimeError(msg) out.raise_for_status() return out @@ -221,10 +236,14 @@ def ukey(self, path): """Checksum info of file, giving method and result""" out = self._call("GETFILECHECKSUM", path=path, redirect=False) - location = self._apply_proxy(out.headers["Location"]) - out2 = self.session.get(location) - out2.raise_for_status() - return out2.json()["FileChecksum"] + if "Location" in out.headers: + location = self._apply_proxy(out.headers["Location"]) + out2 = self.session.get(location) + out2.raise_for_status() + return out2.json()["FileChecksum"] + else: + out.raise_for_status() + return out.json()["FileChecksum"] def home_directory(self): """Get user's home directory""" @@ -339,7 +358,7 @@ self.path = "/".join([tempdir, str(uuid.uuid4())]) def _upload_chunk(self, final=False): - """ Write one part of a multi-block 
file upload + """Write one part of a multi-block file upload Parameters ========== @@ -347,35 +366,47 @@ This is the last block, so should complete file, if self.autocommit is True. """ - out = self.fs.session.post(self.location, data=self.buffer.getvalue()) + out = self.fs.session.post( + self.location, + data=self.buffer.getvalue(), + headers={"content-type": "application/octet-stream"}, + ) out.raise_for_status() return True def _initiate_upload(self): """ Create remote file/upload """ + kwargs = self.kwargs.copy() if "a" in self.mode: op, method = "APPEND", "POST" else: op, method = "CREATE", "PUT" - if self.fs.exists(self.path): - # no "truncate" or "create empty" - self.fs.rm(self.path) - out = self.fs._call(op, method, self.path, redirect=False, **self.kwargs) + kwargs["overwrite"] = "true" + out = self.fs._call(op, method, self.path, redirect=False, **kwargs) location = self.fs._apply_proxy(out.headers["Location"]) if "w" in self.mode: # create empty file to append to - out2 = self.fs.session.put(location) + out2 = self.fs.session.put( + location, headers={"content-type": "application/octet-stream"} + ) out2.raise_for_status() self.location = location.replace("CREATE", "APPEND") def _fetch_range(self, start, end): + start = max(start, 0) + end = min(self.size, end) + if start >= end or start >= self.size: + return b"" out = self.fs._call( "OPEN", path=self.path, offset=start, length=end - start, redirect=False ) out.raise_for_status() - location = out.headers["Location"] - out2 = self.fs.session.get(self.fs._apply_proxy(location)) - return out2.content + if "Location" in out.headers: + location = out.headers["Location"] + out2 = self.fs.session.get(self.fs._apply_proxy(location)) + return out2.content + else: + return out.content def commit(self): self.fs.mv(self.path, self.target) diff -Nru fsspec-0.6.1/fsspec/implementations/zip.py fsspec-0.8.4/fsspec/implementations/zip.py --- fsspec-0.6.1/fsspec/implementations/zip.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/implementations/zip.py 2020-10-14 16:51:19.000000000 +0000 @@ -15,7 +15,15 @@ root_marker = "" - def __init__(self, fo="", mode="r", **storage_options): + def __init__( + self, + fo="", + mode="r", + target_protocol=None, + target_options=None, + block_size=DEFAULT_BLOCK_SIZE, + **kwargs + ): """ Parameters ---------- @@ -24,18 +32,18 @@ `open_files()`, which must return one file exactly. mode: str Currently, only 'r' accepted - storage_options: key-value - May be credentials, e.g., `{'auth': ('username', 'pword')}` or any - other parameters for requests + target_protocol: str (optional) + If ``fo`` is a string, this value can be used to override the + FS protocol inferred from a URL + target_options: dict (optional) + Kwargs passed when instantiating the target FS, if ``fo`` is + a string. 
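Taken together, the new arguments let the archive itself live on another fsspec filesystem; a minimal sketch, with an invented archive URL and options::

    import fsspec

    # the archive URL and the target_options passed through are placeholders
    fs = fsspec.filesystem(
        "zip",
        fo="https://example.com/data/archive.zip",
        target_protocol="https",
        target_options={"block_size": 2 ** 20},
    )
    print(fs.ls(""))                     # top-level entries; directories end in "/"
    print(fs.find("", withdirs=True))    # files plus the synthesised directory entries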
""" - if self._cached: - return - AbstractFileSystem.__init__(self) + super().__init__(self, **kwargs) if mode != "r": raise ValueError("Only read from zip files accepted") - self.in_fo = fo if isinstance(fo, str): - files = open_files(fo) + files = open_files(fo, protocol=target_protocol, **(target_options or {})) if len(files) != 1: raise ValueError( 'Path "{}" did not resolve to exactly' @@ -44,7 +52,7 @@ fo = files[0] self.fo = fo.__enter__() # the whole instance is a context self.zip = zipfile.ZipFile(self.fo) - self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE) + self.block_size = block_size self.dir_cache = None @classmethod @@ -55,7 +63,10 @@ def _get_dirs(self): if self.dir_cache is None: files = self.zip.infolist() - self.dir_cache = {} + self.dir_cache = { + dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} + for dirname in self._all_dirnames(self.zip.namelist()) + } for z in files: f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} f.update( @@ -67,7 +78,17 @@ ) self.dir_cache[f["name"]] = f - def ls(self, path, detail=False): + def info(self, path, **kwargs): + self._get_dirs() + path = self._strip_protocol(path) + if path in self.dir_cache: + return self.dir_cache[path] + elif path + "/" in self.dir_cache: + return self.dir_cache[path + "/"] + else: + raise FileNotFoundError(path) + + def ls(self, path, detail=False, **kwargs): self._get_dirs() paths = {} for p, f in self.dir_cache.items(): @@ -78,15 +99,6 @@ root = "" if root == path.rstrip("/"): paths[p] = f - elif path and all( - (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) - ): - # implicit directory - ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) - if ppath not in paths: - out = {"name": ppath + "/", "size": 0, "type": "directory"} - paths[ppath] = out - elif all( (a == b) for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) @@ -124,4 +136,17 @@ return out def ukey(self, path): - return tokenize(path, self.in_fo, self.protocol) + return tokenize(path, self.fo, self.protocol) + + def _all_dirnames(self, paths): + """Returns *all* directory names for each path in paths, including intermediate ones. + + Parameters + ---------- + paths: Iterable of path strings + """ + if len(paths) == 0: + return set() + + dirnames = {self._parent(path) for path in paths} - {self.root_marker} + return dirnames | self._all_dirnames(dirnames) diff -Nru fsspec-0.6.1/fsspec/__init__.py fsspec-0.8.4/fsspec/__init__.py --- fsspec-0.6.1/fsspec/__init__.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/__init__.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,9 +1,14 @@ from ._version import get_versions from .spec import AbstractFileSystem -from .registry import get_filesystem_class, registry, filesystem +from .registry import ( + get_filesystem_class, + registry, + filesystem, + register_implementation, +) from .mapping import FSMap, get_mapper -from .core import open_files, get_fs_token_paths, open +from .core import open_files, get_fs_token_paths, open, open_local from . 
import caching __version__ = get_versions()["version"] @@ -14,11 +19,13 @@ "AbstractFileSystem", "FSMap", "filesystem", + "register_implementation", "get_filesystem_class", "get_fs_token_paths", "get_mapper", "open", "open_files", + "open_local", "registry", "caching", ] diff -Nru fsspec-0.6.1/fsspec/mapping.py fsspec-0.8.4/fsspec/mapping.py --- fsspec-0.6.1/fsspec/mapping.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/mapping.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,6 +1,6 @@ +import array from collections.abc import MutableMapping -from .registry import get_filesystem_class -from .core import split_protocol +from .core import url_to_fs class FSMap(MutableMapping): @@ -31,11 +31,18 @@ b'Hello World' """ - def __init__(self, root, fs, check=False, create=False): + def __init__(self, root, fs, check=False, create=False, missing_exceptions=None): self.fs = fs self.root = fs._strip_protocol(root).rstrip( "/" ) # we join on '/' in _key_to_str + if missing_exceptions is None: + missing_exceptions = ( + FileNotFoundError, + IsADirectoryError, + NotADirectoryError, + ) + self.missing_exceptions = missing_exceptions if create: if not self.fs.exists(root): self.fs.mkdir(root) @@ -49,14 +56,63 @@ self.fs.rm(root + "/a") def clear(self): - """Remove all keys below root - empties out mapping - """ + """Remove all keys below root - empties out mapping""" try: self.fs.rm(self.root, True) self.fs.mkdir(self.root) except: # noqa: E722 pass + def getitems(self, keys, on_error="raise"): + """Fetch multiple items from the store + + If the backend is async-able, this might proceed concurrently + + Parameters + ---------- + keys: list(str) + They keys to be fetched + on_error : "raise", "omit", "return" + If raise, an underlying exception will be raised (converted to KeyError + if the type is in self.missing_exceptions); if omit, keys with exception + will simply not be included in the output; if "return", all keys are + included in the output, but the value will be bytes or an exception + instance. 
+ + Returns + ------- + dict(key, bytes|exception) + """ + keys2 = [self._key_to_str(k) for k in keys] + oe = on_error if on_error == "raise" else "return" + try: + out = self.fs.cat(keys2, on_error=oe) + except self.missing_exceptions as e: + raise KeyError from e + out = { + k: (KeyError() if isinstance(v, self.missing_exceptions) else v) + for k, v in out.items() + } + return { + key: out[k2] + for key, k2 in zip(keys, keys2) + if on_error == "return" or not isinstance(out[k2], BaseException) + } + + def setitems(self, values_dict): + """Set the values of multiple items in the store + + Parameters + ---------- + values_dict: dict(str, bytes) + """ + values = {self._key_to_str(k): v for k, v in values_dict.items()} + self.fs.pipe(values) + + def delitems(self, keys): + """Remove multiple keys from the store""" + self.fs.rm([self._key_to_str(k) for k in keys]) + def _key_to_str(self, key): """Generate full path for the key""" if isinstance(key, (tuple, list)): @@ -71,10 +127,10 @@ def __getitem__(self, key, default=None): """Retrieve data""" - key = self._key_to_str(key) + k = self._key_to_str(key) try: - result = self.fs.cat(key) - except: # noqa: E722 + result = self.fs.cat(k) + except self.missing_exceptions: if default is not None: return default raise KeyError(key) @@ -91,9 +147,11 @@ def __setitem__(self, key, value): """Store value in key""" key = self._key_to_str(key) + if isinstance(value, array.array): # pragma: no cover + # back compat, array.array used to work + value = bytearray(value) self.fs.mkdirs(self.fs._parent(key), exist_ok=True) - with self.fs.open(key, "wb") as f: - f.write(value) + self.fs.pipe_file(key, value) def __iter__(self): return (self._str_to_key(x) for x in self.fs.find(self.root)) @@ -110,26 +168,22 @@ def __contains__(self, key): """Does key exist in mapping?""" - return self.fs.exists(self._key_to_str(key)) + path = self._key_to_str(key) + return self.fs.exists(path) and self.fs.isfile(path) - def __getstate__(self): - """Mapping should be pickleable""" - # TODO: replace with reduce to reinstantiate? - return self.fs, self.root - - def __setstate__(self, state): - fs, root = state - self.fs = fs - self.root = root + def __reduce__(self): + return FSMap, (self.root, self.fs, False, False, self.missing_exceptions) -def get_mapper(url, check=False, create=False, **kwargs): +def get_mapper(url, check=False, create=False, missing_exceptions=None, **kwargs): """Create key-value interface for given URL and options The URL will be of the form "protocol://location" and point to the root of the mapper required. All keys will be file-names below this location, and their values the contents of each key. + Also accepts compound URLs like zip::s3://bucket/file.zip , see ``fsspec.open``. + Parameters ---------- url: str @@ -140,13 +194,15 @@ create: bool Whether to make the directory corresponding to the root before instantiating + missing_exceptions: None or tuple + If given, these excpetion types will be regarded as missing keys and + return KeyError when trying to read data. By default, you get + (FileNotFoundError, IsADirectoryError, NotADirectoryError) Returns ------- ``FSMap`` instance, the dict-like key-value store. 
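A minimal sketch of the bulk mapper methods documented above (``setitems``/``getitems``/``delitems``), using the in-memory filesystem purely for illustration.

import fsspec

m = fsspec.get_mapper("memory://")
m.setitems({"a": b"1", "b": b"2"})                 # write several keys at once
assert m.getitems(["a", "b"]) == {"a": b"1", "b": b"2"}
print(m.getitems(["a", "nope"], on_error="omit"))  # missing key is dropped
m.delitems(["a", "b"])                             # bulk delete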
""" - protocol, path = split_protocol(url) - cls = get_filesystem_class(protocol) - fs = cls(**kwargs) # Removing protocol here - could defer to each open() on the backend - return FSMap(url, fs, check, create) + fs, urlpath = url_to_fs(url, **kwargs) + return FSMap(urlpath, fs, check, create, missing_exceptions=missing_exceptions) diff -Nru fsspec-0.6.1/fsspec/registry.py fsspec-0.8.4/fsspec/registry.py --- fsspec-0.6.1/fsspec/registry.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/registry.py 2020-10-14 16:51:19.000000000 +0000 @@ -4,21 +4,104 @@ __all__ = ["registry", "get_filesystem_class", "default"] # mapping protocol: implementation class object -registry = {} +_registry = {} # internal, mutable + + +class ReadOnlyError(TypeError): + pass + + +class ReadOnlyRegistry(dict): + """Dict-like registry, but immutable + + Maps backend name to implementation class + + To add backend implementations, use ``register_implementation`` + """ + + def __init__(self, target): + self.target = target + + def __getitem__(self, item): + return self.target[item] + + def __delitem__(self, key): + raise ReadOnlyError + + def __setitem__(self, key, value): + raise ReadOnlyError + + def clear(self): + raise ReadOnlyError + + def __contains__(self, item): + return item in self.target + + def __iter__(self): + yield from self.target + + +def register_implementation(name, cls, clobber=True, errtxt=None): + """Add implementation class to the registry + + Parameters + ---------- + name: str + Protocol name to associate with the class + cls: class or str + if a class: fsspec-compliant implementation class (normally inherits from + ``fsspec.AbstractFileSystem``, gets added straight to the registry. If a + str, the full path to an implementation class like package.module.class, + which gets added to known_implementations, + so the import is deferred until the filesystem is actually used. + clobber: bool (optional) + Whether to overwrite a protocol with the same name; if False, will raise + instead. + errtxt: str (optional) + If given, then a failure to import the given class will result in this + text being given. + """ + if isinstance(cls, str): + if name in known_implementations and clobber is False: + raise ValueError( + "Name (%s) already in the known_implementations and clobber " + "is False" % name + ) + known_implementations[name] = { + "class": cls, + "err": errtxt or "%s import failed for protocol %s" % (cls, name), + } + + else: + if name in registry and clobber is False: + raise ValueError( + "Name (%s) already in the registry and clobber is False" % name + ) + _registry[name] = cls + + +registry = ReadOnlyRegistry(_registry) default = "file" # protocols mapped to the class which implements them. This dict can -# be dynamically updated. 
+# updated with register_implementation known_implementations = { "file": {"class": "fsspec.implementations.local.LocalFileSystem"}, "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"}, + "dropbox": { + "class": "dropboxdrivefs.DropboxDriveFileSystem", + "err": ( + 'DropboxFileSystem requires "dropboxdrivefs",' + '"requests" and "dropbox" to be installed' + ), + }, "http": { "class": "fsspec.implementations.http.HTTPFileSystem", - "err": 'HTTPFileSystem requires "requests" to be installed', + "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed', }, "https": { "class": "fsspec.implementations.http.HTTPFileSystem", - "err": 'HTTPFileSystem requires "requests" to be installed', + "err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed', }, "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"}, "gcs": { @@ -29,6 +112,10 @@ "class": "gcsfs.GCSFileSystem", "err": "Please install gcsfs to access Google Storage", }, + "gdrive": { + "class": "gdrivefs.GoogleDriveFileSystem", + "err": "Please install gdrivefs for access to Google Drive", + }, "sftp": { "class": "fsspec.implementations.sftp.SFTPFileSystem", "err": 'SFTPFileSystem requires "paramiko" to be installed', @@ -47,13 +134,46 @@ "err": 'webHDFS access requires "requests" to be installed', }, "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, + "adl": { + "class": "adlfs.AzureDatalakeFileSystem", + "err": "Install adlfs to access Azure Datalake Gen1", + }, + "abfs": { + "class": "adlfs.AzureBlobFileSystem", + "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage", + }, + "az": { + "class": "adlfs.AzureBlobFileSystem", + "err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage", + }, "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"}, "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"}, "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"}, + "simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"}, "dask": { "class": "fsspec.implementations.dask.DaskWorkerFileSystem", "err": "Install dask distributed to access worker file system", }, + "github": { + "class": "fsspec.implementations.github.GithubFileSystem", + "err": "Install the requests package to use the github FS", + }, + "git": { + "class": "fsspec.implementations.git.GitFileSystem", + "err": "Install pygit2 to browse local git repos", + }, + "smb": { + "class": "fsspec.implementations.smb.SMBFileSystem", + "err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed', + }, + "jupyter": { + "class": "fsspec.implementations.jupyter.JupyterFileSystem", + "err": "Jupyter FS requires requests to be installed", + }, + "jlab": { + "class": "fsspec.implementations.jupyter.JupyterFileSystem", + "err": "Jupyter FS requires requests to be installed", + }, } minversions = {"s3fs": LooseVersion("0.3.0"), "gcsfs": LooseVersion("0.3.0")} @@ -71,34 +191,17 @@ import may fail. In this case, the string in the "err" field of the ``known_implementations`` will be given as the error message. 
""" - if protocol is None: + if not protocol: protocol = default if protocol not in registry: if protocol not in known_implementations: raise ValueError("Protocol not known: %s" % protocol) bit = known_implementations[protocol] - mod, name = bit["class"].rsplit(".", 1) - minversion = minversions.get(mod, None) - err = None try: - mod = importlib.import_module(mod) - except ImportError: - err = ImportError(bit["err"]) - - except Exception as e: - err = e - if err is not None: - raise RuntimeError(str(err)) - - if minversion: - version = getattr(mod, "__version__", None) - if version and LooseVersion(version) < minversion: - raise RuntimeError( - "'{}={}' is installed, but version '{}' or " - "higher is required".format(mod.__name__, version, minversion) - ) - registry[protocol] = getattr(mod, name) + register_implementation(protocol, _import_class(bit["class"])) + except ImportError as e: + raise ImportError(bit["err"]) from e cls = registry[protocol] if getattr(cls, "protocol", None) in ("abstract", None): cls.protocol = protocol @@ -106,6 +209,22 @@ return cls +def _import_class(cls, minv=None): + mod, name = cls.rsplit(".", 1) + minv = minv or minversions + minversion = minv.get(mod, None) + + mod = importlib.import_module(mod) + if minversion: + version = getattr(mod, "__version__", None) + if version and LooseVersion(version) < minversion: + raise RuntimeError( + "'{}={}' is installed, but version '{}' or " + "higher is required".format(mod.__name__, version, minversion) + ) + return getattr(mod, name) + + def filesystem(protocol, **storage_options): """Instantiate filesystems for given protocol and arguments diff -Nru fsspec-0.6.1/fsspec/spec.py fsspec-0.8.4/fsspec/spec.py --- fsspec-0.6.1/fsspec/spec.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/spec.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,11 +1,15 @@ -import warnings -from hashlib import md5 import io -import os import logging +import os +import warnings +from distutils.version import LooseVersion +from errno import ESPIPE +from hashlib import sha256 +from glob import has_magic +from .dircache import DirCache from .transaction import Transaction -from .utils import read_block, tokenize, stringify_path +from .utils import read_block, tokenize, stringify_path, other_paths logger = logging.getLogger("fsspec") @@ -31,22 +35,24 @@ be made for a filesystem instance to be garbage collected. """ - cachable = True - _extra_tokenize_attributes = () - def __init__(cls, *args, **kwargs): super().__init__(*args, **kwargs) # Note: we intentionally create a reference here, to avoid garbage # collecting instances when all other references are gone. To really # delete a FileSystem, the cache must be cleared. 
cls._cache = {} + cls._pid = os.getpid() def __call__(cls, *args, **kwargs): extra_tokens = tuple( getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes ) token = tokenize(cls, *args, *extra_tokens, **kwargs) - if cls.cachable and token in cls._cache: + skip = kwargs.pop("skip_instance_cache", False) + if os.getpid() != cls._pid: + cls._cache.clear() + cls._pid = os.getpid() + if not skip and cls.cachable and token in cls._cache: return cls._cache[token] else: obj = super().__call__(*args, **kwargs) @@ -54,18 +60,26 @@ obj._fs_token_ = token obj.storage_args = args obj.storage_options = kwargs + if obj.async_impl: + from .asyn import mirror_sync_methods + + mirror_sync_methods(obj) - if cls.cachable: + if cls.cachable and not skip: cls._cache[token] = obj return obj try: # optionally derive from pyarrow's FileSystem, if available import pyarrow as pa - - up = pa.filesystem.DaskFileSystem except ImportError: up = object +else: + # only derive from the legacy pyarrow's FileSystem for older pyarrow versions + if LooseVersion(pa.__version__) < LooseVersion("2.0"): + up = pa.filesystem.DaskFileSystem + else: + up = object class AbstractFileSystem(up, metaclass=_Cached): @@ -81,6 +95,7 @@ blocksize = 2 ** 22 sep = "/" protocol = "abstract" + async_impl = False root_marker = "" # For some FSs, may require leading '/' or other character #: Extra *class attributes* that should be considered when hashing. @@ -97,9 +112,18 @@ Subclasses should call this method. - Magic kwargs that affect functionality here: - add_docs: if True, will append docstrings from this spec to the - specific implementation + Parameters + ---------- + use_listings_cache, listings_expiry_time, max_paths: + passed to ``DirCache``, if the implementation supports + directory listing caching. Pass use_listings_cache=False + to disable such caching. + skip_instance_cache: bool + If this is a cachable implementation, pass True here to force + creating a new instance even if a matching instance exists, and prevent + storing this instance. + asynchronous: bool + loop: asyncio-compatible IOLoop or None """ if self._cached: # reusing instance, don't change @@ -107,7 +131,8 @@ self._cached = True self._intrans = False self._transaction = None - self.dircache = {} + self._invalidated_caches_in_transaction = [] + self.dircache = DirCache(**storage_options) if storage_options.pop("add_docs", None): warnings.warn("add_docs is no longer supported.", FutureWarning) @@ -130,25 +155,30 @@ def __eq__(self, other): return isinstance(other, type(self)) and self._fs_token == other._fs_token + def __reduce__(self): + return make_instance, (type(self), self.storage_args, self.storage_options) + @classmethod def _strip_protocol(cls, path): - """ Turn path from fully-qualified to file-system-specific + """Turn path from fully-qualified to file-system-specific May require FS-specific handling, e.g., for relative paths or links. 
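A minimal sketch of the instance cache and the new ``skip_instance_cache`` flag described above.

import fsspec

fs1 = fsspec.filesystem("memory")
fs2 = fsspec.filesystem("memory")
fs3 = fsspec.filesystem("memory", skip_instance_cache=True)
assert fs1 is fs2        # identical arguments reuse the cached instance
assert fs3 is not fs1    # a fresh instance, never stored in the cache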
""" + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] path = stringify_path(path) protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol for protocol in protos: - path = path.rstrip("/") if path.startswith(protocol + "://"): path = path[len(protocol) + 3 :] - elif path.startswith(protocol + ":"): - path = path[len(protocol) + 1 :] + elif path.startswith(protocol + "::"): + path = path[len(protocol) + 2 :] + path = path.rstrip("/") # use of root_marker to make minimum required path, e.g., "/" return path or cls.root_marker @staticmethod - def _get_kwargs_from_urls(paths): + def _get_kwargs_from_urls(path): """If kwargs can be encoded in the paths, extract them here This should happen before instantiation of the class; incoming paths @@ -162,7 +192,7 @@ @classmethod def current(cls): - """ Return the most recently created FileSystem + """Return the most recently created FileSystem If no instance has been created, then create one with defaults """ @@ -192,6 +222,10 @@ """Finish write transaction, non-context version""" self.transaction.complete() self._transaction = None + # The invalid cache must be cleared after the transcation is completed. + for path in self._invalidated_caches_in_transaction: + self.invalidate_cache(path) + self._invalidated_caches_in_transaction.clear() def invalidate_cache(self, path=None): """ @@ -203,7 +237,12 @@ If None, clear all listings cached else listings at or under given path. """ - pass # not necessary to implement, may have no cache + # Not necessary to implement invalidation mechanism, may have no cache. + # But if have, you should call this method of parent class from your + # subclass to ensure expiring caches after transacations correctly. + # See the implementaion of FTPFileSystem in ftp.py + if self._intrans: + self._invalidated_caches_in_transaction.append(path) def mkdir(self, path, create_parents=True, **kwargs): """ @@ -253,6 +292,7 @@ The specific keys, or perhaps a FileInfo class, or similar, is TBD, but must be consistent across implementations. Must include: + - full path to the entry (without protocol) - size of the entry, in bytes. If the value cannot be determined, will be ``None``. @@ -290,17 +330,21 @@ but contains nothing), None if not in cache. """ parent = self._parent(path) - if path in self.dircache: + try: return self.dircache[path] - elif parent in self.dircache: + except KeyError: + pass + try: files = [f for f in self.dircache[parent] if f["name"] == path] if len(files) == 0: # parent dir was listed but did not contain this file raise FileNotFoundError(path) return files + except KeyError: + pass def walk(self, path, maxdepth=None, **kwargs): - """ Return all files belows path + """Return all files belows path List all files, recursing into subdirectories; output is iterator-style, like ``os.walk()``. For a simple list of files, ``find()`` is available. 
@@ -318,10 +362,11 @@ kwargs: passed to ``ls`` """ path = self._strip_protocol(path) - full_dirs = [] - dirs = [] - files = [] + full_dirs = {} + dirs = {} + files = {} + detail = kwargs.pop("detail", False) try: listing = self.ls(path, detail=True, **kwargs) except (FileNotFoundError, IOError): @@ -330,26 +375,30 @@ for info in listing: # each info name must be at least [path]/part , but here # we check also for names like [path]/part/ - name = info["name"].rstrip("/") - if info["type"] == "directory" and name != path: + pathname = info["name"].rstrip("/") + name = pathname.rsplit("/", 1)[-1] + if info["type"] == "directory" and pathname != path: # do not include "self" path - full_dirs.append(name) - dirs.append(name.rsplit("/", 1)[-1]) - elif name == path: + full_dirs[pathname] = info + dirs[name] = info + elif pathname == path: # file-like with same name as give path - files.append("") + files[""] = info else: - files.append(name.rsplit("/", 1)[-1]) - yield path, dirs, files + files[name] = info + + if detail: + yield path, dirs, files + else: + yield path, list(dirs), list(files) + + if maxdepth is not None: + maxdepth -= 1 + if maxdepth < 1: + return for d in full_dirs: - if maxdepth is None or maxdepth > 1: - for res in self.walk( - d, - maxdepth=(maxdepth - 1) if maxdepth is not None else None, - **kwargs - ): - yield res + yield from self.walk(d, maxdepth=maxdepth, detail=detail, **kwargs) def find(self, path, maxdepth=None, withdirs=False, **kwargs): """List all files below path. @@ -367,18 +416,22 @@ kwargs are passed to ``ls``. """ # TODO: allow equivalent of -name parameter - out = set() - for path, dirs, files in self.walk(path, maxdepth, **kwargs): + path = self._strip_protocol(path) + out = dict() + detail = kwargs.pop("detail", False) + for path, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs): if withdirs: - files += dirs - for name in files: - if name and name not in out: - out.add("/".join([path.rstrip("/"), name]) if path else name) + files.update(dirs) + out.update({info["name"]: info for name, info in files.items()}) if self.isfile(path) and path not in out: # walk works on directories, but find should also return [path] # when path happens to be a file - out.add(path) - return sorted(out) + out[path] = {} + names = sorted(out) + if not detail: + return names + else: + return {name: out[name] for name in names} def du(self, path, total=True, maxdepth=None, **kwargs): """Space used by files within a path @@ -414,12 +467,15 @@ the same as ``ls(path)``, returning only files. We support ``"**"``, - ``"?"`` and ``"[..]"``. + ``"?"`` and ``"[..]"``. We do not support ^ for pattern negation. + + Search path names that contain embedded characters special to this + implementation of glob may not produce expected results; + e.g., 'foo/bar/*starredfilename*'. kwargs are passed to ``ls``. 
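A minimal sketch of the ``detail=`` mode that ``walk``, ``find`` and ``glob`` gain above, again on the in-memory filesystem.

import fsspec

fs = fsspec.filesystem("memory")
fs.pipe({"/data/a.csv": b"1", "/data/sub/b.csv": b"2"})
print(fs.find("/data"))                     # ['/data/a.csv', '/data/sub/b.csv']
print(fs.glob("/data/*.csv", detail=True))  # {'/data/a.csv': {...info dict...}}
for root, dirs, files in fs.walk("/data", detail=True):
    # dirs and files are now dicts mapping name -> info record
    print(root, list(dirs), list(files))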
""" import re - from glob import has_magic ends = path.endswith("/") path = self._strip_protocol(path) @@ -429,23 +485,36 @@ ind = min(indstar, indques, indbrace) + detail = kwargs.pop("detail", False) + if not has_magic(path): root = path depth = 1 if ends: path += "/*" elif self.exists(path): - return [path] + if not detail: + return [path] + else: + return {path: self.info(path)} else: - return [] # glob of non-existent returns empty + if not detail: + return [] # glob of non-existent returns empty + else: + return {} elif "/" in path[:ind]: ind2 = path[:ind].rindex("/") root = path[: ind2 + 1] - depth = 20 if "**" in path else path[ind2 + 1 :].count("/") + 1 + depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1 else: root = "" - depth = 20 if "**" in path else 1 - allpaths = self.find(root, maxdepth=depth, withdirs=True, **kwargs) + depth = None if "**" in path else 1 + + allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs) + # Escape characters special to python regex, leaving our supported + # special characters in place. + # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html + # for shell globbing details. pattern = ( "^" + ( @@ -456,6 +525,10 @@ .replace("(", r"\(") .replace(")", r"\)") .replace("|", r"\|") + .replace("^", r"\^") + .replace("$", r"\$") + .replace("{", r"\{") + .replace("}", r"\}") .rstrip("/") .replace("?", ".") ) @@ -464,8 +537,15 @@ pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern) pattern = re.sub("[*]", "[^/]*", pattern) pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*")) - out = {p for p in allpaths if pattern.match(p.replace("//", "/").rstrip("/"))} - return list(sorted(out)) + out = { + p: allpaths[p] + for p in sorted(allpaths) + if pattern.match(p.replace("//", "/").rstrip("/")) + } + if detail: + return out + else: + return list(out) def exists(self, path): """Is there a file at the given path""" @@ -531,7 +611,7 @@ """Is this entry directory-like?""" try: return self.info(path)["type"] == "directory" - except FileNotFoundError: + except IOError: return False def isfile(self, path): @@ -541,30 +621,73 @@ except: # noqa: E722 return False - def cat(self, path): + def cat_file(self, path): """ Get the content of a file """ return self.open(path, "rb").read() - def get(self, rpath, lpath, recursive=False, **kwargs): - """Copy file to local. + def pipe_file(self, path, value, **kwargs): + """Set the bytes of given file""" + with self.open(path, "wb") as f: + f.write(value) - Possible extension: maybe should be able to copy to any file-system - (streaming through local). - """ - rpath = self._strip_protocol(rpath) - if recursive: - rpaths = self.find(rpath) - lpaths = [ - os.path.join(lpath, path[len(rpath) :].lstrip("/")) for path in rpaths - ] - for lpath in lpaths: - dirname = os.path.dirname(lpath) - if not os.path.isdir(dirname): - os.makedirs(dirname) + def pipe(self, path, value=None, **kwargs): + """Put value into path + + (counterpart to ``cat``) + Parameters + ---------- + path: string or dict(str, bytes) + If a string, a single remote location to put ``value`` bytes; if a dict, + a mapping of {path: bytesvalue}. + value: bytes, optional + If using a single path, these are the bytes to put there. 
Ignored if + ``path`` is a dict + """ + if isinstance(path, str): + self.pipe_file(self._strip_protocol(path), value, **kwargs) + elif isinstance(path, dict): + for k, v in path.items(): + self.pipe_file(self._strip_protocol(k), v, **kwargs) + else: + raise ValueError("path must be str or dict") + + def cat(self, path, recursive=False, on_error="raise", **kwargs): + """Fetch (potentially multiple) paths' contents + + Returns a dict of {path: contents} if there are multiple paths + or the path has been otherwise expanded + + on_error : "raise", "omit", "return" + If raise, an underlying exception will be raised (converted to KeyError + if the type is in self.missing_exceptions); if omit, keys with exception + will simply not be included in the output; if "return", all keys are + included in the output, but the value will be bytes or an exception + instance. + """ + paths = self.expand_path(path, recursive=recursive) + if ( + len(paths) > 1 + or isinstance(path, list) + or paths[0] != self._strip_protocol(path) + ): + out = {} + for path in paths: + try: + out[path] = self.cat_file(path, **kwargs) + except Exception as e: + if on_error == "raise": + raise + if on_error == "return": + out[path] = e + return out + else: + return self.cat_file(paths[0]) + + def get_file(self, rpath, lpath, **kwargs): + """Copy single remote file to local""" + if self.isdir(rpath): + os.makedirs(lpath, exist_ok=True) else: - rpaths = [rpath] - lpaths = [lpath] - for lpath, rpath in zip(lpaths, rpaths): with self.open(rpath, "rb", **kwargs) as f1: with open(lpath, "wb") as f2: data = True @@ -572,33 +695,63 @@ data = f1.read(self.blocksize) f2.write(data) - def put(self, lpath, rpath, recursive=False, **kwargs): - """ Upload file from local """ - if recursive: - lpaths = [] - for dirname, subdirlist, filelist in os.walk(lpath): - lpaths += [os.path.join(dirname, filename) for filename in filelist] - rootdir = os.path.basename(lpath.rstrip("/")) - if self.exists(rpath): - # copy lpath inside rpath directory - rpath2 = os.path.join(rpath, rootdir) - else: - # copy lpath as rpath directory - rpath2 = rpath - rpaths = [ - os.path.join(rpath2, path[len(lpath) :].lstrip("/")) for path in lpaths - ] - else: - lpaths = [lpath] - rpaths = [rpath] + def get(self, rpath, lpath, recursive=False, **kwargs): + """Copy file(s) to local. + + Copies a specific file or tree of files (if recursive=True). If lpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. Can submit a list of paths, which may be glob-patterns + and will be expanded. + + Calls get_file for each source. + """ + from .implementations.local import make_path_posix + + if isinstance(lpath, str): + lpath = make_path_posix(lpath) + rpaths = self.expand_path(rpath, recursive=recursive) + lpaths = other_paths(rpaths, lpath) for lpath, rpath in zip(lpaths, rpaths): + self.get_file(rpath, lpath, **kwargs) + + def put_file(self, lpath, rpath, **kwargs): + """Copy single file to remote""" + if os.path.isdir(lpath): + self.makedirs(rpath, exist_ok=True) + else: with open(lpath, "rb") as f1: + self.mkdirs(os.path.dirname(rpath), exist_ok=True) with self.open(rpath, "wb", **kwargs) as f2: data = True while data: data = f1.read(self.blocksize) f2.write(data) + def put(self, lpath, rpath, recursive=False, **kwargs): + """Copy file(s) from local. + + Copies a specific file or tree of files (if recursive=True). If rpath + ends with a "/", it will be assumed to be a directory, and target files + will go within. 
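A minimal sketch of the ``pipe``/``cat`` counterparts described above: a dict writes many files at once, and a list of paths returns a ``{path: bytes}`` mapping.

import fsspec

fs = fsspec.filesystem("memory")
fs.pipe({"/bulk/x": b"xx", "/bulk/y": b"yy"})  # dict form: many files in one call
assert fs.cat("/bulk/x") == b"xx"              # single path -> bytes
assert fs.cat(["/bulk/x", "/bulk/y"]) == {     # list of paths -> dict
    "/bulk/x": b"xx",
    "/bulk/y": b"yy",
}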
+ + Calls put_file for each source. + """ + from .implementations.local import make_path_posix, LocalFileSystem + + rpath = ( + self._strip_protocol(rpath) + if isinstance(rpath, str) + else [self._strip_protocol(p) for p in rpath] + ) + if isinstance(lpath, str): + lpath = make_path_posix(lpath) + fs = LocalFileSystem() + lpaths = fs.expand_path(lpath, recursive=recursive) + rpaths = other_paths(lpaths, rpath) + + for lpath, rpath in zip(lpaths, rpaths): + self.put_file(lpath, rpath, **kwargs) + def head(self, path, size=1024): """ Get the first ``size`` bytes from file """ with self.open(path, "rb") as f: @@ -610,17 +763,52 @@ f.seek(max(-size, -f.size), 2) return f.read() - def copy(self, path1, path2, **kwargs): - """ Copy within two locations in the filesystem""" + def cp_file(self, path1, path2, **kwargs): raise NotImplementedError - def mv(self, path1, path2, **kwargs): - """ Move file from one location to another """ - self.copy(path1, path2, **kwargs) - self.rm(path1, recursive=False) + def copy(self, path1, path2, recursive=False, **kwargs): + """ Copy within two locations in the filesystem""" + paths = self.expand_path(path1, recursive=recursive) + path2 = other_paths(paths, path2) + for p1, p2 in zip(paths, path2): + self.cp_file(p1, p2, **kwargs) + + def expand_path(self, path, recursive=False, maxdepth=None): + """Turn one or more globs or directories into a list of all matching files""" + if isinstance(path, str): + out = self.expand_path([path], recursive, maxdepth) + else: + out = set() + path = [self._strip_protocol(p) for p in path] + for p in path: + if has_magic(p): + bit = set(self.glob(p)) + out |= bit + if recursive: + out += self.expand_path(p) + continue + elif recursive: + rec = set(self.find(p, withdirs=True)) + out |= rec + if p not in out and (recursive is False or self.exists(p)): + # should only check once, for the root + out.add(p) + if not out: + raise FileNotFoundError(path) + return list(sorted(out)) - def _rm(self, path): + def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs): + """ Move file(s) from one location to another """ + self.copy(path1, path2, recursive=recursive, maxdepth=maxdepth) + self.rm(path1, recursive=recursive) + + def rm_file(self, path): """Delete a file""" + self._rm(path) + + def _rm(self, path): + """Delete one file""" + # this is the old name for the method, prefer rm_file raise NotImplementedError def rm(self, path, recursive=False, maxdepth=None): @@ -638,25 +826,16 @@ If None, there will be no limit and infinite recursion may be possible. 
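A minimal sketch of recursive ``put``/``get`` and ``expand_path``, assuming only temporary local directories created on the fly.

import os
import tempfile

import fsspec

src = tempfile.mkdtemp()
os.makedirs(os.path.join(src, "nest"))
for name in ("one", os.path.join("nest", "two")):
    with open(os.path.join(src, name), "wb") as f:
        f.write(b"data")

fs = fsspec.filesystem("memory")
fs.put(src, "/remote/demo", recursive=True)             # upload the whole tree
print(fs.expand_path("/remote/demo", recursive=True))   # dirs and files below the root

dst = tempfile.mkdtemp()
fs.get("/remote/demo", dst, recursive=True)             # and pull it back down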
""" - # prefer some bulk method, if possible - if not isinstance(path, list): - path = [path] - for p in path: - if recursive: - out = self.walk(p, maxdepth=maxdepth) - for pa_, _, files in reversed(list(out)): - for name in files: - fn = "/".join([pa_, name]) if pa_ else name - self.rm(fn) - self.rmdir(pa_) - else: - self._rm(p) + path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth) + for p in reversed(path): + self.rm_file(p) @classmethod def _parent(cls, path): path = cls._strip_protocol(path.rstrip("/")) if "/" in path: - return cls.root_marker + path.rsplit("/", 1)[0] + parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker) + return cls.root_marker + parent else: return cls.root_marker @@ -728,7 +907,7 @@ return f def touch(self, path, truncate=True, **kwargs): - """ Create empty file, or update timestamp + """Create empty file, or update timestamp Parameters ---------- @@ -746,10 +925,10 @@ def ukey(self, path): """Hash of file properties, to tell if it has changed""" - return md5(str(self.info(path)).encode()).hexdigest() + return sha256(str(self.info(path)).encode()).hexdigest() def read_block(self, fn, offset, length, delimiter=None): - """ Read a block of bytes from + """Read a block of bytes from Starting at ``offset`` of the file, read ``length`` bytes. If ``delimiter`` is set then we ensure that the read starts and stops at @@ -793,8 +972,58 @@ length = size - offset return read_block(f, offset, length, delimiter) - def __reduce__(self): - return make_instance, (type(self), self.storage_args, self.storage_options) + def to_json(self): + """ + JSON representation of this filesystem instance + + Returns + ------- + str: JSON structure with keys cls (the python location of this class), + protocol (text name of this class's protocol, first one in case of + multiple), args (positional args, usually empty), and all other + kwargs as their own keys. + """ + import json + + cls = type(self) + cls = ".".join((cls.__module__, cls.__name__)) + proto = ( + self.protocol[0] + if isinstance(self.protocol, (tuple, list)) + else self.protocol + ) + return json.dumps( + dict( + **{"cls": cls, "protocol": proto, "args": self.storage_args}, + **self.storage_options + ) + ) + + @staticmethod + def from_json(blob): + """ + Recreate a filesystem instance from JSON representation + + See ``.to_json()`` for the expected structure of the input + + Parameters + ---------- + blob: str + + Returns + ------- + file system instance, not necessarily of this particular class. 
+ """ + from .registry import _import_class, get_filesystem_class + import json + + dic = json.loads(blob) + protocol = dic.pop("protocol") + try: + cls = _import_class(dic.pop("cls")) + except (ImportError, ValueError, RuntimeError, KeyError): + cls = get_filesystem_class(protocol) + return cls(*dic.pop("args", ()), **dic) def _get_pyarrow_filesystem(self): """ @@ -828,6 +1057,14 @@ """ cls._cache.clear() + def created(self, path): + """Return the created timestamp of a file as a datetime.datetime""" + raise NotImplementedError + + def modified(self, path): + """Return the modified timestamp of a file as a datetime.datetime""" + raise NotImplementedError + # ------------------------------------------------------------------------ # Aliases @@ -875,6 +1112,37 @@ """Alias of :ref:`FilesystemSpec.get`.""" return self.get(rpath, lpath, recursive=recursive, **kwargs) + def sign(self, path, expiration=100, **kwargs): + """Create a signed URL representing the given path + + Some implementations allow temporary URLs to be generated, as a + way of delegating credentials. + + Parameters + ---------- + path : str + The path on the filesystem + expiration : int + Number of seconds to enable the URL for (if supported) + + Returns + ------- + URL : str + The signed URL + + Raises + ------ + NotImplementedError : if method is not implemented for a fileystem + """ + raise NotImplementedError("Sign is not implemented for this filesystem") + + def _isfilestore(self): + # Originally inherited from pyarrow DaskFileSystem. Keeping this + # here for backwards compatibility as long as pyarrow uses its + # legacy ffspec-compatible filesystems and thus accepts fsspec + # filesystems as well + return False + class AbstractBufferedFile(io.IOBase): """Convenient class to derive from to provide buffering @@ -967,7 +1235,8 @@ @property def closed(self): # get around this attr being read-only in IOBase - return self._closed + # use getattr here, since this can be called during del + return getattr(self, "_closed", True) @closed.setter def closed(self, c): @@ -1001,7 +1270,7 @@ return self.loc def seek(self, loc, whence=0): - """ Set current file location + """Set current file location Parameters ---------- @@ -1012,7 +1281,7 @@ """ loc = int(loc) if not self.mode == "rb": - raise ValueError("Seek only available in read mode") + raise OSError(ESPIPE, "Seek only available in read mode") if whence == 0: nloc = loc elif whence == 1: @@ -1089,7 +1358,7 @@ self.buffer = io.BytesIO() def _upload_chunk(self, final=False): - """ Write one part of a multi-block file upload + """Write one part of a multi-block file upload Parameters ========== @@ -1097,7 +1366,7 @@ This is the last block, so should complete file, if self.autocommit is True. 
""" - # may not yet have been initialized, may neet to call _initialize_upload + # may not yet have been initialized, may need to call _initialize_upload def _initiate_upload(self): """ Create remote file/upload """ @@ -1136,8 +1405,9 @@ https://docs.python.org/3/library/io.html#io.RawIOBase.readinto """ - data = self.read(len(b)) - b[: len(data)] = data + out = memoryview(b).cast("B") + data = self.read(out.nbytes) + out[: len(data)] = data return len(data) def readuntil(self, char=b"\n", blocks=None): @@ -1200,7 +1470,7 @@ return self.readinto(b) def close(self): - """ Close file + """Close file Finalizes writes, discards cache """ @@ -1231,7 +1501,8 @@ return self.mode in {"wb", "ab"} and not self.closed def __del__(self): - self.close() + if not self.closed: + self.close() def __str__(self): return "" % (type(self.fs).__name__, self.path) diff -Nru fsspec-0.6.1/fsspec/tests/test_api.py fsspec-0.8.4/fsspec/tests/test_api.py --- fsspec-0.6.1/fsspec/tests/test_api.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_api.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,7 +1,14 @@ """Tests the spec, using memoryfs""" +import contextlib import os import pickle +import sys +import tempfile + +import pytest + +import fsspec from fsspec.implementations.memory import MemoryFileSystem, MemoryFile @@ -26,7 +33,7 @@ def test_class_methods(): - assert MemoryFileSystem._strip_protocol("memory:stuff") == "stuff" + assert MemoryFileSystem._strip_protocol("memory::stuff") == "stuff" assert MemoryFileSystem._strip_protocol("memory://stuff") == "stuff" assert MemoryFileSystem._strip_protocol("stuff") == "stuff" assert MemoryFileSystem._strip_protocol("other://stuff") == "other://stuff" @@ -54,6 +61,31 @@ fs.put(tmpdir, "/more", recursive=True) assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"] + @contextlib.contextmanager + def tmp_chdir(path): + curdir = os.getcwd() + os.chdir(path) + try: + yield + finally: + os.chdir(curdir) + + with tmp_chdir(os.path.join(tmpdir, os.path.pardir)): + fs.put(os.path.basename(tmpdir), "/moretwo", recursive=True) + assert fs.find("/moretwo") == [ + "/moretwo/dir/two", + "/moretwo/one", + "/moretwo/three", + ] + + with tmp_chdir(tmpdir): + fs.put(os.path.curdir, "/morethree", recursive=True) + assert fs.find("/morethree") == [ + "/morethree/dir/two", + "/morethree/one", + "/morethree/three", + ] + for f in [fn, fn2, fn3]: os.remove(f) os.rmdir(os.path.join(tmpdir, "dir")) @@ -93,6 +125,32 @@ assert isinstance(fs.ukey("/otherfile"), str) +def test_recursive_get_put(tmpdir): + fs = MemoryFileSystem() + os.makedirs(f"{tmpdir}/nest") + for file in ["one", "two", "nest/other"]: + with open(f"{tmpdir}/{file}", "wb") as f: + f.write(b"data") + + fs.put(str(tmpdir), "test", recursive=True) + + d = tempfile.mkdtemp() + fs.get("test", d, recursive=True) + for file in ["one", "two", "nest/other"]: + with open(f"{d}/{file}", "rb") as f: + f.read() == b"data" + + +def test_pipe_cat(): + fs = MemoryFileSystem() + fs.pipe("afile", b"contents") + assert fs.cat("afile") == b"contents" + + data = {"bfile": b"more", "cfile": b"stuff"} + fs.pipe(data) + assert fs.cat(list(data)) == data + + def test_read_block_delimiter(): fs = MemoryFileSystem() with fs.open("/myfile", "wb") as f: @@ -111,3 +169,158 @@ f.write(b"some\n" b"lines\n" b"of\n" b"text") f = fs.open("/myfile", "r", encoding="latin1") assert f.encoding == "latin1" + + +def test_chained_fs(): + d1 = tempfile.mkdtemp() + d2 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1") + with open(f1, "wb") 
as f: + f.write(b"test") + + of = fsspec.open( + f"simplecache::file://{f1}", + simplecache={"cache_storage": d2, "same_names": True}, + ) + with of as f: + assert f.read() == b"test" + + assert os.listdir(d2) == ["f1"] + + +@pytest.mark.xfail(reason="see issue #334", strict=True) +def test_multilevel_chained_fs(): + """This test reproduces intake/filesystem_spec#334""" + import zipfile + + d1 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1.zip") + with zipfile.ZipFile(f1, mode="w") as z: + # filename, content + z.writestr("foo.txt", "foo.txt") + z.writestr("bar.txt", "bar.txt") + + # We expected this to be the correct syntax + with pytest.raises(IsADirectoryError): + of = fsspec.open_files(f"zip://*.txt::simplecache::file://{f1}") + assert len(of) == 2 + + # But this is what is actually valid... + of = fsspec.open_files(f"zip://*.txt::simplecache://{f1}::file://") + + assert len(of) == 2 + for open_file in of: + with open_file as f: + assert f.read().decode("utf-8") == f.name + + +@pytest.mark.skipif(sys.version_info < (3, 7), reason="no seek in old zipfile") +def test_multilevel_chained_fs_zip_zip_file(): + """This test reproduces intake/filesystem_spec#334""" + import zipfile + + d1 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1.zip") + f2 = os.path.join(d1, "f2.zip") + with zipfile.ZipFile(f1, mode="w") as z: + # filename, content + z.writestr("foo.txt", "foo.txt") + z.writestr("bar.txt", "bar.txt") + + with zipfile.ZipFile(f2, mode="w") as z: + with open(f1, "rb") as f: + z.writestr("f1.zip", f.read()) + + # We expected this to be the correct syntax + of = fsspec.open_files(f"zip://*.txt::zip://f1.zip::file://{f2}") + + assert len(of) == 2 + for open_file in of: + with open_file as f: + assert f.read().decode("utf-8") == f.name + + +def test_chained_equivalent(): + d1 = tempfile.mkdtemp() + d2 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1") + with open(f1, "wb") as f: + f.write(b"test1") + + of = fsspec.open( + f"simplecache::file://{f1}", + simplecache={"cache_storage": d2, "same_names": True}, + ) + of2 = fsspec.open( + f"simplecache://{f1}", + cache_storage=d2, + same_names=True, + target_protocol="file", + target_options={}, + ) + # the following line passes by fluke - they are not quite the same instance, + # since the parameters don't quite match. 
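A minimal sketch of the chained-URL syntax these tests exercise, with the archive created locally so the example is self-contained.

import os
import tempfile
import zipfile

import fsspec

d = tempfile.mkdtemp()
archive = os.path.join(d, "example.zip")
with zipfile.ZipFile(archive, mode="w") as z:
    z.writestr("afile", b"test")

# innermost protocol on the left, the target holding the bytes on the right
with fsspec.open(f"zip://afile::file://{archive}", "rb") as f:
    assert f.read() == b"test"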
Also, the url understood by the two + # of s are not the same (path gets munged a bit differently) + assert of.fs == of2.fs + assert of.open().read() == of2.open().read() + + +def test_chained_fs_multi(): + d1 = tempfile.mkdtemp() + d2 = tempfile.mkdtemp() + f1 = os.path.join(d1, "f1") + f2 = os.path.join(d1, "f2") + with open(f1, "wb") as f: + f.write(b"test1") + with open(f2, "wb") as f: + f.write(b"test2") + + of = fsspec.open_files( + f"simplecache::file://{d1}/*", + simplecache={"cache_storage": d2, "same_names": True}, + ) + with of[0] as f: + assert f.read() == b"test1" + with of[1] as f: + assert f.read() == b"test2" + + assert sorted(os.listdir(d2)) == ["f1", "f2"] + + d2 = tempfile.mkdtemp() + + of = fsspec.open_files( + [f"simplecache::file://{f1}", f"simplecache::file://{f2}"], + simplecache={"cache_storage": d2, "same_names": True}, + ) + with of[0] as f: + assert f.read() == b"test1" + with of[1] as f: + assert f.read() == b"test2" + + assert sorted(os.listdir(d2)) == ["f1", "f2"] + + +def test_chained_fo(): + import zipfile + + d1 = tempfile.mkdtemp() + f1 = os.path.join(d1, "temp.zip") + d3 = tempfile.mkdtemp() + with zipfile.ZipFile(f1, mode="w") as z: + z.writestr("afile", b"test") + + of = fsspec.open(f"zip://afile::file://{f1}") + with of as f: + assert f.read() == b"test" + + of = fsspec.open_files(f"zip://*::file://{f1}") + with of[0] as f: + assert f.read() == b"test" + + of = fsspec.open_files( + f"simplecache::zip://*::file://{f1}", + simplecache={"cache_storage": d3, "same_names": True}, + ) + with of[0] as f: + assert f.read() == b"test" + assert "afile" in os.listdir(d3) diff -Nru fsspec-0.6.1/fsspec/tests/test_async.py fsspec-0.8.4/fsspec/tests/test_async.py --- fsspec-0.6.1/fsspec/tests/test_async.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_async.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,21 @@ +import pytest +import asyncio +import sys +from fsspec.asyn import _run_until_done + + +async def inner(): + await asyncio.sleep(1) + return True + + +async def outer(): + await asyncio.sleep(1) + return _run_until_done(inner()) + + +@pytest.mark.skipif(sys.version_info < (3, 7), reason="Async fails on py36") +def test_runtildone(): + loop = asyncio.get_event_loop() + assert loop.run_until_complete(outer()) + loop.close() diff -Nru fsspec-0.6.1/fsspec/tests/test_caches.py fsspec-0.8.4/fsspec/tests/test_caches.py --- fsspec-0.6.1/fsspec/tests/test_caches.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_caches.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,81 @@ +import pickle +import string + +import pytest +from fsspec.caching import BlockCache, caches + + +def test_cache_getitem(Cache_imp): + cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) + assert cacher._fetch(0, 4) == b"abcd" + assert cacher._fetch(None, 4) == b"abcd" + assert cacher._fetch(2, 4) == b"cd" + + +def test_block_cache_lru(): + cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2) + # miss + cache._fetch(0, 2) + assert cache.cache_info().misses == 1 + assert cache.cache_info().currsize == 1 + + # hit + cache._fetch(0, 2) + assert cache.cache_info().misses == 1 + assert cache.cache_info().currsize == 1 + + # miss + cache._fetch(4, 6) + assert cache.cache_info().misses == 2 + assert cache.cache_info().currsize == 2 + + # miss & evict + cache._fetch(12, 13) + assert cache.cache_info().misses == 3 + assert cache.cache_info().currsize == 2 + + +def _fetcher(start, end): + return b"0" * (end - start) + 
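A minimal sketch of the fetcher-based cache interface these tests drive: a cache wraps a ``fetch(start, end)`` callable and serves byte ranges from it.

from fsspec.caching import BlockCache

def fetcher(start, end):
    # stands in for an expensive remote range request
    return bytes(range(256))[start:end]

cache = BlockCache(blocksize=16, fetcher=fetcher, size=256, maxblocks=4)
assert cache._fetch(0, 8) == bytes(range(8))        # miss: fills block 0
assert cache._fetch(4, 12) == bytes(range(4, 12))   # hit: served from block 0
print(cache.cache_info())                           # hits, misses, currsize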
+ +def letters_fetcher(start, end): + return string.ascii_letters[start:end].encode() + + +@pytest.fixture(params=caches.values(), ids=list(caches.keys())) +def Cache_imp(request): + return request.param + + +def test_cache_empty_file(Cache_imp): + blocksize = 5 + size = 0 + cache = Cache_imp(blocksize, _fetcher, size) + assert cache._fetch(0, 0) == b"" + + +def test_cache_pickleable(Cache_imp): + blocksize = 5 + size = 100 + cache = Cache_imp(blocksize, _fetcher, size) + cache._fetch(0, 5) # fill in cache + unpickled = pickle.loads(pickle.dumps(cache)) + assert isinstance(unpickled, Cache_imp) + assert unpickled.blocksize == blocksize + assert unpickled.size == size + assert unpickled._fetch(0, 10) == b"0" * 10 + + +@pytest.mark.parametrize( + "size_requests", + [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], +) +@pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) +def test_cache_basic(Cache_imp, blocksize, size_requests): + cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) + + for start, end in size_requests: + result = cache._fetch(start, end) + expected = string.ascii_letters[start:end].encode() + assert result == expected diff -Nru fsspec-0.6.1/fsspec/tests/test_core.py fsspec-0.8.4/fsspec/tests/test_core.py --- fsspec-0.6.1/fsspec/tests/test_core.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_core.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,15 +1,17 @@ -import pytest +import os import pickle -import string +import pytest +import tempfile from fsspec.core import ( _expand_paths, OpenFile, - caches, + open_local, get_compression, - BaseCache, - BlockCache, + open_files, + OpenFiles, ) +import fsspec @pytest.mark.parametrize( @@ -41,52 +43,26 @@ f.read() == "data" -# For test_cache_pickleable(). 
Functions are only picklable if they are defined -# at the top-level of a module -def _fetcher(start, end): - return b"0" * (end - start) - - -def letters_fetcher(start, end): - return string.ascii_letters[start:end].encode() - - -@pytest.fixture(params=caches.values(), ids=list(caches.keys())) -def Cache_imp(request): - return request.param - - -def test_cache_empty_file(Cache_imp): - blocksize = 5 - size = 0 - cache = Cache_imp(blocksize, _fetcher, size) - assert cache._fetch(0, 0) == b"" - - -def test_cache_pickleable(Cache_imp): - blocksize = 5 - size = 100 - cache = Cache_imp(blocksize, _fetcher, size) - cache._fetch(0, 5) # fill in cache - unpickled = pickle.loads(pickle.dumps(cache)) - assert isinstance(unpickled, Cache_imp) - assert unpickled.blocksize == blocksize - assert unpickled.size == size - assert unpickled._fetch(0, 10) == b"0" * 10 +def test_openfile_open(m): + of = OpenFile(m, "somepath", mode="wt") + f = of.open() + f.write("hello") + assert m.size("somepath") == 0 # no flush yet + del of + assert m.size("somepath") == 0 # still no flush + f.close() + assert m.size("somepath") == 5 -@pytest.mark.parametrize( - "size_requests", - [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], -) -@pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) -def test_cache_basic(Cache_imp, blocksize, size_requests): - cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) - - for start, end in size_requests: - result = cache[start:end] - expected = string.ascii_letters[start:end].encode() - assert result == expected +def test_open_local(): + d1 = str(tempfile.mkdtemp()) + f1 = os.path.join(d1, "f1") + open(f1, "w").write("test1") + d2 = str(tempfile.mkdtemp()) + fn = open_local("simplecache://" + f1, cache_storage=d2, target_protocol="file") + assert isinstance(fn, str) + assert open(fn).read() == "test1" + assert d2 in fn def test_xz_lzma_compressions(): @@ -97,46 +73,107 @@ assert get_compression("some_file.xz", "lzma") == "lzma" -def test_cache_getitem(Cache_imp): - cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) - assert cacher[0:4] == b"abcd" - assert cacher[:4] == b"abcd" - assert cacher[-3:] == b"XYZ" - assert cacher[-3:-1] == b"XY" - assert cacher[2:4] == b"cd" - - -def test_cache_getitem_raises(): - cacher = BaseCache(4, letters_fetcher, len(string.ascii_letters)) - with pytest.raises(TypeError, match="int"): - cacher[5] - - with pytest.raises(ValueError, match="contiguous"): - cacher[::4] - - -def test_block_cache_lru(): - cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2) - # miss - cache[0:2] - assert cache.cache_info().hits == 0 - assert cache.cache_info().misses == 1 - assert cache.cache_info().currsize == 1 - - # hit - cache[0:2] - assert cache.cache_info().hits == 1 - assert cache.cache_info().misses == 1 - assert cache.cache_info().currsize == 1 - - # miss - cache[4:6] - assert cache.cache_info().hits == 1 - assert cache.cache_info().misses == 2 - assert cache.cache_info().currsize == 2 - - # miss & evict - cache[12:13] - assert cache.cache_info().hits == 1 - assert cache.cache_info().misses == 3 - assert cache.cache_info().currsize == 2 +def test_list(): + here = os.path.abspath(os.path.dirname(__file__)) + flist = os.listdir(here) + plist = [os.path.join(here, p).replace("\\", "/") for p in flist] + of = open_files(plist) + assert len(of) == len(flist) + assert [f.path for f in of] == plist + + +def test_pathobject(tmpdir): + import pathlib + + tmpdir = str(tmpdir) + plist_str = 
[os.path.join(str(tmpdir), f).replace("\\", "/") for f in ["a", "b"]] + open(plist_str[0], "w").write("first file") + open(plist_str[1], "w").write("second file") + plist = [pathlib.Path(p) for p in plist_str] + of = open_files(plist) + assert len(of) == 2 + assert [f.path for f in of] == plist_str + + of = open_files(plist[0]) + assert len(of) == 1 + assert of[0].path == plist_str[0] + with of[0] as f: + assert f.read() == open(plist_str[0], "rb").read() + + +def test_automkdir(tmpdir): + dir = os.path.join(str(tmpdir), "a") + of = fsspec.open(os.path.join(dir, "afile"), "w") + with of: + pass + assert "afile" in os.listdir(dir) + + dir = os.path.join(str(tmpdir), "b") + of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=True) + with of: + pass + + assert "bfile" in os.listdir(dir) + + dir = os.path.join(str(tmpdir), "c") + with pytest.raises(FileNotFoundError): + of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=False) + with of: + pass + + +def test_automkdir_readonly(tmpdir): + dir = os.path.join(str(tmpdir), "d") + with pytest.raises(FileNotFoundError): + of = fsspec.open(os.path.join(dir, "dfile"), "r") + with of: + pass + + +def test_openfile_pickle_newline(): + # GH#318 + test = fsspec.open(__file__, newline=b"") + + pickled = pickle.dumps(test) + restored = pickle.loads(pickled) + + assert test.newline == restored.newline + + +def test_mismatch(): + with pytest.raises(ValueError, match="protocol"): + open_files(["s3://test/path.csv", "/other/path.csv"]) + + +def test_url_kwargs_chain(ftp_writable): + host, port, username, password = "localhost", 2121, "user", "pass" + data = b"hello" + with fsspec.open( + "ftp:///afile", "wb", host=host, port=port, username=username, password=password + ) as f: + f.write(data) + + with fsspec.open( + "simplecache::ftp://{}:{}@{}:{}/afile".format(username, password, host, port), + "rb", + ) as f: + assert f.read() == data + + +def test_multi_context(tmpdir): + fns = [os.path.join(tmpdir, fn) for fn in ["a", "b"]] + files = open_files(fns, "wb") + assert isinstance(files, OpenFiles) + assert isinstance(files[0], OpenFile) + assert len(files) == 2 + with files as of: + assert len(of) == 2 + assert not of[0].closed + assert of[0].name.endswith("a") + assert of[0].closed + assert repr(files) == "" + + +def test_not_local(): + with pytest.raises(ValueError, match="attribute local_file=True"): + open_local("memory://afile") diff -Nru fsspec-0.6.1/fsspec/tests/test_file.py fsspec-0.8.4/fsspec/tests/test_file.py --- fsspec-0.6.1/fsspec/tests/test_file.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_file.py 2020-10-14 16:51:19.000000000 +0000 @@ -105,7 +105,7 @@ f = ftp.open("/out2", "wb") with pytest.raises(ValueError): f.info() - with pytest.raises(ValueError): + with pytest.raises(OSError): f.seek(0) with pytest.raises(ValueError): f.read(0) @@ -175,3 +175,20 @@ with fs.open(fn, "rb") as f: gf = gzip.GzipFile(fileobj=f, mode="r") assert gf.read() == data + + +def test_with_zip(ftp_writable): + import zipfile + + data = b"hello zip" + host, port, user, pw = ftp_writable + fs = FTPFileSystem(host=host, port=port, username=user, password=pw) + fn = "/myfile.zip" + inner_file = "test.txt" + with fs.open(fn, "wb") as f: + zf = zipfile.ZipFile(f, mode="w") + zf.writestr(inner_file, data) + zf.close() + with fs.open(fn, "rb") as f: + zf = zipfile.ZipFile(f, mode="r") + assert zf.read(inner_file) == data diff -Nru fsspec-0.6.1/fsspec/tests/test_fuse.py fsspec-0.8.4/fsspec/tests/test_fuse.py --- 
fsspec-0.6.1/fsspec/tests/test_fuse.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_fuse.py 2020-10-14 16:51:19.000000000 +0000 @@ -5,7 +5,11 @@ import pytest -pytest.importorskip("fuse") # noqa: E402 +try: + pytest.importorskip("fuse") # noqa: E402 +except OSError: + # can succeed in importing fuse, but fail to load so + pytest.importorskip("nonexistent") # noqa: E402 from fsspec.fuse import run from fsspec.implementations.memory import MemoryFileSystem @@ -17,7 +21,7 @@ run(fs, "/mounted/", mountdir) -def test_basic(tmpdir): +def test_basic(tmpdir, capfd): mountdir = str(tmpdir.mkdir("mount")) fuse_process = Process(target=host_fuse, args=(str(mountdir),)) @@ -53,6 +57,10 @@ with pytest.raises(OSError): os.rmdir(fn) + captured = capfd.readouterr() + assert "Traceback" not in captured.out + assert "Traceback" not in captured.err + os.rmdir(fn + "/inner") os.rmdir(fn) finally: diff -Nru fsspec-0.6.1/fsspec/tests/test_gui.py fsspec-0.8.4/fsspec/tests/test_gui.py --- fsspec-0.6.1/fsspec/tests/test_gui.py 1970-01-01 00:00:00.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_gui.py 2020-10-14 16:51:19.000000000 +0000 @@ -0,0 +1,10 @@ +import pytest + +panel = pytest.importorskip("panel") + + +def test_basic(): + import fsspec.gui + + gui = fsspec.gui.FileSelector() + assert "url" in str(gui.panel) diff -Nru fsspec-0.6.1/fsspec/tests/test_mapping.py fsspec-0.8.4/fsspec/tests/test_mapping.py --- fsspec-0.6.1/fsspec/tests/test_mapping.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_mapping.py 2020-10-14 16:51:19.000000000 +0000 @@ -22,6 +22,23 @@ assert m == m2 == m3 +def test_getitems_errors(tmpdir): + tmpdir = str(tmpdir) + os.makedirs(os.path.join(tmpdir, "afolder")) + open(os.path.join(tmpdir, "afile"), "w").write("test") + open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") + m = fsspec.get_mapper("file://" + tmpdir) + assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} + with pytest.raises(KeyError): + m.getitems(["afile", "bfile"]) + out = m.getitems(["afile", "bfile"], on_error="return") + assert isinstance(out["bfile"], KeyError) + m = fsspec.get_mapper("file://" + tmpdir, missing_exceptions=()) + assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"} + with pytest.raises(FileNotFoundError): + m.getitems(["afile", "bfile"]) + + def test_ops(): MemoryFileSystem.store.clear() m = fsspec.get_mapper("memory://") @@ -48,6 +65,7 @@ m["key"] = b"data" m2 = pickle.loads(pickle.dumps(m)) assert list(m) == list(m2) + assert m.missing_exceptions == m2.missing_exceptions def test_keys_view(): @@ -59,3 +77,14 @@ assert len(keys) == 1 # check that we don't consume the keys assert len(keys) == 1 + m.clear() + + +def test_multi(): + m = fsspec.get_mapper("memory://") + data = {"a": b"data1", "b": b"data2"} + m.setitems(data) + + assert m.getitems(list(data)) == data + m.delitems(list(data)) + assert not list(m) diff -Nru fsspec-0.6.1/fsspec/tests/test_registry.py fsspec-0.8.4/fsspec/tests/test_registry.py --- fsspec-0.6.1/fsspec/tests/test_registry.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_registry.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,5 +1,22 @@ import pytest -from fsspec.registry import get_filesystem_class, registry +from fsspec.registry import ( + get_filesystem_class, + _registry, + registry, + register_implementation, + ReadOnlyError, + known_implementations, +) +from fsspec.spec import AbstractFileSystem + + +@pytest.fixture() +def 
clear_registry(): + try: + yield + finally: + _registry.clear() + known_implementations.pop("test", None) @pytest.mark.parametrize( @@ -7,12 +24,64 @@ [("s3", "s3fs", "0.3.0", "0.1.0"), ("gs", "gcsfs", "0.3.0", "0.1.0")], ) def test_minversion_s3fs(protocol, module, minversion, oldversion, monkeypatch): - registry.clear() + _registry.clear() mod = pytest.importorskip(module, minversion) assert get_filesystem_class("s3") is not None - registry.clear() + _registry.clear() monkeypatch.setattr(mod, "__version__", oldversion) with pytest.raises(RuntimeError, match=minversion): get_filesystem_class(protocol) + + +def test_registry_readonly(): + get_filesystem_class("file") + assert "file" in registry + assert "file" in list(registry) + with pytest.raises(ReadOnlyError): + del registry["file"] + with pytest.raises(ReadOnlyError): + registry["file"] = None + with pytest.raises(ReadOnlyError): + registry.clear() + + +def test_register_cls(clear_registry): + with pytest.raises(ValueError): + get_filesystem_class("test") + register_implementation("test", AbstractFileSystem) + cls = get_filesystem_class("test") + assert cls is AbstractFileSystem + + +def test_register_str(clear_registry): + with pytest.raises(ValueError): + get_filesystem_class("test") + register_implementation("test", "fsspec.AbstractFileSystem") + assert "test" not in registry + cls = get_filesystem_class("test") + assert cls is AbstractFileSystem + assert "test" in registry + + +def test_register_fail(clear_registry): + register_implementation("test", "doesntexist.AbstractFileSystem") + with pytest.raises(ImportError): + get_filesystem_class("test") + + register_implementation("test", "doesntexist.AbstractFileSystem") + with pytest.raises(ValueError): + register_implementation("test", "doesntexist.AbstractFileSystem", clobber=False) + + register_implementation( + "test", "doesntexist.AbstractFileSystem", errtxt="hiho", clobber=True + ) + with pytest.raises(ImportError) as e: + get_filesystem_class("test") + assert "hiho" in str(e.value) + register_implementation("test", AbstractFileSystem) + + with pytest.raises(ValueError): + register_implementation("test", AbstractFileSystem, clobber=False) + register_implementation("test", AbstractFileSystem, clobber=True) diff -Nru fsspec-0.6.1/fsspec/tests/test_spec.py fsspec-0.8.4/fsspec/tests/test_spec.py --- fsspec-0.6.1/fsspec/tests/test_spec.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_spec.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,6 +1,11 @@ +import json import pickle +import numpy as np import pytest + +import fsspec +from fsspec.implementations.ftp import FTPFileSystem from fsspec.spec import AbstractFileSystem, AbstractBufferedFile @@ -34,19 +39,31 @@ }, {"name": "misc", "type": "directory"}, {"name": "misc/foo.txt", "type": "file", "size": 100}, + {"name": "glob_test/hat/^foo.txt", "type": "file", "size": 100}, + {"name": "glob_test/dollar/$foo.txt", "type": "file", "size": 100}, + {"name": "glob_test/lbrace/{foo.txt", "type": "file", "size": 100}, + {"name": "glob_test/rbrace/}foo.txt", "type": "file", "size": 100}, ) + def __getitem__(self, name): + for item in self._fs_contents: + if item["name"] == name: + return item + raise IndexError("{name} not found!".format(name=name)) + def ls(self, path, detail=True, **kwargs): path = self._strip_protocol(path) - files = ( - file for file in self._fs_contents if path == self._parent(file["name"]) - ) + files = { + file["name"]: file + for file in self._fs_contents + if path == self._parent(file["name"]) + } 
if detail: - return list(files) + return [files[name] for name in sorted(files)] - return list(sorted([file["name"] for file in files])) + return list(sorted(files)) @pytest.mark.parametrize( @@ -89,12 +106,30 @@ "top_level/second_level/date=2019-10-04/a.parquet", ], ), + ("mock://glob_test/hat/^foo.*", ["glob_test/hat/^foo.txt"]), + ("mock://glob_test/dollar/$foo.*", ["glob_test/dollar/$foo.txt"]), + ("mock://glob_test/lbrace/{foo.*", ["glob_test/lbrace/{foo.txt"]), + ("mock://glob_test/rbrace/}foo.*", ["glob_test/rbrace/}foo.txt"]), ], ) def test_glob(test_path, expected): test_fs = DummyTestFS() + res = test_fs.glob(test_path) + res = sorted(res) # FIXME: py35 back-compat + assert res == expected + res = test_fs.glob(test_path, detail=True) + assert isinstance(res, dict) + assert sorted(res) == expected # FIXME: py35 back-compat + for name, info in res.items(): + assert info == test_fs[name] - assert test_fs.glob(test_path) == expected + +def test_find_details(): + test_fs = DummyTestFS() + filenames = test_fs.find("/") + details = test_fs.find("/", detail=True) + for filename in filenames: + assert details[filename] == test_fs.info(filename) def test_cache(): @@ -166,3 +201,75 @@ result = pickle.loads(y) assert result.storage_args == (2,) assert result.storage_options == dict(bar=1) + + +def test_json(): + a = DummyTestFS(1) + b = DummyTestFS(2, bar=1) + + outa = a.to_json() + outb = b.to_json() + + assert json.loads(outb) # is valid JSON + assert a != b + assert "bar" in outb + + assert DummyTestFS.from_json(outa) is a + assert DummyTestFS.from_json(outb) is b + + +@pytest.mark.parametrize( + "dt", + [ + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float32, + np.float64, + ], +) +def test_readinto_with_numpy(tmpdir, dt): + store_path = str(tmpdir / "test_arr.npy") + arr = np.arange(10, dtype=dt) + arr.tofile(store_path) + + arr2 = np.empty_like(arr) + with fsspec.open(store_path, "rb") as f: + f.readinto(arr2) + + assert np.array_equal(arr, arr2) + + +@pytest.mark.parametrize( + "dt", + [ + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float32, + np.float64, + ], +) +def test_readinto_with_multibyte(ftp_writable, tmpdir, dt): + host, port, user, pw = ftp_writable + ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) + + with ftp.open("/out", "wb") as fp: + arr = np.arange(10, dtype=dt) + fp.write(arr.tobytes()) + + with ftp.open("/out", "rb") as fp: + arr2 = np.empty_like(arr) + fp.readinto(arr2) + + assert np.array_equal(arr, arr2) diff -Nru fsspec-0.6.1/fsspec/tests/test_utils.py fsspec-0.8.4/fsspec/tests/test_utils.py --- fsspec-0.6.1/fsspec/tests/test_utils.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/tests/test_utils.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,6 +1,18 @@ import io import pytest -from fsspec.utils import infer_storage_options, seek_delimiter, read_block +import sys +from fsspec.utils import ( + can_be_local, + infer_storage_options, + seek_delimiter, + read_block, + common_prefix, + other_paths, + setup_logger, +) + + +WIN = sys.platform.startswith("win") def test_read_block(): @@ -198,6 +210,13 @@ infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"}) +def test_infer_simple(): + out = infer_storage_options("//mnt/datasets/test.csv") + assert out["protocol"] == "file" + assert out["path"] == "//mnt/datasets/test.csv" + assert out.get("host", None) is None + + @pytest.mark.parametrize( "urlpath, 
expected_path", ( @@ -213,3 +232,80 @@ so = infer_storage_options(urlpath) assert so["protocol"] == "file" assert so["path"] == expected_path + + +@pytest.mark.parametrize( + "paths, out", + ( + (["/more/dir/", "/more/dir/two", "/more/one", "/more/three"], "/more"), + (["/", "", "/"], ""), + (["/", "/"], "/"), + (["/more/", "/"], ""), + (["/more/", "/more"], "/more"), + (["more/dir/", "more/dir/two", "more/one", "more/three"], "more"), + ), +) +def test_common_prefix(paths, out): + assert common_prefix(paths) == out + + +@pytest.mark.parametrize( + "paths, other, is_dir, expected", + ( + (["/path1"], "/path2", False, ["/path2"]), + (["/path1"], "/path2", True, ["/path2/path1"]), + (["/path1"], "/path2", None, ["/path2"]), + (["/path1"], "/path2/", True, ["/path2/path1"]), + (["/path1"], ["/path2"], True, ["/path2"]), + (["/path1", "/path2"], "/path2", True, ["/path2/path1", "/path2/path2"]), + ( + ["/more/path1", "/more/path2"], + "/path2", + True, + ["/path2/path1", "/path2/path2"], + ), + ( + ["/more/path1", "/more/path2"], + "/path2", + False, + ["/path2/path1", "/path2/path2"], + ), + ( + ["/more/path1", "/more/path2"], + "/path2/", + None, + ["/path2/path1", "/path2/path2"], + ), + ( + ["/more/path1", "/diff/path2"], + "/path2/", + None, + ["/path2/more/path1", "/path2/diff/path2"], + ), + ), +) +def test_other_paths(paths, other, is_dir, expected): + assert other_paths(paths, other, is_dir) == expected + + +def test_log(): + import logging + + logger = setup_logger("fsspec.test") + assert logger.level == logging.DEBUG + + +@pytest.mark.parametrize( + "par", + [ + ("afile", True), + ("file://afile", True), + ("noproto://afile", False), + ("noproto::stuff", False), + ("simplecache::stuff", True), + ("simplecache://stuff", True), + ], +) +def test_can_local(par): + url, outcome = par + assert can_be_local(url) == outcome diff -Nru fsspec-0.6.1/fsspec/transaction.py fsspec-0.8.4/fsspec/transaction.py --- fsspec-0.6.1/fsspec/transaction.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/transaction.py 2020-10-14 16:51:19.000000000 +0000 @@ -27,6 +27,7 @@ def start(self): """Start a transaction on this FileSystem""" + self.files = [] # clean up after previous failed completions self.fs._intrans = True def complete(self, commit=True): diff -Nru fsspec-0.6.1/fsspec/utils.py fsspec-0.8.4/fsspec/utils.py --- fsspec-0.6.1/fsspec/utils.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/utils.py 2020-10-14 16:51:19.000000000 +0000 @@ -1,4 +1,4 @@ -from hashlib import md5 +from hashlib import sha256 import math import os import pathlib @@ -10,7 +10,7 @@ def infer_storage_options(urlpath, inherit_storage_options=None): - """ Infer storage options from URL path and merge it with existing storage + """Infer storage options from URL path and merge it with existing storage options. 
Parameters @@ -37,7 +37,10 @@ "url_query": "q=1", "extra": "value"} """ # Handle Windows paths including disk name in this special case - if re.match(r"^[a-zA-Z]:[\\/]", urlpath): + if ( + re.match(r"^[a-zA-Z]:[\\/]", urlpath) + or re.match(r"^[a-zA-Z0-9]+://", urlpath) is None + ): return {"protocol": "file", "path": urlpath} parsed_path = urlsplit(urlpath) @@ -117,7 +120,7 @@ def build_name_function(max_int): - """ Returns a function that receives a single integer + """Returns a function that receives a single integer and returns it as a string padded by enough zero characters to align with maximum possible integer @@ -195,7 +198,7 @@ def read_block(f, offset, length, delimiter=None, split_before=False): - """ Read a block of bytes from a file + """Read a block of bytes from a file Parameters ---------- @@ -260,7 +263,7 @@ def tokenize(*args, **kwargs): - """ Deterministic token + """Deterministic token (modified from dask.base) @@ -272,11 +275,11 @@ """ if kwargs: args += (kwargs,) - return md5(str(args).encode()).hexdigest() + return sha256(str(args).encode()).hexdigest() def stringify_path(filepath): - """ Attempt to convert a path-like object to a string. + """Attempt to convert a path-like object to a string. Parameters ---------- @@ -302,3 +305,93 @@ elif isinstance(filepath, pathlib.Path): return str(filepath) return filepath + + +def make_instance(cls, args, kwargs): + inst = cls(*args, **kwargs) + inst._determine_worker() + return inst + + +def common_prefix(paths): + """For a list of paths, find the shortest prefix common to all""" + parts = [p.split("/") for p in paths] + lmax = min(len(p) for p in parts) + end = 0 + for i in range(lmax): + end = all(p[i] == parts[0][i] for p in parts) + if not end: + break + i += end + return "/".join(parts[0][:i]) + + +def other_paths(paths, path2, is_dir=None): + """In bulk file operations, construct a new file tree from a list of files + + Parameters + ---------- + paths: list of str + The input file tree + path2: str or list of str + Root to construct the new list in. If this is already a list of str, we just + assert it has the right number of elements. + is_dir: bool (optional) + For the special case where the input in one element, whether to regard the value + as the target path, or as a directory to put a file path within. 
If None, a + directory is inferred if the path ends in '/' + + Returns + ------- + list of str + """ + if isinstance(path2, str): + is_dir = is_dir or path2.endswith("/") + path2 = path2.rstrip("/") + if len(paths) > 1: + cp = common_prefix(paths) + path2 = [p.replace(cp, path2, 1) for p in paths] + else: + if is_dir: + path2 = [path2.rstrip("/") + "/" + paths[0].rsplit("/")[-1]] + else: + path2 = [path2] + else: + assert len(paths) == len(path2) + return path2 + + +def is_exception(obj): + return isinstance(obj, BaseException) + + +def get_protocol(url): + parts = re.split(r"(\:\:|\://)", url, 1) + if len(parts) > 1: + return parts[0] + return "file" + + +def can_be_local(path): + """Can the given URL be used wih open_local?""" + from fsspec import get_filesystem_class + + try: + return getattr(get_filesystem_class(get_protocol(path)), "local_file", False) + except (ValueError, ImportError): + # not in registry or import failed + return False + + +def setup_logger(logname, level="DEBUG"): + import logging + + logger = logging.getLogger(logname) + handle = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s " "- %(message)s" + ) + handle.setFormatter(formatter) + logger.addHandler(handle) + logger.setLevel(level) + return logger diff -Nru fsspec-0.6.1/fsspec/_version.py fsspec-0.8.4/fsspec/_version.py --- fsspec-0.6.1/fsspec/_version.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/fsspec/_version.py 2020-10-14 16:51:19.000000000 +0000 @@ -22,9 +22,9 @@ # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). - git_refnames = " (tag: 0.6.1)" - git_full = "4daa59f43a621702cd2ecb38648c33507cd81524" - git_date = "2019-11-27 11:48:41 -0600" + git_refnames = " (tag: 0.8.4)" + git_full = "695fbea9616f9fcfe2bf71a6eb4c722c0e99265b" + git_date = "2020-10-14 12:51:19 -0400" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords diff -Nru fsspec-0.6.1/.gitignore fsspec-0.8.4/.gitignore --- fsspec-0.6.1/.gitignore 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/.gitignore 2020-10-14 16:51:19.000000000 +0000 @@ -49,6 +49,7 @@ coverage.xml *.cover .hypothesis/ +.pytest_cache/ # Translations *.mo @@ -90,7 +91,6 @@ .venv venv/ ENV/ -.idea/ # Spyder project settings .spyderproject @@ -104,3 +104,12 @@ # mypy .mypy_cache/ + +# jetbrains ide stuff +*.iml +.idea/ + +# vscode ide stuff +*.code-workspace +.history +.vscode diff -Nru fsspec-0.6.1/setup.py fsspec-0.8.4/setup.py --- fsspec-0.6.1/setup.py 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/setup.py 2020-10-14 16:51:19.000000000 +0000 @@ -17,9 +17,9 @@ "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], description="File-system specification", long_description=long_description, @@ -30,7 +30,23 @@ license="BSD", keywords="file", packages=["fsspec", "fsspec.implementations"], - python_requires=">=3.5", + python_requires=">3.6", install_requires=open("requirements.txt").read().strip().split("\n"), + extras_require={ + "abfs": ["adlfs"], + "adl": ["adlfs"], + "dask": ["dask", "distributed"], + "dropbox": ["dropboxdrivefs", "requests", "dropbox"], + "gcs": ["gcsfs"], + "git": ["pygit2"], + "github": ["requests"], + "gs": ["gcsfs"], 
+ "hdfs": ["pyarrow"], + "http": ["requests", "aiohttp"], + "sftp": ["paramiko"], + "s3": ["s3fs"], + "smb": ["smbprotocol"], + "ssh": ["paramiko"], + }, zip_safe=False, ) diff -Nru fsspec-0.6.1/tox.ini fsspec-0.8.4/tox.ini --- fsspec-0.6.1/tox.ini 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/tox.ini 2020-10-14 16:51:19.000000000 +0000 @@ -1,6 +1,6 @@ # content of: tox.ini , put in same dir as setup.py [tox] -envlist = {py35,py36,py37} +envlist = {py36,py37,py38} [core] conda_channels= @@ -12,17 +12,26 @@ requests zstandard python-snappy + aiohttp lz4 distributed dask pyarrow + panel + notebook + pygit2 + git + s3fs pyftpdlib cloudpickle pytest + pytest-benchmark pytest-cov fusepy==3.0.1 + msgpack-python<1.0.0 deps= hadoop-test-cluster==0.1.0 + smbprotocol [dev] conda_deps= @@ -41,6 +50,7 @@ {[core]deps} commands = py.test -v -r s +passenv = TRAVIS [testenv:coverage] description=Run test suite with coverage enabled. @@ -53,6 +63,7 @@ {[core]deps} commands = py.test --cov=fsspec -v -r s +passenv = TRAVIS [testenv:dev] description=Setup conda dev env under '.tox/dev'. @@ -74,7 +85,7 @@ skip_install=True conda_deps= {[dev]conda_deps} -deps= +deps= {[dev]deps} commands_pre= pre-commit install --install-hooks @@ -83,19 +94,14 @@ [testenv:s3fs] description=Run s3fs (@master) test suite against fsspec. +extras=s3 conda_channels= - defaults conda-forge + defaults conda_deps= {[core]conda_deps} - boto3 - botocore httpretty moto - six - mock -deps= - {[core]deps} changedir=.tox/s3fs/tmp whitelist_externals= rm @@ -111,14 +117,12 @@ [testenv:gcsfs] description=Run gcsfs (@master) test suite against fsspec. +extras=gcs conda_channels= defaults conda-forge conda_deps= {[core]conda_deps} - requests - decorator - google-auth deps= {[core]deps} vcrpy diff -Nru fsspec-0.6.1/.travis.yml fsspec-0.8.4/.travis.yml --- fsspec-0.6.1/.travis.yml 2019-11-27 17:48:41.000000000 +0000 +++ fsspec-0.8.4/.travis.yml 2020-10-14 16:51:19.000000000 +0000 @@ -7,9 +7,9 @@ language: generic env: - - TOXENV=py35 - TOXENV=py36 - TOXENV=py37 + - TOXENV=py38 - TOXENV=coverage - TOXENV=lint - TOXENV=s3fs