diff -Nru python-fastparquet-2023.10.1/.github/workflows/main.yaml python-fastparquet-2024.2.0/.github/workflows/main.yaml
--- python-fastparquet-2023.10.1/.github/workflows/main.yaml	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/.github/workflows/main.yaml	2024-02-07 18:38:43.000000000 +0000
@@ -120,7 +120,7 @@
       - name: pip-install
         shell: bash -l {0}
         run: |
-          pip install 'Cython<3'
+          pip install Cython
           pip install hypothesis
           pip install pytest-localserver pytest-xdist pytest-asyncio
           pip install -e . --no-deps  # Install fastparquet
diff -Nru python-fastparquet-2023.10.1/.github/workflows/test_wheel.yaml python-fastparquet-2024.2.0/.github/workflows/test_wheel.yaml
--- python-fastparquet-2023.10.1/.github/workflows/test_wheel.yaml	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/.github/workflows/test_wheel.yaml	2024-02-07 18:38:43.000000000 +0000
@@ -38,7 +38,7 @@
           fetch-depth: 0

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: "3.11"

@@ -50,7 +50,7 @@

       - name: Add msbuild to PATH
         if: runner.os == 'Windows'
-        uses: microsoft/setup-msbuild@v1.3
+        uses: microsoft/setup-msbuild@v2

       - name: delvewheel install
         if: runner.os == 'Windows'
@@ -58,7 +58,7 @@
           python -m pip install delvewheel cython

       - name: Build wheels
-        uses: joerick/cibuildwheel@v2.16.2
+        uses: joerick/cibuildwheel@v2.16.5

       - name: Install wheels
         shell: bash -l {0}
diff -Nru python-fastparquet-2023.10.1/.github/workflows/wheel.yml python-fastparquet-2024.2.0/.github/workflows/wheel.yml
--- python-fastparquet-2023.10.1/.github/workflows/wheel.yml	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/.github/workflows/wheel.yml	2024-02-07 18:38:43.000000000 +0000
@@ -32,9 +32,9 @@
           fetch-depth: 0

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.11"
+          python-version: "3.12"

       - name: Set up QEMU
         if: runner.os == 'Linux'
@@ -44,7 +44,7 @@

       - name: Add msbuild to PATH
         if: runner.os == 'Windows'
-        uses: microsoft/setup-msbuild@v1
+        uses: microsoft/setup-msbuild@v2

       - name: delvewheel install
         if: runner.os == 'Windows'
@@ -52,7 +52,7 @@
           python -m pip install delvewheel cython

       - name: Build wheels
-        uses: joerick/cibuildwheel@v2.16.2
+        uses: joerick/cibuildwheel@v2.16.5

       - uses: actions/upload-artifact@v3
         with:
@@ -87,9 +87,9 @@
           fetch-depth: 0

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.11"
+          python-version: "3.12"

       - name: Set up QEMU
         if: runner.os == 'Linux'
@@ -99,7 +99,7 @@

       - name: Add msbuild to PATH
         if: runner.os == 'Windows'
-        uses: microsoft/setup-msbuild@v1
+        uses: microsoft/setup-msbuild@v2

       - name: delvewheel install
         if: runner.os == 'Windows'
@@ -107,7 +107,7 @@
           python -m pip install delvewheel cython

       - name: Build wheels
-        uses: joerick/cibuildwheel@v2.16.2
+        uses: joerick/cibuildwheel@v2.16.5

       - uses: actions/upload-artifact@v3
         with:
@@ -142,9 +142,9 @@
           fetch-depth: 0

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.11"
+          python-version: "3.12"

       - name: Set up QEMU
         if: runner.os == 'Linux'
@@ -154,7 +154,7 @@

       - name: Add msbuild to PATH
         if: runner.os == 'Windows'
-        uses: microsoft/setup-msbuild@v1
+        uses: microsoft/setup-msbuild@v2

       - name: delvewheel install
         if: runner.os == 'Windows'
@@ -162,7 +162,7 @@
           python -m pip install delvewheel cython

       - name: Build wheels
-        uses: joerick/cibuildwheel@v2.16.2
+        uses: joerick/cibuildwheel@v2.16.5

       - uses: actions/upload-artifact@v3
         with:
@@ -197,9 +197,9 @@
           fetch-depth: 0

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.11"
+          python-version: "3.12"

       - name: Set up QEMU
         if: runner.os == 'Linux'
@@ -209,7 +209,7 @@

       - name: Add msbuild to PATH
         if: runner.os == 'Windows'
-        uses: microsoft/setup-msbuild@v1
+        uses: microsoft/setup-msbuild@v2

       - name: delvewheel install
         if: runner.os == 'Windows'
@@ -217,7 +217,7 @@
           python -m pip install delvewheel cython

       - name: Build wheels
-        uses: joerick/cibuildwheel@v2.16.2
+        uses: joerick/cibuildwheel@v2.16.5

       - uses: actions/upload-artifact@v3
         with:
@@ -246,12 +246,12 @@
           fetch-depth: 0

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.11"
+          python-version: "3.12"

       - name: Build wheels
-        uses: joerick/cibuildwheel@v2.16.2
+        uses: joerick/cibuildwheel@v2.16.5

       - uses: actions/upload-artifact@v3
         with:
diff -Nru python-fastparquet-2023.10.1/debian/changelog python-fastparquet-2024.2.0/debian/changelog
--- python-fastparquet-2023.10.1/debian/changelog	2023-11-28 11:09:24.000000000 +0000
+++ python-fastparquet-2024.2.0/debian/changelog	2024-02-28 18:14:19.000000000 +0000
@@ -1,3 +1,12 @@
+python-fastparquet (2024.2.0-1) unstable; urgency=medium
+
+  * New upstream release.
+  * Bug fix: "FTBFS: Python.h: No such file or directory", thanks to
+    Sebastian Ramacher (Closes: #1063598).
+  * Use --with=numpy3 to fix missing dependency.
+
+ -- Roland Mas  Wed, 28 Feb 2024 19:14:19 +0100
+
 python-fastparquet (2023.10.1-2) unstable; urgency=medium

   * Source-only upload for migration to testing.
diff -Nru python-fastparquet-2023.10.1/debian/control python-fastparquet-2024.2.0/debian/control
--- python-fastparquet-2023.10.1/debian/control	2023-11-15 15:41:45.000000000 +0000
+++ python-fastparquet-2024.2.0/debian/control	2024-02-28 18:06:49.000000000 +0000
@@ -21,6 +21,7 @@
  python3-numpy,
  cython3,
  python3-dev,
+ libpython3-all-dev,
  python3-pandas,
  python3-fsspec,
  python3-cramjam,
diff -Nru python-fastparquet-2023.10.1/debian/rules python-fastparquet-2024.2.0/debian/rules
--- python-fastparquet-2023.10.1/debian/rules	2023-11-15 15:38:06.000000000 +0000
+++ python-fastparquet-2024.2.0/debian/rules	2024-02-28 18:13:31.000000000 +0000
@@ -2,6 +2,6 @@
 export PYBUILD_NAME=fastparquet

 %:
-	dh $@ --with python3 --buildsystem=pybuild
+	dh $@ --with python3,numpy3 --buildsystem=pybuild

 override_dh_auto_test:
diff -Nru python-fastparquet-2023.10.1/fastparquet/__init__.py python-fastparquet-2024.2.0/fastparquet/__init__.py
--- python-fastparquet-2023.10.1/fastparquet/__init__.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/__init__.py	2024-02-07 18:38:43.000000000 +0000
@@ -1,8 +1,8 @@
 """parquet - read parquet files."""
-from ._version import __version__
-from .writer import write, update_file_custom_metadata
-from . import core, schema, converted_types, api
-from .api import ParquetFile
-from .util import ParquetException
+from fastparquet._version import __version__
+from fastparquet.writer import write, update_file_custom_metadata
+from fastparquet import core, schema, converted_types, api
+from fastparquet.api import ParquetFile
+from fastparquet.util import ParquetException
diff -Nru python-fastparquet-2023.10.1/fastparquet/api.py python-fastparquet-2024.2.0/fastparquet/api.py
--- python-fastparquet-2023.10.1/fastparquet/api.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/api.py	2024-02-07 18:38:43.000000000 +0000
@@ -6,16 +6,15 @@
 import numpy as np
 import fsspec
-from fastparquet.util import join_path

 import pandas as pd

-from . import core, schema, converted_types, encoding, dataframe, writer
-from . import parquet_thrift
-from .cencoding import ThriftObject, from_buffer
-from .json import json_decoder
-from .util import (default_open, default_remove, ParquetException, val_to_num,
+from fastparquet import core, schema, converted_types, encoding, dataframe, writer
+from fastparquet import parquet_thrift
+from fastparquet.cencoding import ThriftObject, from_buffer
+from fastparquet.json import json_decoder
+from fastparquet.util import (default_open, default_remove, ParquetException, val_to_num,
                    ops, ensure_bytes, ensure_str, check_column_names, metadata_from_many,
-                   ex_from_sep, _strip_path_tail, get_fs, PANDAS_VERSION)
+                   ex_from_sep, _strip_path_tail, get_fs, PANDAS_VERSION, join_path)


 # Find in names of partition files the integer matching "**part.*.parquet",
@@ -380,6 +379,9 @@
             size = rg.num_rows
         df, assign = self.pre_allocate(
                 size, columns, categories, index)
+        if "PANDAS_ATTRS" in self.key_value_metadata:
+            import json
+            df.attrs = json.loads(self.key_value_metadata["PANDAS_ATTRS"])
         ret = True

         f = infile or self.open(fn, mode='rb')
@@ -765,6 +767,10 @@
         size = sum(rg.num_rows for rg in rgs)
         selected = [None] * len(rgs)  # just to fill zip, below
         df, views = self.pre_allocate(size, columns, categories, index, dtypes=dtypes)
+        if "PANDAS_ATTRS" in self.key_value_metadata:
+            import json
+            df.attrs = json.loads(self.key_value_metadata["PANDAS_ATTRS"])
+
         start = 0
         if self.file_scheme == 'simple':
             infile = self.open(self.fn, 'rb')
@@ -959,10 +965,11 @@
                 dt = md[col]["numpy_type"]
             if tz is not None and tz.get(col, False):
                 z = dataframe.tz_to_dt_tz(tz[col])
-                if PANDAS_VERSION.major >= 2:
-                    dt = pd.Series([], dtype=dt).dt.tz_convert(z).dtype
+                dt_series = pd.Series([], dtype=dt)
+                if PANDAS_VERSION.major >= 2 and dt_series.dt.tz is not None:
+                    dt = dt_series.dt.tz_convert(z).dtype
                 else:
-                    dt = pd.Series([], dtype=dt).dt.tz_localize(z).dtype
+                    dt = dt_series.dt.tz_localize(z).dtype
             dtype[col] = dt
         elif dt in converted_types.nullable:
             if self.pandas_metadata:
diff -Nru python-fastparquet-2023.10.1/fastparquet/cencoding.pyx python-fastparquet-2024.2.0/fastparquet/cencoding.pyx
--- python-fastparquet-2023.10.1/fastparquet/cencoding.pyx	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/cencoding.pyx	2024-02-07 18:38:43.000000000 +0000
@@ -214,22 +214,30 @@


 cdef void delta_read_bitpacked(NumpyIO file_obj, uint8_t bitwidth,
-                               NumpyIO o, uint64_t count, uint8_t itemsize=4):
+                               NumpyIO o, uint64_t count, uint8_t longval=0):
     cdef:
         uint64_t data = 0
-        int8_t stop = -bitwidth
+        int8_t left = 0
+        int8_t right = 0
         uint64_t mask = 0XFFFFFFFFFFFFFFFF >> (64 - bitwidth)
     while count > 0:
-        if stop < 0:
-            data = ((data & 0X00FFFFFFFFFFFFFF) << 8) | file_obj.read_byte()
-            stop += 8
+        if (left - right) < bitwidth:
+            data = data | (file_obj.read_byte() << left)
+            left += 8
+        elif right > 8:
+            data >>= 8
+            left -= 8
+            right -= 8
         else:
-            o.write_int((data >> stop) & mask)
-            stop -= bitwidth
+            if longval:
+                o.write_long((data >> right) & mask)
+            else:
+                o.write_int((data >> right) & mask)
+            right += bitwidth
             count -= 1


-cpdef void delta_binary_unpack(NumpyIO file_obj, NumpyIO o):
+cpdef void delta_binary_unpack(NumpyIO file_obj, NumpyIO o, uint8_t longval=0):
     cdef:
         uint64_t block_size = read_unsigned_var_int(file_obj)
         uint64_t miniblock_per_block = read_unsigned_var_int(file_obj)
@@ -248,19 +256,27 @@
                 temp = o.loc
                 if count > 1:
                     # no more diffs if on last value
-                    delta_read_bitpacked(file_obj, bitwidth, o, values_per_miniblock, count)
+                    delta_read_bitpacked(file_obj, bitwidth, o, values_per_miniblock, longval)
                 o.loc = temp
                 for j in range(values_per_miniblock):
-                    temp = o.read_int()
-                    o.loc -= 4
-                    o.write_int(value)
+                    if longval:
+                        temp = o.read_long()
+                        o.loc -= 8
+                        o.write_long(value)
+                    else:
+                        temp = o.read_int()
+                        o.loc -= 4
+                        o.write_int(value)
                     value += min_delta + temp
                     count -= 1
                     if count <= 0:
                         return
             else:
                 for j in range(values_per_miniblock):
-                    o.write_int(value)
+                    if longval:
+                        o.write_long(value)
+                    else:
+                        o.write_int(value)
                     value += min_delta
                     count -= 1
                     if count <= 0:
@@ -372,6 +388,20 @@
         (<int32_t*> self.get_pointer())[0] = i
         self.loc += 4

+    cdef void write_long(self, int64_t i):
+        if self.nbytes - self.loc < 8:
+            return
+        (<int64_t*> self.get_pointer())[0] = i
+        self.loc += 8
+
+    cdef int64_t read_long(self):
+        cdef int64_t i
+        if self.nbytes - self.loc < 8:
+            return 0
+        i = (<int64_t*> self.get_pointer())[0]
+        self.loc += 8
+        return i
+
     cdef void write_many(self, char b, int32_t count):
         cdef int32_t i
         for i in range(count):
diff -Nru python-fastparquet-2023.10.1/fastparquet/compression.py python-fastparquet-2024.2.0/fastparquet/compression.py
--- python-fastparquet-2023.10.1/fastparquet/compression.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/compression.py	2024-02-07 18:38:43.000000000 +0000
@@ -1,7 +1,7 @@
 import cramjam
 import numpy as np

-from . import parquet_thrift
+from fastparquet import parquet_thrift


 # TODO: use stream/direct-to-buffer conversions instead of memcopy
diff -Nru python-fastparquet-2023.10.1/fastparquet/converted_types.py python-fastparquet-2024.2.0/fastparquet/converted_types.py
--- python-fastparquet-2023.10.1/fastparquet/converted_types.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/converted_types.py	2024-02-07 18:38:43.000000000 +0000
@@ -10,9 +10,9 @@
 import numpy as np
 import pandas as pd

-from . import parquet_thrift
-from .cencoding import time_shift
-from .json import json_decoder
+from fastparquet import parquet_thrift
+from fastparquet.cencoding import time_shift
+from fastparquet.json import json_decoder

 logger = logging.getLogger('parquet')  # pylint: disable=invalid-name
diff -Nru python-fastparquet-2023.10.1/fastparquet/core.py python-fastparquet-2024.2.0/fastparquet/core.py
--- python-fastparquet-2023.10.1/fastparquet/core.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/core.py	2024-02-07 18:38:43.000000000 +0000
@@ -1,17 +1,16 @@
-import warnings
 import numpy as np
 import pandas as pd

-from . import encoding
-from .encoding import read_plain
+from fastparquet import encoding
+from fastparquet.encoding import read_plain
 import fastparquet.cencoding as encoding
-from .compression import decompress_data, rev_map, decom_into
-from .converted_types import convert, simple, converts_inplace
-from .schema import _is_list_like, _is_map_like
-from .speedups import unpack_byte_array
-from . import parquet_thrift
-from .cencoding import ThriftObject, read_thrift
-from .util import val_to_num, ex_from_sep
+from fastparquet.compression import decompress_data, rev_map, decom_into
+from fastparquet.converted_types import convert, simple, converts_inplace
+from fastparquet.schema import _is_list_like, _is_map_like
+from fastparquet.speedups import unpack_byte_array
+from fastparquet import parquet_thrift
+from fastparquet.cencoding import ThriftObject
+from fastparquet.util import val_to_num


 def _read_page(file_obj, page_header, column_metadata):
@@ -157,13 +156,17 @@
             o = encoding.NumpyIO(values)
             encoding.read_rle_bit_packed_hybrid(
                     io_obj, bit_width, io_obj.len-io_obj.tell(), o=o, itemsize=1)
-            values = values.data[:nval]
+            if isinstance(values, np.ndarray):
+                values = values[:nval]
+            else:
+                values = values.data[:nval]
         else:
             values = np.zeros(nval, dtype=np.int8)
     elif daph.encoding == parquet_thrift.Encoding.DELTA_BINARY_PACKED:
-        values = np.empty(daph.num_values - num_nulls, dtype=np.int32)
+        values = np.empty(daph.num_values - num_nulls,
+                          dtype=np.int64 if metadata.type == 2 else np.int32)
         o = encoding.NumpyIO(values.view('uint8'))
-        encoding.delta_binary_unpack(io_obj, o)
+        encoding.delta_binary_unpack(io_obj, o, longval=metadata.type == 2)
     else:
         raise NotImplementedError('Encoding %s' % daph.encoding)
     return definition_levels, repetition_levels, values[:nval]
diff -Nru python-fastparquet-2023.10.1/fastparquet/dataframe.py python-fastparquet-2024.2.0/fastparquet/dataframe.py
--- python-fastparquet-2023.10.1/fastparquet/dataframe.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/dataframe.py	2024-02-07 18:38:43.000000000 +0000
@@ -11,7 +11,7 @@
 from pandas.core.arrays.masked import BaseMaskedDtype
 import warnings

-from .util import PANDAS_VERSION
+from fastparquet.util import PANDAS_VERSION


 class Dummy(object):
@@ -107,7 +107,7 @@
                 # funky pandas not-dtype
                 t = t.base
             if ("M" in str(t) or "time" in str(t)) and "[" not in str(t):
-                t = t + "[ns]"
+                t = str(t) + "[ns]"
             d = np.empty(0, dtype=t)
             if d.dtype.kind == "M" and str(col) in timezones:
                 try:
diff -Nru python-fastparquet-2023.10.1/fastparquet/encoding.py python-fastparquet-2024.2.0/fastparquet/encoding.py
--- python-fastparquet-2023.10.1/fastparquet/encoding.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/encoding.py	2024-02-07 18:38:43.000000000 +0000
@@ -1,8 +1,8 @@
 """encoding.py - methods for reading parquet encoded data blocks."""
 import numpy as np

-from .cencoding import read_bitpacked1, NumpyIO
-from .speedups import unpack_byte_array
-from . import parquet_thrift
+from fastparquet.cencoding import read_bitpacked1, NumpyIO
+from fastparquet.speedups import unpack_byte_array
+from fastparquet import parquet_thrift


 def read_plain_boolean(raw_bytes, count, out=None):
diff -Nru python-fastparquet-2023.10.1/fastparquet/parquet_thrift/__init__.py python-fastparquet-2024.2.0/fastparquet/parquet_thrift/__init__.py
--- python-fastparquet-2023.10.1/fastparquet/parquet_thrift/__init__.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/parquet_thrift/__init__.py	2024-02-07 18:38:43.000000000 +0000
@@ -4,5 +4,7 @@

 def __getattr__(name):
     # for compatability with coe that calls, e.g., parquet_thrift.RowGroup(...)
-    from ..cencoding import ThriftObject
-    return partial(ThriftObject.from_fields, thrift_name=name)
+    from fastparquet.cencoding import ThriftObject
+    if name[0].isupper():
+        return partial(ThriftObject.from_fields, thrift_name=name)
+    raise AttributeError(name)
diff -Nru python-fastparquet-2023.10.1/fastparquet/schema.py python-fastparquet-2024.2.0/fastparquet/schema.py
--- python-fastparquet-2023.10.1/fastparquet/schema.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/schema.py	2024-02-07 18:38:43.000000000 +0000
@@ -1,7 +1,7 @@
 """Utils for working with the parquet thrift models."""
 from collections import OrderedDict

-from . import parquet_thrift
+from fastparquet import parquet_thrift


 def schema_tree(schema, i=0):
diff -Nru python-fastparquet-2023.10.1/fastparquet/test/test_api.py python-fastparquet-2024.2.0/fastparquet/test/test_api.py
--- python-fastparquet-2023.10.1/fastparquet/test/test_api.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/test/test_api.py	2024-02-07 18:38:43.000000000 +0000
@@ -9,7 +9,7 @@
 import fsspec
 import numpy as np
 import pandas as pd
-from pandas._testing import makeMixedDataFrame
+from .util import makeMixedDataFrame
 try:
     from pandas.tslib import Timestamp
 except ImportError:
diff -Nru python-fastparquet-2023.10.1/fastparquet/test/test_encoding.py python-fastparquet-2024.2.0/fastparquet/test/test_encoding.py
--- python-fastparquet-2023.10.1/fastparquet/test/test_encoding.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/test/test_encoding.py	2024-02-07 18:38:43.000000000 +0000
@@ -165,8 +165,8 @@
     # one and only miniblock
     cencoding.encode_unsigned_varint(zigzag(-2), o)  # minimum delta (zigzag)
     o.write_byte(2)  # bit-width list (only one)
-    o.write_byte(0b00000011)  # [0, 0, 0, 3]
-    o.write_byte(0b11111100)  # [3, 3, 3, pad]
+    o.write_byte(0b11000000)  # rev([0, 0, 0, 3])
+    o.write_byte(0b00111111)  # rev([3, 3, 3, pad])

     o.seek(0)

diff -Nru python-fastparquet-2023.10.1/fastparquet/test/test_output.py python-fastparquet-2024.2.0/fastparquet/test/test_output.py
--- python-fastparquet-2023.10.1/fastparquet/test/test_output.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/test/test_output.py	2024-02-07 18:38:43.000000000 +0000
@@ -8,7 +8,7 @@
 from fastparquet import ParquetFile
 from fastparquet import write, parquet_thrift, update_file_custom_metadata
 from fastparquet import writer, encoding
-from pandas._testing import makeMixedDataFrame
+from .util import makeMixedDataFrame
 from pandas.testing import assert_frame_equal
 from pandas.api.types import CategoricalDtype
 import pytest
@@ -1206,3 +1206,13 @@
     df.to_parquet(path=fn, engine="fastparquet")
     df2 = pd.read_parquet(fn, engine="fastparquet")
     assert df.to_dict() == df2.to_dict()
+
+
+def test_attrs_roundtrip(tempdir):
+    fn = os.path.join(tempdir, "out.parq")
+    attrs = {"oi": 5}
+    df = pd.DataFrame({"A": np.array([[1.1, 1.2], [], None], dtype=object)})
+    df.attrs = attrs
+    df.to_parquet(path=fn, engine="fastparquet")
+    df2 = pd.read_parquet(fn, engine="fastparquet")
+    assert df2.attrs == attrs
diff -Nru python-fastparquet-2023.10.1/fastparquet/test/test_pd_optional_types.py python-fastparquet-2024.2.0/fastparquet/test/test_pd_optional_types.py
--- python-fastparquet-2023.10.1/fastparquet/test/test_pd_optional_types.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/test/test_pd_optional_types.py	2024-02-07 18:38:43.000000000 +0000
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 from pandas.testing import assert_frame_equal
+from pandas.core.arrays import IntegerArray
 import fastparquet as fp
 from .util import tempdir
 from fastparquet import write, parquet_thrift
@@ -10,61 +11,51 @@
 import numpy.random as random


-EXPECTED_SERIES_INT8 = pd.Series(random.uniform(low=-128, high=127,size=100)).round()
-EXPECTED_SERIES_INT16 = pd.Series(random.uniform(low=-32768, high=32767,size=100)).round()
-EXPECTED_SERIES_INT32 = pd.Series(random.uniform(low=-2147483648, high=2147483647,size=100)).round()
-EXPECTED_SERIES_INT64 = pd.Series(random.uniform(low=-9223372036854775808, high=9223372036854775807,size=100)).round()
-EXPECTED_SERIES_UINT8 = pd.Series(random.uniform(low=0, high=255,size=100)).round()
-EXPECTED_SERIES_UINT16 = pd.Series(random.uniform(low=0, high=65535,size=100)).round()
-EXPECTED_SERIES_UINT32 = pd.Series(random.uniform(low=0, high=4294967295,size=100)).round()
-EXPECTED_SERIES_UINT64 = pd.Series(random.uniform(low=0, high=18446744073709551615,size=100)).round()
-EXPECTED_SERIES_BOOL = pd.Series(random.choice([False, True], 100))
-EXPECTED_SERIES_STRING = pd.Series(random.choice([
+EXPECTED_SERIES_INT8 = random.uniform(low=-128, high=127, size=100).round()
+EXPECTED_SERIES_INT16 = random.uniform(low=-32768, high=32767, size=100).round()
+EXPECTED_SERIES_INT32 = random.uniform(low=-2147483648, high=2147483647, size=100).round()
+EXPECTED_SERIES_INT64 = random.uniform(low=-9223372036854775808, high=9223372036854775807, size=100).round()
+EXPECTED_SERIES_UINT8 = random.uniform(low=0, high=255, size=100).round()
+EXPECTED_SERIES_UINT16 = random.uniform(low=0, high=65535, size=100).round()
+EXPECTED_SERIES_UINT32 = random.uniform(low=0, high=4294967295, size=100).round()
+EXPECTED_SERIES_UINT64 = random.uniform(low=0, high=18446744073709551615, size=100).round()
+EXPECTED_SERIES_BOOL = random.choice([False, True], 100)
+EXPECTED_SERIES_STRING = random.choice([
     'You', 'are', 'my', 'fire',
     'The', 'one', 'desire',
     'Believe', 'when', 'I', 'say',
     'I', 'want', 'it', 'that', 'way'
-    ], 100))
+    ], 100)

-EXPECTED_SERIES_INT8.loc[20:30] = np.nan
-EXPECTED_SERIES_INT16.loc[20:30] = np.nan
-EXPECTED_SERIES_INT32.loc[20:30] = np.nan
-EXPECTED_SERIES_INT64.loc[20:30] = np.nan
-EXPECTED_SERIES_UINT8.loc[20:30] = np.nan
-EXPECTED_SERIES_UINT16.loc[20:30] = np.nan
-EXPECTED_SERIES_UINT32.loc[20:30] = np.nan
-EXPECTED_SERIES_UINT64.loc[20:30] = np.nan
-EXPECTED_SERIES_BOOL.loc[20:30] = np.nan
-EXPECTED_SERIES_STRING.loc[20:30] = np.nan
+EXPECTED_SERIES_INT8[20:30] = np.nan
+EXPECTED_SERIES_INT16[20:30] = np.nan
+EXPECTED_SERIES_INT32[20:30] = np.nan
+EXPECTED_SERIES_INT64[20:30] = np.nan
+EXPECTED_SERIES_UINT8[20:30] = np.nan
+EXPECTED_SERIES_UINT16[20:30] = np.nan
+EXPECTED_SERIES_UINT32[20:30] = np.nan
+EXPECTED_SERIES_UINT64[20:30] = np.nan
+EXPECTED_SERIES_BOOL[20:30] = np.nan
+EXPECTED_SERIES_STRING[20:30] = np.nan
+mask = EXPECTED_SERIES_UINT64 > -1

 TEST = pd.DataFrame({
-    'int8': EXPECTED_SERIES_INT8.astype('Int8'),
-    'int16': EXPECTED_SERIES_INT16.astype('Int16'),
-    'int32': EXPECTED_SERIES_INT32.astype('Int32'),
-    'int64': EXPECTED_SERIES_INT64.astype('Int64'),
-    'uint8': EXPECTED_SERIES_UINT8.astype('UInt8'),
-    'uint16': EXPECTED_SERIES_UINT16.astype('UInt16'),
-    'uint32': EXPECTED_SERIES_UINT32.astype('UInt32'),
-    'uint64': EXPECTED_SERIES_UINT64.astype('UInt64'),
-    'bool': EXPECTED_SERIES_BOOL.astype('boolean'),
-    'string': EXPECTED_SERIES_STRING.astype('string')
+    'int8': pd.Series(pd.array(EXPECTED_SERIES_INT8, dtype='Int8')),
+    'int16': pd.Series(pd.array(EXPECTED_SERIES_INT16, dtype='Int16')),
+    'int32': pd.Series(pd.array(EXPECTED_SERIES_INT32, dtype='Int32')),
+    'int64': pd.Series(pd.array(EXPECTED_SERIES_INT64, dtype='Int64')),
+    'uint8': pd.Series(pd.array(EXPECTED_SERIES_UINT8, dtype='UInt8')),
+    'uint16': pd.Series(pd.array(EXPECTED_SERIES_UINT16, dtype='UInt16')),
+    'uint32': pd.Series(pd.array(EXPECTED_SERIES_UINT32, dtype='UInt32')),
+    'uint64': pd.Series(pd.array(EXPECTED_SERIES_UINT64, dtype='UInt64')),
+    'bool': pd.Series(pd.array(EXPECTED_SERIES_BOOL, dtype='boolean')),
+    'string': pd.Series(EXPECTED_SERIES_STRING, dtype='string')
 })

-EXPECTED = pd.DataFrame({
-    'int8': EXPECTED_SERIES_INT8.astype('float16'),
-    'int16': EXPECTED_SERIES_INT16.astype('float32'),
-    'int32': EXPECTED_SERIES_INT32.astype('float64'),
-    'int64': EXPECTED_SERIES_INT64.astype('float64'),
-    'uint8': EXPECTED_SERIES_UINT8.astype('float16'),
-    'uint16': EXPECTED_SERIES_UINT16.astype('float32'),
-    'uint32': EXPECTED_SERIES_UINT32.astype('float64'),
-    'uint64': EXPECTED_SERIES_UINT64.astype('float64'),
-    'bool': EXPECTED_SERIES_BOOL.astype('float16'),
-    'string': EXPECTED_SERIES_STRING
-})
+EXPECTED = TEST


 EXPECTED_PARQUET_TYPES = {
@@ -80,7 +71,8 @@
     'string': 'BYTE_ARRAY'
 }

-@pytest.mark.parametrize('comp', (None,'snappy', 'gzip'))
+
+@pytest.mark.parametrize('comp', (None, 'snappy', 'gzip'))
 @pytest.mark.parametrize('scheme', ('simple', 'hive'))
 def test_write_nullable_columns(tempdir, scheme, comp):
     fname = os.path.join(tempdir, 'test_write_nullable_columns.parquet')
diff -Nru python-fastparquet-2023.10.1/fastparquet/test/test_read.py python-fastparquet-2024.2.0/fastparquet/test/test_read.py
--- python-fastparquet-2023.10.1/fastparquet/test/test_read.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/test/test_read.py	2024-02-07 18:38:43.000000000 +0000
@@ -581,3 +581,8 @@
         "k_int": 1,
         "k_bool": True,
     }
+
+def test_reading_timezone():
+    fn = os.path.join(TEST_DATA, "test-timezone.parquet")
+    pf = fastparquet.ParquetFile(fn)
+    assert pf.dtypes['date'] == 'datetime64[ns, UTC]'
\ No newline at end of file
diff -Nru python-fastparquet-2023.10.1/fastparquet/test/util.py python-fastparquet-2024.2.0/fastparquet/test/util.py
--- python-fastparquet-2023.10.1/fastparquet/test/util.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/test/util.py	2024-02-07 18:38:43.000000000 +0000
@@ -4,6 +4,8 @@
 import tempfile
 import shutil

+import pandas as pd
+
 TEST_DATA = "test-data"

 port = 5555
@@ -76,3 +78,18 @@
     yield d
     if os.path.exists(d):
         shutil.rmtree(d, ignore_errors=True)
+
+
+
+def makeMixedDataFrame():
+    index = pd.Index(["a", "b", "c", "d", "e"], name="index")
+
+    data = {
+        "A": pd.Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float64"),
+        "B": pd.Series([0.0, 1.0, 0.0, 1.0, 0.0], dtype="float64"),
+        "C": pd.Series(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype='object'),
+        "D": pd.bdate_range("1/1/2009", periods=5),
+    }
+    return pd.DataFrame(data=data)
+
+
diff -Nru python-fastparquet-2023.10.1/fastparquet/thrift_structures.py python-fastparquet-2024.2.0/fastparquet/thrift_structures.py
--- python-fastparquet-2023.10.1/fastparquet/thrift_structures.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/thrift_structures.py	2024-02-07 18:38:43.000000000 +0000
@@ -1,2 +1,5 @@
-from . import parquet_thrift
-from .cencoding import ThriftObject
+from fastparquet import parquet_thrift
+from fastparquet.cencoding import ThriftObject
+
+
+__all__ = ["ThriftObject", "parquet_thrift"]
diff -Nru python-fastparquet-2023.10.1/fastparquet/util.py python-fastparquet-2024.2.0/fastparquet/util.py
--- python-fastparquet-2023.10.1/fastparquet/util.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/util.py	2024-02-07 18:38:43.000000000 +0000
@@ -14,8 +14,8 @@

 import fsspec

-from . import parquet_thrift
-from .cencoding import ThriftObject
+from fastparquet import parquet_thrift
+from fastparquet.cencoding import ThriftObject
 from fastparquet import __version__

 PANDAS_VERSION = Version(pd.__version__)
@@ -297,7 +297,7 @@
         Thrift object or parquet file which metadata is to update.
     custom_metadata : dict
         Key-value metadata to update in thrift object.
-
+        The values must be strings or binary. To pass a dictionary, serialize it as json string then encode it in binary.
     Notes
     -----
     Key-value metadata are expected binary encoded. This function ensures it
@@ -305,6 +305,10 @@
     """
     kvm = (obj.key_value_metadata if isinstance(obj, ThriftObject)
            else obj.fmd.key_value_metadata)
+
+    if kvm is None:
+        kvm = []
+
     # Spare list of keys.
     kvm_keys = [item.key for item in kvm]
     for key, value in custom_metadata.items():
diff -Nru python-fastparquet-2023.10.1/fastparquet/writer.py python-fastparquet-2024.2.0/fastparquet/writer.py
--- python-fastparquet-2023.10.1/fastparquet/writer.py	2023-10-26 18:42:23.000000000 +0000
+++ python-fastparquet-2024.2.0/fastparquet/writer.py	2024-02-07 18:38:43.000000000 +0000
@@ -11,19 +11,17 @@

 from fastparquet.util import join_path

-from . import parquet_thrift
-from .api import ParquetFile, partitions, part_ids
-from .compression import compress_data
-from .converted_types import tobson
-from .json import json_encoder
-from .util import (default_open, default_mkdirs, check_column_names,
+from fastparquet import parquet_thrift, __version__, cencoding
+from fastparquet.api import ParquetFile, partitions, part_ids
+from fastparquet.compression import compress_data
+from fastparquet.converted_types import tobson
+from fastparquet.json import json_encoder
+from fastparquet.util import (default_open, default_mkdirs, check_column_names,
                    created_by, get_column_metadata, norm_col_name,
                    path_string, reset_row_idx, get_fs, update_custom_metadata)
-from . import __version__
-from .speedups import array_encode_utf8, pack_byte_array
-from . import cencoding
-from .cencoding import NumpyIO, ThriftObject, from_buffer
+from fastparquet.speedups import array_encode_utf8, pack_byte_array
+from fastparquet.cencoding import NumpyIO, ThriftObject, from_buffer
 from decimal import Decimal

 MARKER = b'PAR1'
@@ -1243,6 +1241,9 @@
     --------
     >>> fastparquet.write('myfile.parquet', df)  # doctest: +SKIP
     """
+    custom_metadata = custom_metadata or {}
+    if getattr(data, "attrs", None):
+        custom_metadata["PANDAS_ATTRS"] = json.dumps(data.attrs)
     if file_scheme not in ('simple', 'hive', 'drill'):
         raise ValueError('File scheme should be simple|hive|drill, not '
                          f'{file_scheme}.')
@@ -1305,7 +1306,7 @@
             object_encoding=object_encoding, times=times,
             index_cols=index_cols, partition_cols=partition_on,
             cols_dtype=cols_dtype)
-    if custom_metadata is not None:
+    if custom_metadata:
         kvm = fmd.key_value_metadata or []
         kvm.extend(
             [
@@ -1608,6 +1609,7 @@
         Local path to file.
     custom_metadata : dict
         Key-value metadata to update in thrift object.
+        The values must be strings or binary. To pass a dictionary, serialize it as json string then encode it in binary.
     is_metadata_file : bool, default None
         Define if target file is a pure metadata file, or is a parquet data
         file. If `None`, is set depending file name.
Binary files /tmp/tmp3gswtb4_/QHrXAuLxR9/python-fastparquet-2023.10.1/test-data/test-timezone.parquet and /tmp/tmp3gswtb4_/qCypudxk85/python-fastparquet-2024.2.0/test-data/test-timezone.parquet differ
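The writer.py and api.py hunks above make pandas DataFrame .attrs round-trip through the parquet key-value metadata under a "PANDAS_ATTRS" key, as exercised by the new test_attrs_roundtrip. A minimal sketch of the behaviour this release enables (the file name is illustrative):

# Sketch of the .attrs round-trip added by the writer.py/api.py hunks above;
# the file name is illustrative.
import pandas as pd

df = pd.DataFrame({"A": [1.0, 2.0, 3.0]})
df.attrs = {"source": "sensor-7", "units": "kPa"}

# write() stores df.attrs as JSON under the "PANDAS_ATTRS" key-value entry
df.to_parquet("out.parq", engine="fastparquet")

# reading restores attrs from the stored key-value metadata
df2 = pd.read_parquet("out.parq", engine="fastparquet")
assert df2.attrs == {"source": "sensor-7", "units": "kPa"}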
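The cencoding.pyx and core.py hunks add a longval path so that DELTA_BINARY_PACKED pages with physical type INT64 (metadata.type == 2) are decoded into 64-bit buffers rather than truncated into int32. A sketch of a file that exercises this path, assuming a pyarrow version recent enough to support the column_encoding option:

# Sketch: write a DELTA_BINARY_PACKED INT64 column with pyarrow, then read it
# back with fastparquet, exercising the new longval decode path. Assumes a
# pyarrow with per-column encoding support; the file name is illustrative.
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import ParquetFile

table = pa.table({"x": np.arange(10_000, dtype="int64") * 123456789})
pq.write_table(
    table,
    "delta64.parq",
    use_dictionary=False,  # DELTA encoding cannot be combined with dictionary
    column_encoding={"x": "DELTA_BINARY_PACKED"},
)

df = ParquetFile("delta64.parq").to_pandas()
assert (df["x"].to_numpy() == table["x"].to_numpy()).all()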
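The test_encoding.py hunk flips the hand-packed test bytes because the rewritten delta_read_bitpacked consumes bits least-significant-bit first, the bit order Parquet specifies for bit-packed miniblocks. A pure-Python illustration of that bit order (not the Cython implementation itself), unpacking the test's two bytes at bit width 2:

# Illustration only: DELTA_BINARY_PACKED miniblock values are bit-packed
# LSB-first, which is why the test bytes above are bit-reversed.
def unpack_lsb_first(buf: bytes, bitwidth: int, count: int):
    data, left, out = 0, 0, []
    mask = (1 << bitwidth) - 1
    it = iter(buf)
    for _ in range(count):
        while left < bitwidth:          # refill from the next byte
            data |= next(it) << left
            left += 8
        out.append(data & mask)         # low bits come out first
        data >>= bitwidth
        left -= bitwidth
    return out

# bytes from the test: rev([0, 0, 0, 3]) and rev([3, 3, 3, pad])
assert unpack_lsb_first(bytes([0b11000000, 0b00111111]), 2, 8) == [0, 0, 0, 3, 3, 3, 3, 0]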
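The docstring additions in util.py and writer.py spell out that custom key-value metadata values must be strings or binary, so a dict has to be JSON-encoded first. A short sketch against the file written above (the key name is illustrative):

# Sketch: metadata values must be str or bytes, so a dict is JSON-encoded
# before being stored; the key name is illustrative.
import json
from fastparquet import update_file_custom_metadata

pipeline_info = {"stage": "clean", "version": 3}
update_file_custom_metadata(
    "out.parq",
    {"pipeline": json.dumps(pipeline_info).encode()},
)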