diff --git a/MANIFEST.in b/MANIFEST.in
index e470bfc..b7eb7b5 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,4 +5,4 @@ include *.pyx
 recursive-include pyreadstat *.pyx
 include *.pxd
 recursive-include pyreadstat *.pxd
-
+recursive-include pyreadstat py.typed
diff --git a/how_to_test.md b/how_to_test.md
index b3c8bea..e870f79 100644
--- a/how_to_test.md
+++ b/how_to_test.md
@@ -1,11 +1,35 @@
+# How to test pyreadstat
+
+## Dependencies for testing
+
+Additional dependencies for testing can be installed with:
+
+```shell
+pip install --group dev --group test
+```
+
+## Running tests
+
 If you have installed pyreadstat on your environment, enter this folder and do:
 
-```python
+```shell
 python3 tests/test_basic.py
 ```
 
-If you have built in place do
+If you have built in place, do:
 
-```
+```shell
 python3 tests/test_basic.py --inplace
-```
\ No newline at end of file
+```
+
+Type hint tests can be run with:
+
+```shell
+pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini
+```
+
+To run all tests in place, do:
+
+```shell
+python tests/test_basic.py --inplace && python tests/test_narwhalified.py --inplace --backend=pandas && python tests/test_narwhalified.py --inplace --backend=polars && pytest tests/test_http_integration.py && pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini
+```
diff --git a/pyproject.toml b/pyproject.toml
index bc1f36d..17092d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,3 +5,22 @@ requires = [
   "cython"
 ]
 build-backend = "setuptools.build_meta"
+
+[dependency-groups]
+dev = [
+    "setuptools>=80.0.0",
+    "numpy>=2.0.0",
+    "pandas>=2.0.0",
+    "polars>=1.30.0",
+    "cython>=3.0.0",
+    "narwhals>=2.10.1",
+]
+test = [
+    "pytest>=8.0.0",
+    "mypy>=1.20.0",
+    "pytest-mypy-plugins>=4.0.0",
+    "pandas-stubs>=2.0.0",
+    "pandas>=2.0.0",
+    "polars>=1.30.0",
+    "narwhals>=2.10.1",
+]
diff --git a/pyreadstat/__init__.py b/pyreadstat/__init__.py
index 8ecd1c4..89474f9 100644
--- a/pyreadstat/__init__.py
+++ b/pyreadstat/__init__.py
@@ -14,12 +14,32 @@
 # limitations under the License.
 # #############################################################################
-from .pyreadstat import read_sav, read_sas7bdat, read_xport, read_dta, read_sav, read_por, read_sas7bcat
+
+from .pyreadstat import read_sav, read_sas7bdat, read_xport, read_dta, read_por, read_sas7bcat
 from .pyreadstat import write_sav, write_dta, write_xport, write_por
 from .pyreadstat import read_file_in_chunks, read_file_multiprocessing
 from .pyclasses import metadata_container
-from ._readstat_parser import ReadstatError
+from ._readstat_parser import ReadstatError, PyreadstatError
 from .pyfunctions import set_value_labels, set_catalog_to_sas
 
 __version__ = "1.3.4"
+
+__all__ = (
+    "read_sav",
+    "read_sas7bdat",
+    "read_xport",
+    "read_dta",
+    "read_por",
+    "read_sas7bcat",
+    "write_sav",
+    "write_dta",
+    "write_xport",
+    "write_por",
+    "read_file_in_chunks",
+    "read_file_multiprocessing",
+    "metadata_container",
+    "ReadstatError",
+    "PyreadstatError",
+    "set_value_labels",
+    "set_catalog_to_sas",
+)
\ No newline at end of file
diff --git a/pyreadstat/py.typed b/pyreadstat/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py
index 8ba08b4..11121db 100644
--- a/pyreadstat/pyclasses.py
+++ b/pyreadstat/pyclasses.py
@@ -14,31 +14,60 @@
 # limitations under the License.
 # #############################################################################
+# Typing
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Literal, TypedDict
+
+
+class MissingRange(TypedDict):
+    """A dictionary to hold the definition of a missing range"""
+
+    lo: float
+    hi: float
+
+
+class MRSet(TypedDict):
+    """A dictionary to hold the definition of a multiple-response (MR) set"""
+
+    type: Literal["D", "C"]
+    is_dichotomy: bool
+    counted_value: int | None
+    label: str
+    variable_list: list[str]
+
+
+# Classes
+
+
+@dataclass
 class metadata_container:
     """
     This class holds metadata we want to give back to python
     """
 
-    def __init__(self):
-        self.column_names = list()
-        self.column_labels = list()
-        self.column_names_to_labels = dict()
-        self.file_encoding = None
-        self.number_columns = None
-        self.number_rows = None
-        self.variable_value_labels = dict()
-        self.value_labels = dict()
-        self.variable_to_label = dict()
-        self.notes = list()
-        self.original_variable_types = dict()
-        self.readstat_variable_types = dict()
-        self.table_name = None
-        self.missing_ranges = dict()
-        self.missing_user_values = dict()
-        self.variable_storage_width = dict()
-        self.variable_display_width = dict()
-        self.variable_alignment = dict()
-        self.variable_measure = dict()
-        self.creation_time = None
-        self.modification_time = None
-        self.mr_sets = dict()
+    column_names: list[str] = field(default_factory=list)
+    column_labels: list[str] = field(default_factory=list)
+    column_names_to_labels: dict[str, str] = field(default_factory=dict)
+    file_encoding: str | None = None
+    file_format: str | None = None  # set on every read by the read_* functions, e.g. "sav/zsav" or "dta"
+    file_label: str | None = None
+    number_columns: int | None = None
+    number_rows: int | None = None
+    variable_value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict)
+    value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict)
+    variable_to_label: dict[str, str] = field(default_factory=dict)
+    notes: list[str] = field(default_factory=list)
+    original_variable_types: dict[str, str] = field(default_factory=dict)
+    readstat_variable_types: dict[str, str] = field(default_factory=dict)
+    table_name: str | None = None
+    missing_ranges: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict)
+    missing_user_values: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict)
+    variable_storage_width: dict[str, int] = field(default_factory=dict)
+    variable_display_width: dict[str, int] = field(default_factory=dict)
+    variable_alignment: dict[str, str] = field(default_factory=dict)
+    variable_measure: dict[str, Literal["nominal", "ordinal", "scale", "unknown"]] = field(default_factory=dict)
+    creation_time: datetime | None = None
+    modification_time: datetime | None = None
+    mr_sets: dict[str, MRSet] = field(default_factory=dict)
diff --git a/pyreadstat/pyfunctions.py b/pyreadstat/pyfunctions.py
index 30cdb04..a64301e 100644
--- a/pyreadstat/pyfunctions.py
+++ b/pyreadstat/pyfunctions.py
@@ -1,14 +1,24 @@
 """
 Functions written in pure python
 """
-from copy import deepcopy, copy
+
+from copy import deepcopy
 import warnings
 
 import narwhals.stable.v2 as nw
+from narwhals.typing import IntoDataFrameT
+
+from .pyclasses import metadata_container
 
 # Functions to deal with value labels
 
-def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_ordered_category=False):
+
+def set_value_labels(
+    dataframe: IntoDataFrameT,
+    metadata: metadata_container,
+    formats_as_category: bool = True,
+
formats_as_ordered_category: bool = False, +) -> IntoDataFrameT: """ Changes the values in the dataframe according to the value formats in the metadata. It will return a copy of the dataframe. If no appropiate formats were found, the result will be an unchanged copy @@ -16,20 +26,20 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o Parameters ---------- - dataframe : pandas dataframe + dataframe : pandas or polars dataframe resulting from parsing a file metadata : dictionary resulting from parsing a file formats_as_category : bool, optional - defaults to True. If True the variables having formats will be transformed into pandas categories. + defaults to True. If True the variables having formats will be transformed into pandas or polars categories. formats_as_ordered_category : bool, optional - defaults to False. If True the variables having formats will be transformed into pandas ordered categories. + defaults to False. If True the variables having formats will be transformed into pandas or polars ordered categories. it has precedence over formats_as_category, meaning if this is True, it will take effect irrespective of the value of formats_as_category. Returns ------- - df_copy : pandas dataframe + df_copy : pandas or polars dataframe a copy of the original dataframe with the values changed, if appropiate formats were found, unaltered otherwise """ @@ -51,20 +61,29 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o if uval not in labels: labels[uval] = uval # if all values are null, there will be nothing to replace. However we cannot do replace_strict on null dtype, it raises an error - if not df_copy.implementation.is_pandas() and (len(df_copy[var_name])==df_copy[var_name].null_count()): + if not df_copy.implementation.is_pandas() and ( + len(df_copy[var_name]) == df_copy[var_name].null_count() + ): continue # replace_strict requires that all the values are in the map. Could not get map_batches or when/then/otherwise to work - elif not df_copy.implementation.is_pandas() and (df_copy[var_name].dtype==nw.Object or not all([type(v)==type(list(labels.values())[0]) for v in labels.values() if v is not None])): + elif not df_copy.implementation.is_pandas() and ( + df_copy[var_name].dtype == nw.Object + or not all( + [type(v) == type(list(labels.values())[0]) for v in labels.values() if v is not None] + ) + ): # polars is very difficult to convince to mix strings and numbers, so we have to do it this way temp = [labels[x] for x in df_copy[var_name]] - newser = nw.new_series(name=var_name, values= temp, dtype=nw.Object, backend=df_copy.implementation) + newser = nw.new_series( + name=var_name, values=temp, dtype=nw.Object, backend=df_copy.implementation + ) df_copy = df_copy.with_columns(newser.alias(var_name)) if formats_as_category or formats_as_ordered_category: msg = f"You requested formats_as_category=True or formats_as_ordered_category=True, but it was not possible to cast variable '{var_name}' to category" warnings.warn(msg, RuntimeWarning) continue # not sure if we get into this situation ever or what would exactly happen, maybe this is not needed? 
- elif not df_copy.implementation.is_pandas() and df_copy[var_name].dtype==nw.Unknown: + elif not df_copy.implementation.is_pandas() and df_copy[var_name].dtype == nw.Unknown: msg = f"It was not possible to apply value formats to variable '{var_name}' due to unknown/not supported data type" warnings.warn(msg, RuntimeWarning) continue @@ -74,7 +93,7 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o categories = list(set(labels.values())) original_values = list(labels.keys()) original_values.sort() - revdict= dict() + revdict = dict() for orival in original_values: curcat = labels.get(orival) if not revdict.get(curcat): @@ -84,11 +103,16 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o elif formats_as_category: df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Categorical)) - return df_copy.to_native() -def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as_category=True, - formats_as_ordered_category=False): + +def set_catalog_to_sas( + sas_dataframe: IntoDataFrameT, + sas_metadata: metadata_container, + catalog_metadata: metadata_container, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, +) -> tuple[IntoDataFrameT, metadata_container]: """ Changes the values in the dataframe and sas_metadata according to the formats in the catalog. It will return a copy of the dataframe and metadata. If no appropriate formats were found, the result will @@ -96,22 +120,22 @@ def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as Parameters ---------- - sas_dataframe : pandas dataframe + sas_dataframe : pandas or polars dataframe resulting from parsing a sas7bdat file sas_metadata : pyreadstat metadata object resulting from parsing a sas7bdat file catalog_metadata : pyreadstat metadata object resulting from parsing a sas7bcat (catalog) file formats_as_category : bool, optional - defaults to True. If True the variables having formats will be transformed into pandas categories. + defaults to True. If True the variables having formats will be transformed into pandas or polars categories. formats_as_ordered_category : bool, optional - defaults to False. If True the variables having formats will be transformed into pandas ordered categories. + defaults to False. If True the variables having formats will be transformed into pandas or polars ordered categories. it has precedence over formats_as_category, meaning if this is True, it will take effect irrespective of the value of formats_as_category. 
Returns ------- - df_copy : pandas dataframe + df_copy : pandas or polars dataframe a copy of the original dataframe with the values changed, if appropriate formats were found, unaltered otherwise metadata : dict @@ -122,8 +146,12 @@ def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as catalog_metadata_copy = deepcopy(catalog_metadata) metadata = deepcopy(sas_metadata) metadata.value_labels = catalog_metadata_copy.value_labels - df_copy = set_value_labels(sas_dataframe, metadata, formats_as_category=formats_as_category, - formats_as_ordered_category=formats_as_ordered_category) + df_copy = set_value_labels( + sas_dataframe, + metadata, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) variable_value_labels = dict() for var_name, var_label in metadata.variable_to_label.items(): @@ -133,9 +161,8 @@ def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as metadata.variable_value_labels = variable_value_labels else: - #df_copy = sas_dataframe.copy() + # df_copy = sas_dataframe.copy() df_copy = nw.from_native(sas_dataframe).clone().to_native() metadata = deepcopy(sas_metadata) return df_copy, metadata - diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 3038b05..d94f742 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -14,28 +14,146 @@ # limitations under the License. # ############################################################################# +from collections.abc import Callable, Iterator import multiprocessing as mp from itertools import chain +from os import PathLike +from typing import TYPE_CHECKING, Any, Concatenate, Literal, TypeAlias, overload, Protocol import narwhals.stable.v2 as nw -from ._readstat_parser import parser_entry_point -from ._readstat_writer import writer_entry_point, PyreadstatError +from ._readstat_parser import parser_entry_point, PyreadstatError +from ._readstat_writer import writer_entry_point from .worker import worker +from .pyclasses import metadata_container, MissingRange from .pyfunctions import set_value_labels, set_catalog_to_sas +# Typing interface + +if TYPE_CHECKING: + # Setup type aliases for the public interface. + # These are not executed at runtime, but they help type checkers understand + # the expected types of the public functions and classes. + + # Since pyreadstat can work with both pandas and polars, we define a DataFrame type that can be either. + try: + from pandas import DataFrame as PandasDataFrame # type: ignore + except ImportError: + # Define a dummy DataFrame class to avoid accepting any type as PandasDataFrame when pandas is not installed + class PandasDataFrame: + pass + + try: + from polars import DataFrame as PolarsDataFrame # type: ignore + except ImportError: + # Define a dummy DataFrame class to avoid accepting any type as PolarsDataFrame when polars is not installed + class PolarsDataFrame: + pass + +DataFrame: TypeAlias = "PandasDataFrame | PolarsDataFrame" # Define type at runtime for introspection + +class FileLike(Protocol): + """Protocol for file-like objects accepted by pyreadstat""" + + # Should work with any file-like object that has read and seek methods, such as those returned by open() or io.BytesIO + def read(self, size: int | None = -1, /) -> bytes: ... + def seek(self, pos: int, whence: int = 0, /) -> int: ... 
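+
+# Hedged illustration (editorial; io is stdlib, not part of this API): any
+# object whose read/seek methods match structurally satisfies FileLike, e.g.
+#
+#     import io
+#     buf: FileLike = io.BytesIO(b"\x00")  # accepted by a type checker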
+ + +FilePathLike: TypeAlias = str | bytes | PathLike[str] | PathLike[bytes] +FilePathorBuffer: TypeAlias = FilePathLike | FileLike + +DictOutput: TypeAlias = dict[str, list[Any]] + +PyreadstatReadFunction: TypeAlias = Callable[ + Concatenate[FilePathorBuffer, ...], "tuple[DataFrame | DictOutput, metadata_container]" +] + + # Public interface # Parsing functions -def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=False, catalog_file=None, - formats_as_category=True, formats_as_ordered_category=False, encoding=None, usecols=None, user_missing=False, - disable_datetime_conversion=False, row_limit=0, row_offset=0, output_format=None, - extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): + +@overload +def read_sas7bdat( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + catalog_file: FilePathorBuffer | None = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_sas7bdat( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + catalog_file: FilePathorBuffer | None = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... +@overload +def read_sas7bdat( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + catalog_file: FilePathorBuffer | None = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> tuple[DictOutput, metadata_container]: ... +def read_sas7bdat( + filename_path: FilePathorBuffer, + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + catalog_file: FilePathorBuffer | None = None, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, + user_missing: bool = False, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SAS sas7bdat file. 
     It accepts the path to a sas7bcat.
-
+
     Parameters
     ----------
     filename_path : str, bytes, Path-like object or file-like object
@@ -45,8 +163,8 @@ def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=Fa
         metadata object. The data frame will be set with the correct column names but no data.
     dates_as_pandas_datetime : bool, optional
         by default False. If true dates will be transformed to pandas datetime64 instead of date, effective only for pandas.
-    catalog_file : str, optional
-        path to a sas7bcat file. By default is None. If not None, will parse the catalog file and replace the values
+    catalog_file : str, bytes, Path-like object or file-like object, optional
+        path to a sas7bcat file or file-like object. By default it is None. If not None, will parse the catalog file and replace the values
         by the formats in the catalog, if any appropiate is found. If this is not the behavior you are looking for,
         Use read_sas7bcat to parse the catalog independently of the sas7bdat and set_catalog_to_sas to apply the
         resulting format into sas7bdat files.
@@ -79,7 +197,7 @@ def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=Fa
         start reading rows after this offset. By default 0, meaning start with the first row not skipping anything.
     output_format : str, optional
         one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the
-        user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a 
+        user can then convert it to her preferred data format. Using dict is faster than the other types, as the conversion to a
         dataframe is avoided.
     extra_datetime_formats: list of str, optional
        formats to be parsed as python datetime objects
@@ -87,7 +205,7 @@ def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=Fa
        formats to be parsed as python date objects
     extra_time_formats: list of str, optional
        formats to be parsed as python time objects
-
+
     Returns
     -------
@@ -99,27 +217,99 @@ def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=Fa
     data_frame : dataframe
         a dataframe with the data
     metadata :
         object with metadata.
         Look at the documentation for more information.
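+
+    Examples
+    --------
+    A minimal sketch; "/tmp/sample.sas7bdat" is a hypothetical path, not a
+    file shipped with the library::
+
+        df, meta = read_sas7bdat("/tmp/sample.sas7bdat")
+        print(meta.number_rows, meta.column_names)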
""" parser_format = "sas7bdat" - data_frame, metadata = parser_entry_point(filename_path, parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - encoding=encoding, usecols=usecols, user_missing=user_missing, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + encoding=encoding, + usecols=usecols, + user_missing=user_missing, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format if catalog_file: - _ , catalog = read_sas7bcat(catalog_file, encoding=encoding) - data_frame, metadata = set_catalog_to_sas(data_frame, metadata, catalog, formats_as_category=formats_as_category, - formats_as_ordered_category=formats_as_ordered_category) + _, catalog = read_sas7bcat(catalog_file, encoding=encoding) + data_frame, metadata = set_catalog_to_sas( + data_frame, + metadata, + catalog, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) return data_frame, metadata -def read_xport(filename_path, metadataonly=False, dates_as_pandas_datetime=False, encoding=None, - usecols=None, disable_datetime_conversion=False, row_limit=0, row_offset=0, - output_format=None, extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): +@overload +def read_xport( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_xport( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... 
+@overload +def read_xport( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> tuple[DictOutput, metadata_container]: ... +def read_xport( + filename_path: FilePathorBuffer, + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SAS xport file. @@ -150,7 +340,7 @@ def read_xport(filename_path, metadataonly=False, dates_as_pandas_datetime=False start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -167,22 +357,102 @@ def read_xport(filename_path, metadataonly=False, dates_as_pandas_datetime=False object with metadata. Look at the documentation for more information. 
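+
+    Examples
+    --------
+    A minimal sketch; "/tmp/sample.xpt" is a hypothetical path. With
+    output_format="polars" the overloads above type the result as a polars
+    DataFrame::
+
+        df, meta = read_xport("/tmp/sample.xpt", output_format="polars")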
""" parser_format = "xport" - data_frame, metadata = parser_entry_point(filename_path, parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - encoding=encoding, usecols=usecols, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + encoding=encoding, + usecols=usecols, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format return data_frame, metadata -def read_dta(filename_path, metadataonly=False, dates_as_pandas_datetime=False, apply_value_formats=False, - formats_as_category=True, formats_as_ordered_category=False, encoding=None, usecols=None, user_missing=False, - disable_datetime_conversion=False, row_limit=0, row_offset=0, output_format=None, - extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): +@overload +def read_dta( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_dta( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... +@overload +def read_dta( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> tuple[DictOutput, metadata_container]: ... 
+def read_dta( + filename_path: FilePathorBuffer, + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + apply_value_formats: bool = False, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, + user_missing: bool = False, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a STATA dta file @@ -227,7 +497,7 @@ def read_dta(filename_path, metadataonly=False, dates_as_pandas_datetime=False, start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -244,27 +514,113 @@ def read_dta(filename_path, metadataonly=False, dates_as_pandas_datetime=False, object with metadata. Look at the documentation for more information. """ parser_format = "dta" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - encoding=encoding, usecols=usecols, user_missing=user_missing, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + encoding=encoding, + usecols=usecols, + user_missing=user_missing, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format if apply_value_formats: - data_frame = set_value_labels(data_frame, metadata, formats_as_category=formats_as_category, - formats_as_ordered_category=formats_as_ordered_category) + data_frame = set_value_labels( + data_frame, + metadata, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) return data_frame, metadata -def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False, apply_value_formats=False, - formats_as_category=True, formats_as_ordered_category=False, encoding=None, usecols=None, user_missing=False, - disable_datetime_conversion=False, row_limit=0, row_offset=0, 
output_format=None, extra_datetime_formats=None, - extra_date_formats=None, extra_time_formats=None): +@overload +def read_sav( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_sav( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... +@overload +def read_sav( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> tuple[DictOutput, metadata_container]: ... +def read_sav( + filename_path: FilePathorBuffer, + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + apply_value_formats: bool = False, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, + user_missing: bool = False, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SPSS sav or zsav (compressed) files @@ -309,7 +665,7 @@ def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False, start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. 
extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -327,27 +683,105 @@ def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False, """ parser_format = "sav/zsav" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - encoding=encoding, usecols=usecols, user_missing=user_missing, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + encoding=encoding, + usecols=usecols, + user_missing=user_missing, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format if apply_value_formats: - data_frame = set_value_labels(data_frame, metadata, formats_as_category=formats_as_category, - formats_as_ordered_category=formats_as_ordered_category) + data_frame = set_value_labels( + data_frame, + metadata, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) return data_frame, metadata -def read_por(filename_path, metadataonly=False, dates_as_pandas_datetime=False, apply_value_formats=False, - formats_as_category=True, formats_as_ordered_category=False, usecols=None, - disable_datetime_conversion=False, row_limit=0, row_offset=0, output_format=None, - extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): +@overload +def read_por( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_por( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... 
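+# Hedged type-checking sketch ("/tmp/sample.por" is a hypothetical path): a
+# checker narrows the frame type of read_por from output_format, e.g.
+#
+#     df, meta = read_por("/tmp/sample.por", output_format="polars")
+#     reveal_type(df)  # polars.DataFrame, when polars is installed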
+@overload +def read_por( + filename_path: FilePathorBuffer, + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> tuple[DictOutput, metadata_container]: ... +def read_por( + filename_path: FilePathorBuffer, + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + apply_value_formats: bool = False, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + usecols: list[str] | None = None, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SPSS por file. Files are assumed to be UTF-8 encoded, the encoding cannot be set to other. @@ -386,7 +820,7 @@ def read_por(filename_path, metadataonly=False, dates_as_pandas_datetime=False, start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -403,23 +837,54 @@ def read_por(filename_path, metadataonly=False, dates_as_pandas_datetime=False, object with metadata. Look at the documentation for more information. 
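+
+    Examples
+    --------
+    A minimal sketch; "/tmp/sample.por" is a hypothetical path::
+
+        df, meta = read_por("/tmp/sample.por", apply_value_formats=True)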
""" parser_format = "por" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - usecols=usecols, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + usecols=usecols, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format - + if apply_value_formats: data_frame = set_value_labels(data_frame, metadata, formats_as_category=formats_as_category) return data_frame, metadata -def read_sas7bcat(filename_path, encoding=None, output_format=None): +@overload +def read_sas7bcat( + filename_path: FilePathorBuffer, + encoding: str | None = ..., + output_format: Literal["pandas"] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_sas7bcat( + filename_path: FilePathorBuffer, + encoding: str | None = ..., + output_format: Literal["polars"] = "polars", +) -> "tuple[PolarsDataFrame, metadata_container]": ... +@overload +def read_sas7bcat( + filename_path: FilePathorBuffer, + encoding: str | None = ..., + output_format: Literal["dict"] = "dict", +) -> tuple[DictOutput, metadata_container]: ... +def read_sas7bcat( + filename_path: FilePathorBuffer, + encoding: str | None = None, + output_format: Literal["pandas", "polars", "dict"] | None = None, +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SAS sas7bcat file. The returning dataframe will be empty. The metadata object will contain a dictionary value_labels that contains the formats. When parsing the sas7bdat file, in the metadata, the dictionary @@ -437,7 +902,7 @@ def read_sas7bcat(filename_path, encoding=None, output_format=None): Defaults to None. If set, the system will use the defined encoding instead of guessing it. It has to be an iconv-compatible name output_format : str, optional - one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned. + one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned. Notice that for this function the resulting object is always empty, this is done for consistency with other functions but has no impact on performance. @@ -450,23 +915,78 @@ def read_sas7bcat(filename_path, encoding=None, output_format=None): Look at the documentation for more information. 
""" parser_format = "sas7bcat" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - encoding=encoding, - output_format=output_format, - ) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + encoding=encoding, + output_format=output_format, + ) metadata.file_format = parser_format return data_frame, metadata + # convenience functions to read in chunks -def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, limit=0, - multiprocess=False, num_processes=4, num_rows=None, **kwargs): + +@overload +def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + chunksize: int = ..., + offset: int = ..., + limit: int = ..., + multiprocess: bool = ..., + num_processes: int = ..., + num_rows: int | None = ..., + *, + output_format: Literal["pandas"] | None = ..., + **kwargs: Any, +) -> "Iterator[tuple[PandasDataFrame, metadata_container]]": ... +@overload +def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + chunksize: int = ..., + offset: int = ..., + limit: int = ..., + multiprocess: bool = ..., + num_processes: int = ..., + num_rows: int | None = ..., + *, + output_format: Literal["polars"] = "polars", + **kwargs: Any, +) -> "Iterator[tuple[PolarsDataFrame, metadata_container]]": ... +@overload +def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + chunksize: int = ..., + offset: int = ..., + limit: int = ..., + multiprocess: bool = ..., + num_processes: int = ..., + num_rows: int | None = ..., + *, + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> Iterator[tuple[DictOutput, metadata_container]]: ... +def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + chunksize: int = 100000, + offset: int = 0, + limit: int = 0, + multiprocess: bool = False, + num_processes: int = 4, + num_rows: int | None = None, + **kwargs: Any, +) -> "Iterator[tuple[DataFrame | DictOutput, metadata_container]]": """ Returns a generator that will allow to read a file in chunks. - If using multiprocessing, for Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, + If using multiprocessing, for Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, the parameter num_rows must be set to a number equal or larger than the number of rows in the dataset. That information must be obtained by the user before running this function. @@ -474,7 +994,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li ---------- read_function : pyreadstat function a pyreadstat reading function - file_path : string + file_path : str, bytes or Path-like object path to the file to be read chunksize : integer, optional size of the chunks to read @@ -488,7 +1008,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li in case multiprocess is true, how many workers/processes to spawn? num_rows: integer, optional number of rows in the dataset. If using multiprocessing it is obligatory for files where - the number of rows cannot be obtained from the medatata, such as por and + the number of rows cannot be obtained from the medatata, such as por and some defective xport and sav files. The user must obtain this value by reading the file without multiprocessing first or any other means. 
A number larger than the actual number of rows will work as well. Discarded if the number of rows can be obtained from the metadata or not using multiprocessing. @@ -500,7 +1020,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li data_frame : dataframe a dataframe with the data metadata : - object with metadata. + object with metadata. Look at the documentation for more information. it : generator @@ -509,7 +1029,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li if read_function == read_sas7bcat: raise Exception("read_sas7bcat not supported") - + if "row_offset" in kwargs: _ = kwargs.pop("row_offset") @@ -525,7 +1045,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li if not limit: limit = numrows else: - limit = min(offset+limit, numrows) + limit = min(offset + limit, numrows) else: if limit: limit = offset + limit @@ -534,18 +1054,62 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li if limit and (offset >= limit): break if multiprocess: - df, meta = read_file_multiprocessing(read_function, file_path, num_processes=num_processes, - row_offset=offset, row_limit=chunksize, num_rows=num_rows, **kwargs) + df, meta = read_file_multiprocessing( + read_function, + file_path, + num_processes=num_processes, + row_offset=offset, + row_limit=chunksize, + num_rows=num_rows, + **kwargs, + ) else: df, meta = read_function(file_path, row_offset=offset, row_limit=chunksize, **kwargs) if len(df): yield df, meta offset += chunksize -def read_file_multiprocessing(read_function, file_path, num_processes=None, num_rows=None, **kwargs): + +@overload +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + num_processes: int | None = ..., + num_rows: int | None = ..., + *, + output_format: Literal["pandas"] | None = ..., + **kwargs: Any, +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + num_processes: int | None = ..., + num_rows: int | None = ..., + *, + output_format: Literal["polars"] = "polars", + **kwargs: Any, +) -> "tuple[PolarsDataFrame, metadata_container]": ... +@overload +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + num_processes: int | None = ..., + num_rows: int | None = ..., + *, + output_format: Literal["dict"] = "dict", + **kwargs: Any, +) -> tuple[DictOutput, metadata_container]: ... +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: FilePathLike, + num_processes: int | None = None, + num_rows: int | None = None, + **kwargs: Any, +) -> "tuple[DataFrame | DictOutput, metadata_container]": """ Reads a file in parallel using multiprocessing. - For Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, + For Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, the parameter num_rows must be set to a number equal or larger than the number of rows in the dataset. That information must be obtained by the user before running this function. 
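+
+    A hedged usage sketch ("/tmp/big.sav" is a hypothetical path)::
+
+        df, meta = read_file_multiprocessing(read_sav, "/tmp/big.sav", num_processes=4)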
@@ -553,16 +1117,16 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ ---------- read_function : pyreadstat function a pyreadstat reading function - file_path : string + file_path : str, bytes or Path-like object path to the file to be read num_processes : integer, optional number of processes to spawn, by default the min 4 and the max cores on the computer num_rows: integer, optional - number of rows in the dataset. Obligatory for files where the number of rows cannot be obtained from the medatata, such as por and + number of rows in the dataset. Obligatory for files where the number of rows cannot be obtained from the medatata, such as por and some defective xport and sav files. The user must obtain this value by reading the file without multiprocessing first or any other means. A number larger than the actual number of rows will work as well. Discarded if the number of rows can be obtained from the metadata. kwargs : dict, optional - any other keyword argument to pass to the read_function. + any other keyword argument to pass to the read_function. Returns ------- @@ -576,26 +1140,30 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ raise Exception("read_sas7bcat is not supported") if read_function == read_por and num_rows is None: - raise Exception("num_rows must be specified for read_por to be a number equal or larger than the number of rows in the dataset.") + raise Exception( + "num_rows must be specified for read_por to be a number equal or larger than the number of rows in the dataset." + ) if not num_processes: # let's be conservative with the number of workers num_processes = min(mp.cpu_count(), 4) - _ = kwargs.pop('metadataonly', None) + _ = kwargs.pop("metadataonly", None) row_offset = kwargs.pop("row_offset", 0) - row_limit = kwargs.pop("row_limit", float('inf')) + row_limit = kwargs.pop("row_limit", float("inf")) _, meta = read_function(file_path, metadataonly=True, **kwargs) numrows = meta.number_rows if numrows is None: if num_rows is None: - raise Exception("The number of rows of the file cannot be determined from the file's metadata. If you still want to proceed, please set num_rows to a number equal or larger than the number of rows of your data") + raise Exception( + "The number of rows of the file cannot be determined from the file's metadata. 
If you still want to proceed, please set num_rows to a number equal or larger than the number of rows of your data" + ) numrows = num_rows elif numrows == 0: final, meta = read_function(file_path, **kwargs) - numrows = min(max(numrows - row_offset, 0), row_limit) - divs = [numrows // num_processes + (1 if x < numrows % num_processes else 0) for x in range (num_processes)] + numrows = min(max(numrows - row_offset, 0), row_limit) + divs = [numrows // num_processes + (1 if x < numrows % num_processes else 0) for x in range(num_processes)] offsets = list() prev_offset = row_offset prev_div = 0 @@ -613,13 +1181,13 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ finally: pool.close() output_format = kwargs.get("output_format") - if output_format == 'dict': + if output_format == "dict": keys = chunks[0].keys() final = {key: list(chain.from_iterable(chunk[key] for chunk in chunks)) for key in keys} else: - #final = pd.concat(chunks, axis=0, ignore_index=True) + # final = pd.concat(chunks, axis=0, ignore_index=True) chunks = [nw.from_native(x) for x in chunks] - final = nw.concat(chunks, how='vertical') + final = nw.concat(chunks, how="vertical") ispandas = False if final.implementation.is_pandas(): ispandas = True @@ -628,11 +1196,24 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ final = final.reset_index(drop=True) return final, meta + # Write API -def write_sav(df, dst_path, file_label="", column_labels=None, compress=False, row_compress=False, note=None, - variable_value_labels=None, missing_ranges=None, variable_display_width=None, - variable_measure=None, variable_format=None): + +def write_sav( + df: "DataFrame", + dst_path: FilePathLike, + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + compress: bool = False, + row_compress: bool = False, + note: str | list[str] | None = None, + variable_value_labels: dict[str, dict[int | float, str]] | None = None, + missing_ranges: dict[str, list[int | float | str | MissingRange]] | None = None, + variable_display_width: dict[str, int] | None = None, + variable_measure: dict[str, str] | None = None, + variable_format: dict[str, str] | None = None, +) -> None: """ Writes a dataframe to a SPSS sav or zsav file. @@ -640,7 +1221,7 @@ def write_sav(df, dst_path, file_label="", column_labels=None, compress=False, r ---------- df : dataframe dataframe to write to sav or zsav - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result sav or zsav file file_label : str, optional a label for the file @@ -674,27 +1255,47 @@ def write_sav(df, dst_path, file_label="", column_labels=None, compress=False, r sets the measure type for a variable. Must be a dictionary with keys being variable names and values being strings one of "nominal", "ordinal", "scale" or "unknown" (default). variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. 
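+
+    Examples
+    --------
+    A minimal sketch; the frame and path are hypothetical::
+
+        import pandas as pd
+
+        df = pd.DataFrame({"gender": [1, 2, 1]})
+        write_sav(df, "/tmp/out.sav", variable_value_labels={"gender": {1: "male", 2: "female"}})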
""" writer_format = "sav" # formats - formats_presets = {'restricted_integer':'N{var_width}', 'integer':'F{var_width}.0'} + formats_presets = {"restricted_integer": "N{var_width}", "integer": "F{var_width}.0"} if variable_format: for col_name, col_format in variable_format.items(): if col_format in formats_presets.keys() and col_name in df.columns: var_width = str(len(str(max(df[col_name])))) - variable_format[col_name] = formats_presets[col_format].format(var_width=var_width) - - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - compress=compress, row_compress=row_compress, note=note, - variable_value_labels=variable_value_labels, missing_ranges=missing_ranges, variable_display_width=variable_display_width, - variable_measure=variable_measure, variable_format=variable_format) - -def write_dta(df, dst_path, file_label="", column_labels=None, version=15, - variable_value_labels=None, missing_user_values=None, variable_format=None): + variable_format[col_name] = formats_presets[col_format].format(var_width=var_width) + + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + compress=compress, + row_compress=row_compress, + note=note, + variable_value_labels=variable_value_labels, + missing_ranges=missing_ranges, + variable_display_width=variable_display_width, + variable_measure=variable_measure, + variable_format=variable_format, + ) + + +def write_dta( + df: "DataFrame", + dst_path: FilePathLike, + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + version: int = 15, + variable_value_labels: dict[str, dict[int | float, str]] | None = None, + missing_user_values: dict[str, list[str]] | None = None, + variable_format: dict[str, str] | None = None, +) -> None: """ Writes a dataframe to a STATA dta file @@ -702,7 +1303,7 @@ def write_dta(df, dst_path, file_label="", column_labels=None, version=15, ---------- df : dataframe dataframe to write to sav or zsav - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result dta file file_label : str, optional a label for the file @@ -722,20 +1323,34 @@ def write_dta(df, dst_path, file_label="", column_labels=None, version=15, names and values being a list of missing values. Missing values must be a single character between a and z. variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. 
""" writer_format = "dta" - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - version=version, - variable_value_labels=variable_value_labels, - missing_user_values=missing_user_values, - variable_format=variable_format) - -def write_xport(df, dst_path, file_label="", column_labels=None, table_name=None, file_format_version = 8, - variable_format=None): + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + version=version, + variable_value_labels=variable_value_labels, + missing_user_values=missing_user_values, + variable_format=variable_format, + ) + + +def write_xport( + df: "DataFrame", + dst_path: FilePathLike, + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + table_name: str | None = None, + file_format_version: Literal[5, 8] = 8, + variable_format: dict[str, str] | None = None, +) -> None: """ Writes a dataframe to a SAS Xport (xpt) file. If no table_name is specified the dataset has by default the name DATASET (take it into account if @@ -746,7 +1361,7 @@ def write_xport(df, dst_path, file_label="", column_labels=None, table_name=None ---------- df : dataframe dataframe to write to xport - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result xport file file_label : str, optional a label for the file @@ -760,18 +1375,31 @@ def write_xport(df, dst_path, file_label="", column_labels=None, table_name=None file_format_version : int, optional XPORT file version, either 8 or 5, default is 8 variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. """ writer_format = "xport" - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - version=file_format_version, - table_name=table_name, - variable_format=variable_format) - -def write_por(df, dst_path, file_label="", column_labels=None, variable_format=None): + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + version=file_format_version, + table_name=table_name, + variable_format=variable_format, + ) + + +def write_por( + df: "DataFrame", + dst_path: FilePathLike, + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + variable_format: dict[str, str] | None = None, +) -> None: """ Writes a dataframe to a SPSS POR file. @@ -779,7 +1407,7 @@ def write_por(df, dst_path, file_label="", column_labels=None, variable_format=N ---------- df : dataframe data frame to write to por - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result por file file_label : str, optional a label for the file @@ -789,12 +1417,17 @@ def write_por(df, dst_path, file_label="", column_labels=None, variable_format=N In such case there is no need to include all variables; labels for non existent variables will be ignored with no warning or error. variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. 
Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. """ writer_format = "por" - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - variable_format=variable_format) - + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + variable_format=variable_format, + ) diff --git a/pyreadstat/worker.py b/pyreadstat/worker.py index ea003ba..546763c 100644 --- a/pyreadstat/worker.py +++ b/pyreadstat/worker.py @@ -1,4 +1,3 @@ - # ############################################################################# # Copyright 2018 Hoffmann-La Roche # @@ -19,7 +18,16 @@ Functions to work with multiprocessing """ -def worker(inpt): +from os import PathLike +from typing import TYPE_CHECKING, Any, TypeAlias + +if TYPE_CHECKING: + from .pyreadstat import PyreadstatReadFunction, DataFrame, DictOutput + +Input: TypeAlias = "tuple[PyreadstatReadFunction, str | bytes | PathLike, int, int, dict[str, Any]]" + + +def worker(inpt: Input) -> "DataFrame | DictOutput": read_function, path, row_offset, row_limit, kwargs = inpt df, meta = read_function(path, row_offset=row_offset, row_limit=row_limit, **kwargs) - return df \ No newline at end of file + return df diff --git a/setup.py b/setup.py index b805233..475653c 100644 --- a/setup.py +++ b/setup.py @@ -168,6 +168,8 @@ def is_python_lt_14(): ], ext_modules=extensions, packages=["pyreadstat"], + package_data={"pyreadstat": ["py.typed"]}, + include_package_data=True, data_files=data_files, install_requires=['narwhals>=2.10.1', 'numpy'], license="Apache-2.0", diff --git a/tests/test_mypy_setup.ini b/tests/test_mypy_setup.ini new file mode 100644 index 0000000..2868ef8 --- /dev/null +++ b/tests/test_mypy_setup.ini @@ -0,0 +1,4 @@ +[mypy] +ignore_missing_imports = True +strict_optional = True +follow_imports = silent \ No newline at end of file diff --git a/tests/test_typing.yml b/tests/test_typing.yml new file mode 100644 index 0000000..996e25a --- /dev/null +++ b/tests/test_typing.yml @@ -0,0 +1,345 @@ +# requires pytest-mypy-plugins +# command to run: pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini + +# yaml-language-server: $schema=https://raw.githubusercontent.com/typeddjango/pytest-mypy-plugins/master/pytest_mypy_plugins/schema.json + +- case: read_sav_default_types + main: | + from pyreadstat import read_sav + df, meta = read_sav("file.sav") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_sav_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_sav + df: object + df, meta = read_sav("file.sav", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_sav_buffer_types + main: | + import io + from pyreadstat import read_sav + buffer = io.BytesIO() + df, meta = read_sav(buffer) + +- case: read_dta_default_types + main: | + from pyreadstat import read_dta + df, meta = read_dta("file.dta") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed 
type is "pyreadstat.pyclasses.metadata_container" + +- case: read_dta_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_dta + df, meta = read_dta("file.dta", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_dta_buffer_types + main: | + import io + from pyreadstat import read_dta + buffer = io.BytesIO() + df, meta = read_dta(buffer) + +- case: read_por_default_types + main: | + from pyreadstat import read_por + df, meta = read_por("file.por") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_por_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_por + df, meta = read_por("file.por", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_por_buffer_types + main: | + import io + from pyreadstat import read_por + buffer = io.BytesIO() + df, meta = read_por(buffer) + +- case: read_sas7bdat_default_types + main: | + from pyreadstat import read_sas7bdat + df, meta = read_sas7bdat("file.sas7bdat") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_sas7bdat_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_sas7bdat + df, meta = read_sas7bdat("file.sas7bdat", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_sas7bdat_buffer_types + main: | + import io + from pyreadstat import read_sas7bdat + buffer = io.BytesIO() + df, meta = read_sas7bdat(buffer) + +- case: read_xport_default_types + main: | + from pyreadstat import read_xport + df, meta = read_xport("file.xpt") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_xport_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_xport + df, meta = read_xport("file.xpt", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_xport_buffer_types + main: | + import io + from pyreadstat import read_xport + buffer = io.BytesIO() + df, meta = read_xport(buffer) + +- case: read_sas7bcat_default_types + main: | + from pyreadstat import read_sas7bcat + df, meta = read_sas7bcat("file.sas7bcat") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is 
"pyreadstat.pyclasses.metadata_container" + +- case: read_sas7bcat_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_sas7bcat + df, meta = read_sas7bcat("file.sas7bcat", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_sas7bcat_buffer_types + main: | + import io + from pyreadstat import read_sas7bcat + buffer = io.BytesIO() + df, meta = read_sas7bcat(buffer) + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + +- case: read_file_multiprocessing_default_types + main: | + from pyreadstat import read_file_multiprocessing, read_sav + df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True) + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_file_multiprocessing_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_file_multiprocessing, read_sav + df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_file_multiprocessing_invalid_callable + main: | + from pyreadstat import read_file_multiprocessing + def noop(a: int, /) -> int: + return a + read_file_multiprocessing(noop, "file.sav", 1, 1) # ER: Argument 1 to "read_file_multiprocessing" has incompatible type .+ + +- case: read_file_in_chunks_default_types + main: | + from pyreadstat import read_file_in_chunks, read_sav + for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True): + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_file_in_chunks_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "dict[str, list[Any]]" + main: | + from pyreadstat import read_file_in_chunks, read_sav + for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}"): + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_file_in_chunks_invalid_callable + main: | + from pyreadstat import read_file_in_chunks + def noop(a: int, /) -> int: + return a + read_file_in_chunks(noop, "file.sav", 1, 1) # ER: Argument 1 to "read_file_in_chunks" has incompatible type .+ + +- case: write_sav_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_sav + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_sav(pandas_df, "file.sav") + write_sav(polars_df, "file.sav") + write_sav(pandas_df, Path("file.sav")) + +- case: write_dta_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_dta + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + 
write_dta(pandas_df, "file.dta") + write_dta(polars_df, "file.dta") + write_dta(pandas_df, Path("file.dta")) + +- case: write_xport_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_xport + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_xport(pandas_df, "file.xpt") + write_xport(polars_df, "file.xpt") + write_xport(pandas_df, Path("file.xpt")) + +- case: write_por_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_por + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_por(pandas_df, "file.por") + write_por(polars_df, "file.por") + write_por(pandas_df, Path("file.por")) + +- case: set_value_labels_types + parametrized: + - backend: "pandas" + expected_value: "pandas.core.frame.DataFrame" + - backend: "polars" + expected_value: "polars.dataframe.frame.DataFrame" + main: | + import {{ backend }} + from pyreadstat import set_value_labels, metadata_container + df = {{ backend }}.DataFrame() + metadata = metadata_container() + df = set_value_labels(df, metadata) + reveal_type(df) # N: Revealed type is "{{ expected_value }}" + +- case: set_catalog_to_sas_types + parametrized: + - backend: "pandas" + expected_value: "pandas.core.frame.DataFrame" + - backend: "polars" + expected_value: "polars.dataframe.frame.DataFrame" + main: | + import {{ backend }} + from pyreadstat import set_catalog_to_sas, metadata_container + df = {{ backend }}.DataFrame() + sas_metadata = metadata_container() + catalog_metadata = metadata_container() + df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata) + reveal_type(df) # N: Revealed type is "{{ expected_value }}" + +- case: worker_types + main: | + from pyreadstat import read_sav + from pyreadstat.worker import Input, worker + inpt: Input = (read_sav, "test_data/file.sav", 0, 100, {}) + result = worker(inpt) + reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, list[Any]]" + def noop(a: int, /) -> int: + return a + inpt_invalid: Input = (noop, "test_data/file.sav", 0, 100, {}) # ER: Incompatible types in assignment .+ + +- case: metadata_container_types + main: | + from pyreadstat.pyclasses import metadata_container + meta = metadata_container() + meta.missing_ranges = { + "var1": [1, 5], + "var2": [{"hi": 1.0, "lo": 0.0}], + "var3": ["a", "b"], + "var4": {1, 2, 3}, # E: Dict entry 3 has incompatible type "str": "set[int]"; expected "str": "list[int | float | str | MissingRange]" [dict-item] + } + meta.mr_sets = { + "set1": { + "type": "D", + "is_dichotomy": True, + "counted_value": 1, + "label": "Set 1", + "variable_list": ["var1", "var2"], + }, + "set2": { + "type": "C", + "is_dichotomy": False, + "counted_value": None, + "label": "Set 2", + "variable_list": ["var3"], + }, + "set3": {} # E: Missing keys ("counted_value", "is_dichotomy", "label", "type", "variable_list") for TypedDict "MRSet" [typeddict-item] + } + meta.variable_measure = { + "var1": "nominal", + "var2": "ordinal", + "var3": "scale", + "var4": "unknown", + "var5": "another", # E: Dict entry 4 has incompatible type "str": "Literal['another']"; expected "str": "Literal['nominal', 'ordinal', 'scale', 'unknown']" [dict-item] + }
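As a companion to the yaml cases, the same guarantees can be spot-checked by running mypy on a plain script; a minimal sketch, with a hypothetical script name and a placeholder path:

```python
# check_types.py -- hypothetical script; run `mypy check_types.py` to reproduce
# the checks above outside the yaml harness. "file.sav" is a placeholder path.
import pyreadstat
from pyreadstat import metadata_container

df, meta = pyreadstat.read_sav("file.sav")                          # pandas DataFrame
pl_df, _ = pyreadstat.read_sav("file.sav", output_format="polars")  # polars DataFrame

meta2 = metadata_container()
meta2.variable_measure = {"var1": "nominal"}    # accepted by the Literal annotation
# meta2.variable_measure = {"var1": "another"}  # would be flagged as a dict-item error
```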