From 71dc48d3316486d0c970cd4c32c2e54ae5e14b69 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Fri, 6 Feb 2026 20:17:27 +0000 Subject: [PATCH 01/38] first pass at typing public interface --- pyreadstat/pyclasses.py | 76 ++-- pyreadstat/pyfunctions.py | 80 +++- pyreadstat/pyreadstat.py | 803 +++++++++++++++++++++++++++++++------- tests/typing_tests.pyi | 53 +++ 4 files changed, 839 insertions(+), 173 deletions(-) create mode 100644 tests/typing_tests.pyi diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index 8ba08b4..cb17f21 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -14,31 +14,61 @@ # limitations under the License. # ############################################################################# +# Typing + +from typing import TYPE_CHECKING, Literal, TypedDict + +if TYPE_CHECKING: + from datetime import datetime + + +class _MissingRange(TypedDict): + lo: float + hi: float + + +class MRSet(TypedDict): + """A dictionary to hold the definition of a multiple-response (MR) set.""" + + type: Literal["D", "C"] + is_dichotomy: bool + counted_value: int + label: str + variable_list: list[str] + + +# Classes + + class metadata_container: """ This class holds metadata we want to give back to python """ - def __init__(self): - self.column_names = list() - self.column_labels = list() - self.column_names_to_labels = dict() - self.file_encoding = None - self.number_columns = None - self.number_rows = None - self.variable_value_labels = dict() - self.value_labels = dict() - self.variable_to_label = dict() - self.notes = list() - self.original_variable_types = dict() - self.readstat_variable_types = dict() - self.table_name = None - self.missing_ranges = dict() - self.missing_user_values = dict() - self.variable_storage_width = dict() - self.variable_display_width = dict() - self.variable_alignment = dict() - self.variable_measure = dict() - self.creation_time = None - self.modification_time = None - self.mr_sets = dict() + def __init__(self) -> 
None: + self.column_names: list[str] = list() + self.column_labels: list[str] = list() + self.column_names_to_labels: dict[str, str] = dict() + self.file_encoding: str | None = None + self.number_columns: int | None = None + self.number_rows: int | None = None + self.variable_value_labels: dict[str, dict[float | int, str]] = dict() + self.value_labels: dict[str, dict[float | int, str]] = dict() + self.variable_to_label: dict[str, str] = dict() + self.notes: list[str] = list() + self.original_variable_types: dict[str, str] = dict() + self.readstat_variable_types: dict[str, str] = dict() + self.table_name: str | None = None + self.missing_ranges: dict[str, list[int | float | str | _MissingRange]] = dict() + self.missing_user_values: dict[str, list[int | float | str | _MissingRange]] = ( + dict() + ) + self.variable_storage_width: dict[str, int] = dict() + self.variable_display_width: dict[str, int] = dict() + self.variable_alignment: dict[str, str] = dict() + self.variable_measure: dict[ + str, Literal["nominal", "ordinal", "scale", "unknown"] + ] = dict() + self.creation_time: "datetime | None" = None + self.modification_time: "datetime | None" = None + self.mr_sets: dict[str, MRSet] = dict() diff --git a/pyreadstat/pyfunctions.py b/pyreadstat/pyfunctions.py index 30cdb04..ed477cc 100644 --- a/pyreadstat/pyfunctions.py +++ b/pyreadstat/pyfunctions.py @@ -1,14 +1,24 @@ """ Functions written in pure python """ + from copy import deepcopy, copy import warnings import narwhals.stable.v2 as nw +from narwhals.typing import IntoDataFrameT + +from .pyclasses import metadata_container # Functions to deal with value labels -def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_ordered_category=False): + +def set_value_labels( + dataframe: IntoDataFrameT, + metadata: metadata_container, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, +) -> IntoDataFrameT: """ Changes the values in the dataframe according to the value 
formats in the metadata. It will return a copy of the dataframe. If no appropiate formats were found, the result will be an unchanged copy @@ -43,7 +53,10 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o labels = deepcopy(labels) if var_name in df_copy.columns: # unique does not work for polars Object - if not df_copy.implementation.is_pandas() and df_copy[var_name].dtype == nw.Object: + if ( + not df_copy.implementation.is_pandas() + and df_copy[var_name].dtype == nw.Object + ): unvals = list(set(df_copy[var_name].to_list())) else: unvals = df_copy[var_name].unique() @@ -51,44 +64,74 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o if uval not in labels: labels[uval] = uval # if all values are null, there will be nothing to replace. However we cannot do replace_strict on null dtype, it raises an error - if not df_copy.implementation.is_pandas() and (len(df_copy[var_name])==df_copy[var_name].null_count()): + if not df_copy.implementation.is_pandas() and ( + len(df_copy[var_name]) == df_copy[var_name].null_count() + ): continue # replace_strict requires that all the values are in the map. 
Could not get map_batches or when/then/otherwise to work - elif not df_copy.implementation.is_pandas() and (df_copy[var_name].dtype==nw.Object or not all([type(v)==type(list(labels.values())[0]) for v in labels.values() if v is not None])): + elif not df_copy.implementation.is_pandas() and ( + df_copy[var_name].dtype == nw.Object + or not all( + [ + type(v) == type(list(labels.values())[0]) + for v in labels.values() + if v is not None + ] + ) + ): # polars is very difficult to convince to mix strings and numbers, so we have to do it this way temp = [labels[x] for x in df_copy[var_name]] - newser = nw.new_series(name=var_name, values= temp, dtype=nw.Object, backend=df_copy.implementation) + newser = nw.new_series( + name=var_name, + values=temp, + dtype=nw.Object, + backend=df_copy.implementation, + ) df_copy = df_copy.with_columns(newser.alias(var_name)) if formats_as_category or formats_as_ordered_category: msg = f"You requested formats_as_category=True or formats_as_ordered_category=True, but it was not possible to cast variable '{var_name}' to category" warnings.warn(msg, RuntimeWarning) continue # not sure if we get into this situation ever or what would exactly happen, maybe this is not needed? 
- elif not df_copy.implementation.is_pandas() and df_copy[var_name].dtype==nw.Unknown: + elif ( + not df_copy.implementation.is_pandas() + and df_copy[var_name].dtype == nw.Unknown + ): msg = f"It was not possible to apply value formats to variable '{var_name}' due to unknown/not supported data type" warnings.warn(msg, RuntimeWarning) continue else: - df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels)) + df_copy = df_copy.with_columns( + nw.col(var_name).replace_strict(labels) + ) if formats_as_ordered_category: categories = list(set(labels.values())) original_values = list(labels.keys()) original_values.sort() - revdict= dict() + revdict = dict() for orival in original_values: curcat = labels.get(orival) if not revdict.get(curcat): revdict[curcat] = orival categories.sort(key=revdict.get) - df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Enum(categories))) + df_copy = df_copy.with_columns( + nw.col(var_name).cast(nw.Enum(categories)) + ) elif formats_as_category: - df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Categorical)) - + df_copy = df_copy.with_columns( + nw.col(var_name).cast(nw.Categorical) + ) return df_copy.to_native() -def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as_category=True, - formats_as_ordered_category=False): + +def set_catalog_to_sas( + sas_dataframe: IntoDataFrameT, + sas_metadata: metadata_container, + catalog_metadata: metadata_container, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, +) -> tuple[IntoDataFrameT, metadata_container]: """ Changes the values in the dataframe and sas_metadata according to the formats in the catalog. It will return a copy of the dataframe and metadata. 
If no appropriate formats were found, the result will @@ -122,8 +165,12 @@ def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as catalog_metadata_copy = deepcopy(catalog_metadata) metadata = deepcopy(sas_metadata) metadata.value_labels = catalog_metadata_copy.value_labels - df_copy = set_value_labels(sas_dataframe, metadata, formats_as_category=formats_as_category, - formats_as_ordered_category=formats_as_ordered_category) + df_copy = set_value_labels( + sas_dataframe, + metadata, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) variable_value_labels = dict() for var_name, var_label in metadata.variable_to_label.items(): @@ -133,9 +180,8 @@ def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as metadata.variable_value_labels = variable_value_labels else: - #df_copy = sas_dataframe.copy() + # df_copy = sas_dataframe.copy() df_copy = nw.from_native(sas_dataframe).clone().to_native() metadata = deepcopy(sas_metadata) return df_copy, metadata - diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 5e349a0..5b5ff24 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -14,6 +14,7 @@ # limitations under the License. 
# ############################################################################# +from collections.abc import Callable, Iterator import multiprocessing as mp import narwhals.stable.v2 as nw @@ -24,18 +25,132 @@ from .worker import worker from .pyfunctions import set_value_labels, set_catalog_to_sas +# Typing interface + +from typing import ( + TYPE_CHECKING, + Any, + Concatenate, + Literal, + TypeAlias, + overload, + Protocol, +) + +from narwhals.typing import IntoDataFrame + +from .pyclasses import metadata_container, _MissingRange + +if TYPE_CHECKING: + from os import PathLike + + try: + from pandas import DataFrame as PandasDataFrame + except ImportError: + pass + try: + from polars import DataFrame as PolarsDataFrame + except ImportError: + pass + + DataFrame: TypeAlias = PandasDataFrame | PolarsDataFrame + + +class FileLike(Protocol): + def read(self, size: int | None = -1, /) -> bytes: ... + def seek(self, pos: int, whence: int = 0, /) -> int: ... + + +PyreadstatReadFunction = Callable[ + Concatenate["str | bytes | PathLike | FileLike", ...], + "tuple[DataFrame | dict[str, np.ndarray], metadata_container]", +] + + # Public interface # Parsing functions -def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=False, catalog_file=None, - formats_as_category=True, formats_as_ordered_category=False, encoding=None, usecols=None, user_missing=False, - disable_datetime_conversion=False, row_limit=0, row_offset=0, output_format=None, - extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): + +@overload +def read_sas7bdat( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + catalog_file: "str | bytes | PathLike | FileLike | None" = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + 
row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_sas7bdat( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + catalog_file: "str | bytes | PathLike | FileLike | None" = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... +@overload +def read_sas7bdat( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + catalog_file: "str | bytes | PathLike | FileLike | None" = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[dict[str, np.ndarray], metadata_container]": ... 
+def read_sas7bdat( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + catalog_file: "str | bytes | PathLike | FileLike | None" = None, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, + user_missing: bool = False, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": r""" Read a SAS sas7bdat file. It accepts the path to a sas7bcat. - + Parameters ---------- filename_path : str, bytes, Path-like object or file-like object @@ -79,7 +194,7 @@ def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=Fa start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. 
extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -87,7 +202,7 @@ def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=Fa formats to be parsed as python date objects extra_time_formats: list of str, optional formats to be parsed as python time objects - + Returns ------- @@ -99,27 +214,99 @@ def read_sas7bdat(filename_path, metadataonly=False, dates_as_pandas_datetime=Fa Look at the documentation for more information. """ parser_format = "sas7bdat" - data_frame, metadata = parser_entry_point(filename_path, parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - encoding=encoding, usecols=usecols, user_missing=user_missing, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + encoding=encoding, + usecols=usecols, + user_missing=user_missing, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format if catalog_file: - _ , catalog = read_sas7bcat(catalog_file, encoding=encoding) - data_frame, metadata = set_catalog_to_sas(data_frame, metadata, catalog, formats_as_category=formats_as_category, - 
formats_as_ordered_category=formats_as_ordered_category) + _, catalog = read_sas7bcat(catalog_file, encoding=encoding) + data_frame, metadata = set_catalog_to_sas( + data_frame, + metadata, + catalog, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) return data_frame, metadata -def read_xport(filename_path, metadataonly=False, dates_as_pandas_datetime=False, encoding=None, - usecols=None, disable_datetime_conversion=False, row_limit=0, row_offset=0, - output_format=None, extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): +@overload +def read_xport( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_xport( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... 
+@overload +def read_xport( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +def read_xport( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": r""" Read a SAS xport file. @@ -150,7 +337,7 @@ def read_xport(filename_path, metadataonly=False, dates_as_pandas_datetime=False start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. 
extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -167,22 +354,45 @@ def read_xport(filename_path, metadataonly=False, dates_as_pandas_datetime=False object with metadata. Look at the documentation for more information. """ parser_format = "xport" - data_frame, metadata = parser_entry_point(filename_path, parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - encoding=encoding, usecols=usecols, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + encoding=encoding, + usecols=usecols, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format return data_frame, metadata -def read_dta(filename_path, metadataonly=False, dates_as_pandas_datetime=False, apply_value_formats=False, - formats_as_category=True, formats_as_ordered_category=False, encoding=None, usecols=None, user_missing=False, - disable_datetime_conversion=False, row_limit=0, row_offset=0, output_format=None, - extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): +def read_dta( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + apply_value_formats: bool = False, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, 
+ user_missing: bool = False, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: str | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": r""" Read a STATA dta file @@ -227,7 +437,7 @@ def read_dta(filename_path, metadataonly=False, dates_as_pandas_datetime=False, start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -244,27 +454,113 @@ def read_dta(filename_path, metadataonly=False, dates_as_pandas_datetime=False, object with metadata. Look at the documentation for more information. 
""" parser_format = "dta" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - encoding=encoding, usecols=usecols, user_missing=user_missing, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + encoding=encoding, + usecols=usecols, + user_missing=user_missing, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format if apply_value_formats: - data_frame = set_value_labels(data_frame, metadata, formats_as_category=formats_as_category, - formats_as_ordered_category=formats_as_ordered_category) + data_frame = set_value_labels( + data_frame, + metadata, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) return data_frame, metadata -def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False, apply_value_formats=False, - formats_as_category=True, formats_as_ordered_category=False, encoding=None, usecols=None, user_missing=False, - disable_datetime_conversion=False, row_limit=0, row_offset=0, output_format=None, extra_datetime_formats=None, - 
extra_date_formats=None, extra_time_formats=None): +@overload +def read_sav( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_sav( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... 
+@overload +def read_sav( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +def read_sav( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + apply_value_formats: bool = False, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + encoding: str | None = None, + usecols: list[str] | None = None, + user_missing: bool = False, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas", "polars", "dict"] | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | dict[str, np.ndarray], Any]": r""" Read a SPSS sav or zsav (compressed) files @@ -309,7 +605,7 @@ def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False, start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. 
Using dict is faster as the other types as the conversion to a dataframe is avoided. extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -327,27 +623,54 @@ def read_sav(filename_path, metadataonly=False, dates_as_pandas_datetime=False, """ parser_format = "sav/zsav" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - encoding=encoding, usecols=usecols, user_missing=user_missing, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + encoding=encoding, + usecols=usecols, + user_missing=user_missing, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format if apply_value_formats: - data_frame = set_value_labels(data_frame, metadata, formats_as_category=formats_as_category, - formats_as_ordered_category=formats_as_ordered_category) + data_frame = set_value_labels( + data_frame, + metadata, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + ) return data_frame, metadata -def read_por(filename_path, metadataonly=False, 
dates_as_pandas_datetime=False, apply_value_formats=False, - formats_as_category=True, formats_as_ordered_category=False, usecols=None, - disable_datetime_conversion=False, row_limit=0, row_offset=0, output_format=None, - extra_datetime_formats=None, extra_date_formats=None, extra_time_formats=None): +def read_por( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = False, + dates_as_pandas_datetime: bool = False, + apply_value_formats: bool = False, + formats_as_category: bool = True, + formats_as_ordered_category: bool = False, + usecols: list[str] | None = None, + disable_datetime_conversion: bool = False, + row_limit: int = 0, + row_offset: int = 0, + output_format: str | None = None, + extra_datetime_formats: list[str] | None = None, + extra_date_formats: list[str] | None = None, + extra_time_formats: list[str] | None = None, +) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": r""" Read a SPSS por file. Files are assumed to be UTF-8 encoded, the encoding cannot be set to other. @@ -386,7 +709,7 @@ def read_por(filename_path, metadataonly=False, dates_as_pandas_datetime=False, start reading rows after this offset. By default 0, meaning start with the first row not skipping anything. output_format : str, optional one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned, the - user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a + user can then convert it to her preferred data format. Using dict is faster as the other types as the conversion to a dataframe is avoided. extra_datetime_formats: list of str, optional formats to be parsed as python datetime objects @@ -403,23 +726,56 @@ def read_por(filename_path, metadataonly=False, dates_as_pandas_datetime=False, object with metadata. Look at the documentation for more information. 
""" parser_format = "por" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - metadataonly=metadataonly, dates_as_pandas_datetime=dates_as_pandas_datetime, - formats_as_category=formats_as_category, formats_as_ordered_category=formats_as_ordered_category, - usecols=usecols, - disable_datetime_conversion=disable_datetime_conversion, row_limit=row_limit, row_offset=row_offset, - output_format=output_format, extra_datetime_formats=extra_datetime_formats, - extra_date_formats=extra_date_formats, extra_time_formats=extra_time_formats) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + metadataonly=metadataonly, + dates_as_pandas_datetime=dates_as_pandas_datetime, + formats_as_category=formats_as_category, + formats_as_ordered_category=formats_as_ordered_category, + usecols=usecols, + disable_datetime_conversion=disable_datetime_conversion, + row_limit=row_limit, + row_offset=row_offset, + output_format=output_format, + extra_datetime_formats=extra_datetime_formats, + extra_date_formats=extra_date_formats, + extra_time_formats=extra_time_formats, + ) metadata.file_format = parser_format - + if apply_value_formats: - data_frame = set_value_labels(data_frame, metadata, formats_as_category=formats_as_category) + data_frame = set_value_labels( + data_frame, metadata, formats_as_category=formats_as_category + ) return data_frame, metadata -def read_sas7bcat(filename_path, encoding=None, output_format=None): +@overload +def read_sas7bcat( + filename_path: "str | bytes | PathLike | FileLike", + encoding: str | None = ..., + output_format: Literal["pandas"] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_sas7bcat( + filename_path: "str | bytes | PathLike | FileLike", + encoding: str | None = ..., + output_format: Literal["polars"] = "polars", +) -> "tuple[PolarsDataFrame, metadata_container]": ... 
+@overload +def read_sas7bcat( + filename_path: "str | bytes | PathLike | FileLike", + encoding: str | None = ..., + output_format: Literal["dict"] = "dict", +) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +def read_sas7bcat( + filename_path: "str | bytes | PathLike | FileLike", + encoding: str | None = None, + output_format: Literal["pandas", "polars", "dict"] | None = None, +) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": r""" Read a SAS sas7bcat file. The returning dataframe will be empty. The metadata object will contain a dictionary value_labels that contains the formats. When parsing the sas7bdat file, in the metadata, the dictionary @@ -437,7 +793,7 @@ def read_sas7bcat(filename_path, encoding=None, output_format=None): Defaults to None. If set, the system will use the defined encoding instead of guessing it. It has to be an iconv-compatible name output_format : str, optional - one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned. + one of 'pandas' (default), 'polars' or 'dict'. If 'dict' a dictionary with numpy arrays as values will be returned. Notice that for this function the resulting object is always empty, this is done for consistency with other functions but has no impact on performance. @@ -450,23 +806,78 @@ def read_sas7bcat(filename_path, encoding=None, output_format=None): Look at the documentation for more information. 
""" parser_format = "sas7bcat" - data_frame, metadata = parser_entry_point(filename_path, parser_format=parser_format, - encoding=encoding, - output_format=output_format, - ) + data_frame, metadata = parser_entry_point( + filename_path, + parser_format=parser_format, + encoding=encoding, + output_format=output_format, + ) metadata.file_format = parser_format return data_frame, metadata + # convenience functions to read in chunks -def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, limit=0, - multiprocess=False, num_processes=4, num_rows=None, **kwargs): + +@overload +def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + chunksize: int = ..., + offset: int = ..., + limit: int = ..., + multiprocess: bool = ..., + num_processes: int = ..., + num_rows: int | None = ..., + *, + output_format: Literal["pandas"] | None = ..., + **kwargs, +) -> "Iterator[tuple[PandasDataFrame, metadata_container]]": ... +@overload +def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + chunksize: int = ..., + offset: int = ..., + limit: int = ..., + multiprocess: bool = ..., + num_processes: int = ..., + num_rows: int | None = ..., + *, + output_format: Literal["polars"] = "polars", + **kwargs, +) -> "Iterator[tuple[PolarsDataFrame, metadata_container]]": ... +@overload +def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + chunksize: int = ..., + offset: int = ..., + limit: int = ..., + multiprocess: bool = ..., + num_processes: int = ..., + num_rows: int | None = ..., + *, + output_format: Literal["dict"] = "dict", + **kwargs, +) -> "Iterator[tuple[dict[str, np.ndarray], metadata_container]]": ... 
+def read_file_in_chunks( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + chunksize: int = 100000, + offset: int = 0, + limit: int = 0, + multiprocess: bool = False, + num_processes: int = 4, + num_rows: int | None = None, + **kwargs, +) -> "Iterator[tuple[DataFrame | dict[str, np.ndarray], metadata_container]]": """ Returns a generator that will allow to read a file in chunks. - If using multiprocessing, for Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, + If using multiprocessing, for Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, the parameter num_rows must be set to a number equal or larger than the number of rows in the dataset. That information must be obtained by the user before running this function. @@ -488,7 +899,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li in case multiprocess is true, how many workers/processes to spawn? num_rows: integer, optional number of rows in the dataset. If using multiprocessing it is obligatory for files where - the number of rows cannot be obtained from the medatata, such as por and + the number of rows cannot be obtained from the medatata, such as por and some defective xport and sav files. The user must obtain this value by reading the file without multiprocessing first or any other means. A number larger than the actual number of rows will work as well. Discarded if the number of rows can be obtained from the metadata or not using multiprocessing. @@ -500,7 +911,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li data_frame : dataframe a dataframe with the data metadata : - object with metadata. + object with metadata. Look at the documentation for more information. 
it : generator @@ -509,7 +920,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li if read_function == read_sas7bcat: raise Exception("read_sas7bcat not supported") - + if "row_offset" in kwargs: _ = kwargs.pop("row_offset") @@ -525,7 +936,7 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li if not limit: limit = numrows else: - limit = min(offset+limit, numrows) + limit = min(offset + limit, numrows) else: if limit: limit = offset + limit @@ -534,18 +945,64 @@ def read_file_in_chunks(read_function, file_path, chunksize=100000, offset=0, li if limit and (offset >= limit): break if multiprocess: - df, meta = read_file_multiprocessing(read_function, file_path, num_processes=num_processes, - row_offset=offset, row_limit=chunksize, num_rows=num_rows, **kwargs) + df, meta = read_file_multiprocessing( + read_function, + file_path, + num_processes=num_processes, + row_offset=offset, + row_limit=chunksize, + num_rows=num_rows, + **kwargs, + ) else: - df, meta = read_function(file_path, row_offset=offset, row_limit=chunksize, **kwargs) + df, meta = read_function( + file_path, row_offset=offset, row_limit=chunksize, **kwargs + ) if len(df): yield df, meta offset += chunksize -def read_file_multiprocessing(read_function, file_path, num_processes=None, num_rows=None, **kwargs): + +@overload +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + num_processes: int | None = ..., + num_rows: int | None = ..., + *, + output_format: Literal["pandas"] | None = ..., + **kwargs, +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + num_processes: int | None = ..., + num_rows: int | None = ..., + *, + output_format: Literal["polars"] = "polars", + **kwargs, +) -> "tuple[PolarsDataFrame, metadata_container]": ... 
+@overload +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + num_processes: int | None = ..., + num_rows: int | None = ..., + *, + output_format: Literal["dict"] = "dict", + **kwargs, +) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +def read_file_multiprocessing( + read_function: PyreadstatReadFunction, + file_path: "str | bytes | PathLike | FileLike", + num_processes: int | None = None, + num_rows: int | None = None, + **kwargs, +) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": """ Reads a file in parallel using multiprocessing. - For Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, + For Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, the parameter num_rows must be set to a number equal or larger than the number of rows in the dataset. That information must be obtained by the user before running this function. @@ -558,11 +1015,11 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ num_processes : integer, optional number of processes to spawn, by default the min 4 and the max cores on the computer num_rows: integer, optional - number of rows in the dataset. Obligatory for files where the number of rows cannot be obtained from the medatata, such as por and + number of rows in the dataset. Obligatory for files where the number of rows cannot be obtained from the medatata, such as por and some defective xport and sav files. The user must obtain this value by reading the file without multiprocessing first or any other means. A number larger than the actual number of rows will work as well. Discarded if the number of rows can be obtained from the metadata. kwargs : dict, optional - any other keyword argument to pass to the read_function. + any other keyword argument to pass to the read_function. 
Returns ------- @@ -576,26 +1033,33 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ raise Exception("read_sas7bcat is not supported") if read_function == read_por and num_rows is None: - raise Exception("num_rows must be specified for read_por to be a number equal or larger than the number of rows in the dataset.") + raise Exception( + "num_rows must be specified for read_por to be a number equal or larger than the number of rows in the dataset." + ) if not num_processes: # let's be conservative with the number of workers num_processes = min(mp.cpu_count(), 4) - _ = kwargs.pop('metadataonly', None) + _ = kwargs.pop("metadataonly", None) row_offset = kwargs.pop("row_offset", 0) - row_limit = kwargs.pop("row_limit", float('inf')) + row_limit = kwargs.pop("row_limit", float("inf")) _, meta = read_function(file_path, metadataonly=True, **kwargs) numrows = meta.number_rows if numrows is None: if num_rows is None: - raise Exception("The number of rows of the file cannot be determined from the file's metadata. If you still want to proceed, please set num_rows to a number equal or larger than the number of rows of your data") + raise Exception( + "The number of rows of the file cannot be determined from the file's metadata. 
If you still want to proceed, please set num_rows to a number equal or larger than the number of rows of your data" + ) numrows = num_rows elif numrows == 0: final, meta = read_function(file_path, **kwargs) - numrows = min(max(numrows - row_offset, 0), row_limit) - divs = [numrows // num_processes + (1 if x < numrows % num_processes else 0) for x in range (num_processes)] + numrows = min(max(numrows - row_offset, 0), row_limit) + divs = [ + numrows // num_processes + (1 if x < numrows % num_processes else 0) + for x in range(num_processes) + ] offsets = list() prev_offset = row_offset prev_div = 0 @@ -604,7 +1068,10 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ prev_offset = offset prev_div = div offsets.append((offset, div)) - jobs = [(read_function, file_path, offset, chunksize, kwargs) for offset, chunksize in offsets] + jobs = [ + (read_function, file_path, offset, chunksize, kwargs) + for offset, chunksize in offsets + ] pool = mp.Pool(processes=num_processes) try: chunks = pool.map(worker, jobs) @@ -613,15 +1080,15 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ finally: pool.close() output_format = kwargs.get("output_format") - if output_format == 'dict': + if output_format == "dict": keys = chunks[0].keys() final = dict() for key in keys: final[key] = np.concatenate([chunk[key] for chunk in chunks]) else: - #final = pd.concat(chunks, axis=0, ignore_index=True) + # final = pd.concat(chunks, axis=0, ignore_index=True) chunks = [nw.from_native(x) for x in chunks] - final = nw.concat(chunks, how='vertical') + final = nw.concat(chunks, how="vertical") ispandas = False if final.implementation.is_pandas(): ispandas = True @@ -630,11 +1097,24 @@ def read_file_multiprocessing(read_function, file_path, num_processes=None, num_ final = final.reset_index(drop=True) return final, meta + # Write API -def write_sav(df, dst_path, file_label="", column_labels=None, compress=False, row_compress=False, 
note=None, - variable_value_labels=None, missing_ranges=None, variable_display_width=None, - variable_measure=None, variable_format=None): + +def write_sav( + df: "DataFrame", + dst_path: "str | bytes | PathLike | FileLike", + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + compress: bool = False, + row_compress: bool = False, + note: str | list[str] | None = None, + variable_value_labels: dict[str, dict[int | float, str]] | None = None, + missing_ranges: "dict[str, list[int | float | str | _MissingRange]] | None" = None, + variable_display_width: dict[str, int] | None = None, + variable_measure: dict[str, str] | None = None, + variable_format: dict[str, str] | None = None, +) -> None: """ Writes a dataframe to a SPSS sav or zsav file. @@ -676,27 +1156,52 @@ def write_sav(df, dst_path, file_label="", column_labels=None, compress=False, r sets the measure type for a variable. Must be a dictionary with keys being variable names and values being strings one of "nominal", "ordinal", "scale" or "unknown" (default). variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. 
""" writer_format = "sav" # formats - formats_presets = {'restricted_integer':'N{var_width}', 'integer':'F{var_width}.0'} + formats_presets = { + "restricted_integer": "N{var_width}", + "integer": "F{var_width}.0", + } if variable_format: for col_name, col_format in variable_format.items(): if col_format in formats_presets.keys() and col_name in df.columns: var_width = str(len(str(max(df[col_name])))) - variable_format[col_name] = formats_presets[col_format].format(var_width=var_width) - - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - compress=compress, row_compress=row_compress, note=note, - variable_value_labels=variable_value_labels, missing_ranges=missing_ranges, variable_display_width=variable_display_width, - variable_measure=variable_measure, variable_format=variable_format) - -def write_dta(df, dst_path, file_label="", column_labels=None, version=15, - variable_value_labels=None, missing_user_values=None, variable_format=None): + variable_format[col_name] = formats_presets[col_format].format( + var_width=var_width + ) + + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + compress=compress, + row_compress=row_compress, + note=note, + variable_value_labels=variable_value_labels, + missing_ranges=missing_ranges, + variable_display_width=variable_display_width, + variable_measure=variable_measure, + variable_format=variable_format, + ) + + +def write_dta( + df: IntoDataFrame, + dst_path: "str | bytes | PathLike | FileLike", + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + version: int = 15, + variable_value_labels: dict[str, dict[int | float, str]] | None = None, + missing_user_values: dict[str, list[str]] | None = None, + variable_format: dict[str, str] | None = None, +): """ Writes a dataframe to a STATA dta file @@ -724,20 +1229,34 @@ def write_dta(df, dst_path, file_label="", 
column_labels=None, version=15, names and values being a list of missing values. Missing values must be a single character between a and z. variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. """ writer_format = "dta" - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - version=version, - variable_value_labels=variable_value_labels, - missing_user_values=missing_user_values, - variable_format=variable_format) - -def write_xport(df, dst_path, file_label="", column_labels=None, table_name=None, file_format_version = 8, - variable_format=None): + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + version=version, + variable_value_labels=variable_value_labels, + missing_user_values=missing_user_values, + variable_format=variable_format, + ) + + +def write_xport( + df: IntoDataFrame, + dst_path: "str | bytes | PathLike | FileLike", + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + table_name: str | None = None, + file_format_version: int = 8, + variable_format: dict[str, str] | None = None, +) -> None: """ Writes a dataframe to a SAS Xport (xpt) file. If no table_name is specified the dataset has by default the name DATASET (take it into account if @@ -762,18 +1281,31 @@ def write_xport(df, dst_path, file_label="", column_labels=None, table_name=None file_format_version : int, optional XPORT file version, either 8 or 5, default is 8 variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. 
Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. """ writer_format = "xport" - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - version=file_format_version, - table_name=table_name, - variable_format=variable_format) - -def write_por(df, dst_path, file_label="", column_labels=None, variable_format=None): + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + version=file_format_version, + table_name=table_name, + variable_format=variable_format, + ) + + +def write_por( + df: IntoDataFrame, + dst_path: "str | bytes | PathLike | FileLike", + file_label: str = "", + column_labels: list[str] | dict[str, str] | None = None, + variable_format: dict[str, str] | None = None, +) -> None: """ Writes a dataframe to a SPSS POR file. @@ -791,12 +1323,17 @@ def write_por(df, dst_path, file_label="", column_labels=None, variable_format=N In such case there is no need to include all variables; labels for non existent variables will be ignored with no warning or error. variable_format: dict, optional - sets the format of a variable. Must be a dictionary with keys being the variable names and + sets the format of a variable. Must be a dictionary with keys being the variable names and values being strings defining the format. See README, setting variable formats section, for more information. 
""" writer_format = "por" - writer_entry_point(df, dst_path, writer_format=writer_format, file_label=file_label, column_labels=column_labels, - variable_format=variable_format) - + writer_entry_point( + df, + dst_path, + writer_format=writer_format, + file_label=file_label, + column_labels=column_labels, + variable_format=variable_format, + ) diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi new file mode 100644 index 0000000..128149c --- /dev/null +++ b/tests/typing_tests.pyi @@ -0,0 +1,53 @@ +# Run with `mypy tests/typing_tests.py` + +import io +from pathlib import Path +from typing import reveal_type + +import pandas as pd +import polars as pl + +from pyreadstat import * + +def test_read_sav_default() -> None: + df, meta = read_sav("file.sav") + reveal_type(df) # pandas.core.frame.DataFrame + reveal_type(meta) # metadata_container + +def test_read_sav_pandas_type() -> None: + df, meta = read_sav("file.sav", output_format="pandas") + reveal_type(df) # pandas.core.frame.DataFrame + +def test_read_sav_polars_type() -> None: + df, meta = read_sav("file.sav", output_format="polars") + reveal_type(df) # polars.dataframe.frame.DataFrame + +def test_read_sav_dict_type() -> None: + df, meta = read_sav("file.sav", output_format="dict") + reveal_type(df) # dict[str, ndarray] + +def test_read_sav_buffer_type() -> None: + buffer = io.BytesIO() + df, meta = read_sav(buffer) + +def test_write_sav_types() -> None: + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_sav(pandas_df, "file.sav") + write_sav(polars_df, "file.sav") + write_sav(pandas_df, Path("file.sav")) + +def test_read_multiprocessing() -> None: + df: pd.DataFrame | pl.DataFrame + def noop(a: int, /) -> int: + return a + + df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True) + reveal_type(df) # pandas.core.frame.DataFrame + + df, meta = read_file_multiprocessing( + read_sav, "file.sav", metadataonly=True, output_format="polars" + ) + reveal_type(df) # 
polars.dataframe.frame.DataFrame + + read_file_multiprocessing(noop, "file.sav", 1, 1) # wrong callable, should error From ce93b593e09302e07afc935ebbe1621d2f7e74b8 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 15:14:50 +0000 Subject: [PATCH 02/38] remove Optional types since they will always be returned with actual values --- pyreadstat/pyclasses.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index cb17f21..3d91341 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -49,16 +49,16 @@ def __init__(self) -> None: self.column_names: list[str] = list() self.column_labels: list[str] = list() self.column_names_to_labels: dict[str, str] = dict() - self.file_encoding: str | None = None - self.number_columns: int | None = None - self.number_rows: int | None = None + self.file_encoding: str = None # type: ignore[assignment] + self.number_columns: int = None # type: ignore[assignment] + self.number_rows: int = None # type: ignore[assignment] self.variable_value_labels: dict[str, dict[float | int, str]] = dict() self.value_labels: dict[str, dict[float | int, str]] = dict() self.variable_to_label: dict[str, str] = dict() self.notes: list[str] = list() self.original_variable_types: dict[str, str] = dict() self.readstat_variable_types: dict[str, str] = dict() - self.table_name: str | None = None + self.table_name: str = None # type: ignore[assignment] self.missing_ranges: dict[str, list[int | float | str | _MissingRange]] = dict() self.missing_user_values: dict[str, list[int | float | str | _MissingRange]] = ( dict() @@ -69,6 +69,6 @@ def __init__(self) -> None: self.variable_measure: dict[ str, Literal["nominal", "ordinal", "scale", "unknown"] ] = dict() - self.creation_time: "datetime | None" = None - self.modification_time: "datetime | None" = None + self.creation_time: "datetime" = None # type: ignore[assignment] + self.modification_time: "datetime" = None # 
type: ignore[assignment] self.mr_sets: dict[str, MRSet] = dict() From 34ac76e25afb34639cdcbd758a7db5b09029da02 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 15:15:08 +0000 Subject: [PATCH 03/38] finish overloads for all read_ functions --- pyreadstat/pyreadstat.py | 118 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 5b5ff24..a493e64 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -375,6 +375,63 @@ def read_xport( return data_frame, metadata +@overload +def read_dta( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... +@overload +def read_dta( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... 
+@overload +def read_dta( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + encoding: str | None = ..., + usecols: list[str] | None = ..., + user_missing: bool = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = ..., + row_offset: int = ..., + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[dict[str, np.ndarray], metadata_container]": ... def read_dta( filename_path: "str | bytes | PathLike | FileLike", metadataonly: bool = False, @@ -388,7 +445,7 @@ def read_dta( disable_datetime_conversion: bool = False, row_limit: int = 0, row_offset: int = 0, - output_format: str | None = None, + output_format: Literal["pandas", "polars", "dict"] | None = None, extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, @@ -655,6 +712,57 @@ def read_sav( return data_frame, metadata +@overload +def read_por( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["pandas"] | None = ..., + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PandasDataFrame, metadata_container]": ... 
+@overload +def read_por( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["polars"] = "polars", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[PolarsDataFrame, metadata_container]": ... +@overload +def read_por( + filename_path: "str | bytes | PathLike | FileLike", + metadataonly: bool = ..., + dates_as_pandas_datetime: bool = ..., + apply_value_formats: bool = ..., + formats_as_category: bool = ..., + formats_as_ordered_category: bool = ..., + usecols: list[str] | None = ..., + disable_datetime_conversion: bool = ..., + row_limit: int = 0, + row_offset: int = 0, + output_format: Literal["dict"] = "dict", + extra_datetime_formats: list[str] | None = ..., + extra_date_formats: list[str] | None = ..., + extra_time_formats: list[str] | None = ..., +) -> "tuple[dict[str, np.ndarray], metadata_container]": ... 
def read_por( filename_path: "str | bytes | PathLike | FileLike", metadataonly: bool = False, @@ -666,7 +774,7 @@ def read_por( disable_datetime_conversion: bool = False, row_limit: int = 0, row_offset: int = 0, - output_format: str | None = None, + output_format: Literal["pandas", "polars", "dict"] | None = None, extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, @@ -885,7 +993,7 @@ def read_file_in_chunks( ---------- read_function : pyreadstat function a pyreadstat reading function - file_path : string + file_path : str, bytes, Path-like object or file-like object path to the file to be read chunksize : integer, optional size of the chunks to read @@ -1010,7 +1118,7 @@ def read_file_multiprocessing( ---------- read_function : pyreadstat function a pyreadstat reading function - file_path : string + file_path : str, bytes, Path-like object or file-like object path to the file to be read num_processes : integer, optional number of processes to spawn, by default the min 4 and the max cores on the computer @@ -1201,7 +1309,7 @@ def write_dta( variable_value_labels: dict[str, dict[int | float, str]] | None = None, missing_user_values: dict[str, list[str]] | None = None, variable_format: dict[str, str] | None = None, -): +) -> None: """ Writes a dataframe to a STATA dta file From d55b7ad12a6b1169134c75176cc096fe3314bb63 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 15:15:26 +0000 Subject: [PATCH 04/38] add typehint tests for read_file_in_chunks --- tests/typing_tests.pyi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi index 128149c..4d12c1c 100644 --- a/tests/typing_tests.pyi +++ b/tests/typing_tests.pyi @@ -51,3 +51,18 @@ def test_read_multiprocessing() -> None: reveal_type(df) # polars.dataframe.frame.DataFrame read_file_multiprocessing(noop, "file.sav", 1, 1) # wrong callable, should error + +def 
test_read_file_in_chunks() -> None: + df: pd.DataFrame | pl.DataFrame + def noop(a: int, /) -> int: + return a + + for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True): + reveal_type(df) # pandas.core.frame.DataFrame + + for df, meta in read_file_in_chunks( + read_sav, "file.sav", metadataonly=True, output_format="polars" + ): + reveal_type(df) # polars.dataframe.frame.DataFrame + + read_file_in_chunks(noop, "file.sav", 1, 1) # wrong callable, should error From 84e3886a2f7693c3ff0eff9e6b18855f9420daff Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 15:54:30 +0000 Subject: [PATCH 05/38] type tests for all read-write functions --- tests/typing_tests.pyi | 161 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 151 insertions(+), 10 deletions(-) diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi index 4d12c1c..ce59a05 100644 --- a/tests/typing_tests.pyi +++ b/tests/typing_tests.pyi @@ -4,38 +4,127 @@ import io from pathlib import Path from typing import reveal_type +import numpy as np import pandas as pd import polars as pl from pyreadstat import * def test_read_sav_default() -> None: + df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] + df, meta = read_sav("file.sav") reveal_type(df) # pandas.core.frame.DataFrame reveal_type(meta) # metadata_container -def test_read_sav_pandas_type() -> None: df, meta = read_sav("file.sav", output_format="pandas") reveal_type(df) # pandas.core.frame.DataFrame -def test_read_sav_polars_type() -> None: df, meta = read_sav("file.sav", output_format="polars") reveal_type(df) # polars.dataframe.frame.DataFrame -def test_read_sav_dict_type() -> None: df, meta = read_sav("file.sav", output_format="dict") reveal_type(df) # dict[str, ndarray] -def test_read_sav_buffer_type() -> None: buffer = io.BytesIO() df, meta = read_sav(buffer) -def test_write_sav_types() -> None: - pandas_df = pd.DataFrame() - polars_df = pl.DataFrame() - write_sav(pandas_df, "file.sav") - write_sav(polars_df, 
"file.sav") - write_sav(pandas_df, Path("file.sav")) +def test_read_dta_types() -> None: + df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] + + df, meta = read_dta("file.dta") + reveal_type(df) # pandas.core.frame.DataFrame + reveal_type(meta) # metadata_container + + df, meta = read_dta("file.dta", output_format="pandas") + reveal_type(df) # pandas.core.frame.DataFrame + + df, meta = read_dta("file.dta", output_format="polars") + reveal_type(df) # polars.dataframe.frame.DataFrame + + df, meta = read_dta("file.dta", output_format="dict") + reveal_type(df) # dict[str, ndarray] + + buffer = io.BytesIO() + df, meta = read_dta(buffer) + +def test_read_por_types() -> None: + df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] + + df, meta = read_por("file.por") + reveal_type(df) # pandas.core.frame.DataFrame + reveal_type(meta) # metadata_container + + df, meta = read_por("file.por", output_format="pandas") + reveal_type(df) # pandas.core.frame.DataFrame + + df, meta = read_por("file.por", output_format="polars") + reveal_type(df) # polars.dataframe.frame.DataFrame + + df, meta = read_por("file.por", output_format="dict") + reveal_type(df) # dict[str, ndarray] + + buffer = io.BytesIO() + df, meta = read_por(buffer) + +def test_read_sas7bdat_types() -> None: + df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] + + df, meta = read_sas7bdat("file.sas7bdat") + reveal_type(df) # pandas.core.frame.DataFrame + reveal_type(meta) # metadata_container + + df, meta = read_sas7bdat("file.sas7bdat", output_format="pandas") + reveal_type(df) # pandas.core.frame.DataFrame + + df, meta = read_sas7bdat("file.sas7bdat", output_format="polars") + reveal_type(df) # polars.dataframe.frame.DataFrame + + df, meta = read_sas7bdat("file.sas7bdat", output_format="dict") + reveal_type(df) # dict[str, ndarray] + + buffer = io.BytesIO() + df, meta = read_sas7bdat(buffer) + +def test_read_xport_types() -> None: + df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] + + df, meta = 
read_xport("file.xpt") + reveal_type(df) # pandas.core.frame.DataFrame + reveal_type(meta) # metadata_container + + df, meta = read_xport("file.xpt", output_format="pandas") + reveal_type(df) # pandas.core.frame.DataFrame + + df, meta = read_xport("file.xpt", output_format="polars") + reveal_type(df) # polars.dataframe.frame.DataFrame + + df, meta = read_xport("file.xpt", output_format="dict") + reveal_type(df) # dict[str, ndarray] + + buffer = io.BytesIO() + df, meta = read_xport(buffer) + +def test_read_sas7bcat_types() -> None: + df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] + + df, meta = read_sas7bcat("file.sas7bcat") + reveal_type( + df + ) # pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray] + reveal_type(meta) # metadata_container + + df, meta = read_sas7bcat("file.sas7bcat", output_format="pandas") + reveal_type(df) # pandas.core.frame.DataFrame + + df, meta = read_sas7bcat("file.sas7bcat", output_format="polars") + reveal_type(df) # polars.dataframe.frame.DataFrame + + df, meta = read_sas7bcat("file.sas7bcat", output_format="dict") + reveal_type(df) # dict[str, ndarray] + + buffer = io.BytesIO() + df, meta = read_sas7bcat(buffer) def test_read_multiprocessing() -> None: df: pd.DataFrame | pl.DataFrame @@ -66,3 +155,55 @@ def test_read_file_in_chunks() -> None: reveal_type(df) # polars.dataframe.frame.DataFrame read_file_in_chunks(noop, "file.sav", 1, 1) # wrong callable, should error + +def test_write_sav_types() -> None: + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + # Test writing with pandas DataFrame and string path + write_sav(pandas_df, "file.sav") + # Test writing with polars DataFrame and string path + write_sav(polars_df, "file.sav") + # Test writing with pandas DataFrame and Path object + write_sav(pandas_df, Path("file.sav")) + # Test writing with pandas DataFrame and BytesIO buffer + buffer = io.BytesIO() + write_sav(pandas_df, buffer) + +def test_write_dta_types() -> None: + pandas_df = 
pd.DataFrame() + polars_df = pl.DataFrame() + # Test writing with pandas DataFrame and string path + write_dta(pandas_df, "file.dta") + # Test writing with polars DataFrame and string path + write_dta(polars_df, "file.dta") + # Test writing with pandas DataFrame and Path object + write_dta(pandas_df, Path("file.dta")) + # Test writing with pandas DataFrame and BytesIO buffer + buffer = io.BytesIO() + write_dta(pandas_df, buffer) + +def test_write_xport_types() -> None: + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + # Test writing with pandas DataFrame and string path + write_xport(pandas_df, "file.xpt") + # Test writing with polars DataFrame and string path + write_xport(polars_df, "file.xpt") + # Test writing with pandas DataFrame and Path object + write_xport(pandas_df, Path("file.xpt")) + # Test writing with pandas DataFrame and BytesIO buffer + buffer = io.BytesIO() + write_xport(pandas_df, buffer) + +def test_write_por_types() -> None: + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + # Test writing with pandas DataFrame and string path + write_por(pandas_df, "file.por") + # Test writing with polars DataFrame and string path + write_por(polars_df, "file.por") + # Test writing with pandas DataFrame and Path object + write_por(pandas_df, Path("file.por")) + # Test writing with pandas DataFrame and BytesIO buffer + buffer = io.BytesIO() + write_por(pandas_df, buffer) From 1ae8aa25668f2814489a209f0203236c6a6b3cb9 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 15:55:02 +0000 Subject: [PATCH 06/38] fixed run command --- tests/typing_tests.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi index ce59a05..d7a94a5 100644 --- a/tests/typing_tests.pyi +++ b/tests/typing_tests.pyi @@ -1,4 +1,4 @@ -# Run with `mypy tests/typing_tests.py` +# Run with `mypy tests/typing_tests.pyi` import io from pathlib import Path From 917550922cd3f34309de5c12805dd44f6093ee33 Mon Sep 17 
00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 16:01:17 +0000 Subject: [PATCH 07/38] prefer direct import for builtins when possible --- pyreadstat/pyclasses.py | 10 ++++------ pyreadstat/pyreadstat.py | 4 +--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index 3d91341..43a2a2c 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -16,10 +16,8 @@ # Typing -from typing import TYPE_CHECKING, Literal, TypedDict - -if TYPE_CHECKING: - from datetime import datetime +from datetime import datetime +from typing import Literal, TypedDict class _MissingRange(TypedDict): @@ -69,6 +67,6 @@ def __init__(self) -> None: self.variable_measure: dict[ str, Literal["nominal", "ordinal", "scale", "unknown"] ] = dict() - self.creation_time: "datetime" = None # type: ignore[assignment] - self.modification_time: "datetime" = None # type: ignore[assignment] + self.creation_time: datetime = None # type: ignore[assignment] + self.modification_time: datetime = None # type: ignore[assignment] self.mr_sets: dict[str, MRSet] = dict() diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index a493e64..dde0ace 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -26,7 +26,7 @@ from .pyfunctions import set_value_labels, set_catalog_to_sas # Typing interface - +from os import PathLike from typing import ( TYPE_CHECKING, Any, @@ -42,8 +42,6 @@ from .pyclasses import metadata_container, _MissingRange if TYPE_CHECKING: - from os import PathLike - try: from pandas import DataFrame as PandasDataFrame except ImportError: From 0674be0e0aeff99041c3b2c64fae0300ad9768a3 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 16:34:05 +0000 Subject: [PATCH 08/38] cleanup stringified types & make MissingRange public --- pyreadstat/pyclasses.py | 9 +-- pyreadstat/pyreadstat.py | 119 +++++++++++++++++++-------------------- 2 files changed, 62 insertions(+), 66 deletions(-) diff --git 
a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index 43a2a2c..b9440f2 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -20,13 +20,14 @@ from typing import Literal, TypedDict -class _MissingRange(TypedDict): +class MissingRange(TypedDict): + """A dictionary to hold the definition of a missing range""" lo: float hi: float class MRSet(TypedDict): - """A dictionary to hold the definition of a multiple-response (MR) set.""" + """A dictionary to hold the definition of a multiple-response (MR) set""" type: Literal["D", "C"] is_dichotomy: bool @@ -57,8 +58,8 @@ def __init__(self) -> None: self.original_variable_types: dict[str, str] = dict() self.readstat_variable_types: dict[str, str] = dict() self.table_name: str = None # type: ignore[assignment] - self.missing_ranges: dict[str, list[int | float | str | _MissingRange]] = dict() - self.missing_user_values: dict[str, list[int | float | str | _MissingRange]] = ( + self.missing_ranges: dict[str, list[int | float | str | MissingRange]] = dict() + self.missing_user_values: dict[str, list[int | float | str | MissingRange]] = ( dict() ) self.variable_storage_width: dict[str, int] = dict() diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index dde0ace..03eed98 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -27,19 +27,11 @@ # Typing interface from os import PathLike -from typing import ( - TYPE_CHECKING, - Any, - Concatenate, - Literal, - TypeAlias, - overload, - Protocol, -) +from typing import TYPE_CHECKING, Concatenate, Literal, TypeAlias, overload, Protocol from narwhals.typing import IntoDataFrame -from .pyclasses import metadata_container, _MissingRange +from .pyclasses import metadata_container, MissingRange if TYPE_CHECKING: try: @@ -55,12 +47,15 @@ class FileLike(Protocol): + """Protocol for file-like objects accepted by pyreadstat""" + + # Should work with any file-like object that has read and seek methods, such as those returned by open() or io.BytesIO 
def read(self, size: int | None = -1, /) -> bytes: ... def seek(self, pos: int, whence: int = 0, /) -> int: ... PyreadstatReadFunction = Callable[ - Concatenate["str | bytes | PathLike | FileLike", ...], + Concatenate[str | bytes | PathLike | FileLike, ...], "tuple[DataFrame | dict[str, np.ndarray], metadata_container]", ] @@ -72,10 +67,10 @@ def seek(self, pos: int, whence: int = 0, /) -> int: ... @overload def read_sas7bdat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., - catalog_file: "str | bytes | PathLike | FileLike | None" = ..., + catalog_file: str | bytes | PathLike | FileLike | None = ..., formats_as_category: bool = ..., formats_as_ordered_category: bool = ..., encoding: str | None = ..., @@ -91,10 +86,10 @@ def read_sas7bdat( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_sas7bdat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., - catalog_file: "str | bytes | PathLike | FileLike | None" = ..., + catalog_file: str | bytes | PathLike | FileLike | None = ..., formats_as_category: bool = ..., formats_as_ordered_category: bool = ..., encoding: str | None = ..., @@ -110,10 +105,10 @@ def read_sas7bdat( ) -> "tuple[PolarsDataFrame, metadata_container]": ... 
@overload def read_sas7bdat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., - catalog_file: "str | bytes | PathLike | FileLike | None" = ..., + catalog_file: str | bytes | PathLike | FileLike | None = ..., formats_as_category: bool = ..., formats_as_ordered_category: bool = ..., encoding: str | None = ..., @@ -126,12 +121,12 @@ def read_sas7bdat( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +) -> tuple[dict[str, np.ndarray], metadata_container]: ... def read_sas7bdat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, - catalog_file: "str | bytes | PathLike | FileLike | None" = None, + catalog_file: str | bytes | PathLike | FileLike | None = None, formats_as_category: bool = True, formats_as_ordered_category: bool = False, encoding: str | None = None, @@ -248,7 +243,7 @@ def read_sas7bdat( @overload def read_xport( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., encoding: str | None = ..., @@ -263,7 +258,7 @@ def read_xport( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_xport( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., encoding: str | None = ..., @@ -278,7 +273,7 @@ def read_xport( ) -> "tuple[PolarsDataFrame, metadata_container]": ... 
@overload def read_xport( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., encoding: str | None = ..., @@ -290,9 +285,9 @@ def read_xport( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +) -> tuple[dict[str, np.ndarray], metadata_container]: ... def read_xport( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, encoding: str | None = None, @@ -375,7 +370,7 @@ def read_xport( @overload def read_dta( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -394,7 +389,7 @@ def read_dta( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_dta( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -413,7 +408,7 @@ def read_dta( ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_dta( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -429,9 +424,9 @@ def read_dta( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +) -> tuple[dict[str, np.ndarray], metadata_container]: ... 
def read_dta( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, apply_value_formats: bool = False, @@ -543,7 +538,7 @@ def read_dta( @overload def read_sav( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -562,7 +557,7 @@ def read_sav( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_sav( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -581,7 +576,7 @@ def read_sav( ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_sav( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -597,9 +592,9 @@ def read_sav( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +) -> tuple[dict[str, np.ndarray], metadata_container]: ... 
def read_sav( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, apply_value_formats: bool = False, @@ -615,7 +610,7 @@ def read_sav( extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, -) -> "tuple[DataFrame | dict[str, np.ndarray], Any]": +) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": r""" Read a SPSS sav or zsav (compressed) files @@ -712,7 +707,7 @@ def read_sav( @overload def read_por( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -729,7 +724,7 @@ def read_por( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_por( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -746,7 +741,7 @@ def read_por( ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_por( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -760,9 +755,9 @@ def read_por( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +) -> tuple[dict[str, np.ndarray], metadata_container]: ... 
def read_por( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, apply_value_formats: bool = False, @@ -861,24 +856,24 @@ def read_por( @overload def read_sas7bcat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, encoding: str | None = ..., output_format: Literal["pandas"] | None = ..., ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_sas7bcat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, encoding: str | None = ..., output_format: Literal["polars"] = "polars", ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_sas7bcat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, encoding: str | None = ..., output_format: Literal["dict"] = "dict", -) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +) -> tuple[dict[str, np.ndarray], metadata_container]: ... 
def read_sas7bcat( - filename_path: "str | bytes | PathLike | FileLike", + filename_path: str | bytes | PathLike | FileLike, encoding: str | None = None, output_format: Literal["pandas", "polars", "dict"] | None = None, ) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": @@ -930,7 +925,7 @@ def read_sas7bcat( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -944,7 +939,7 @@ def read_file_in_chunks( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -958,7 +953,7 @@ def read_file_in_chunks( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -968,10 +963,10 @@ def read_file_in_chunks( *, output_format: Literal["dict"] = "dict", **kwargs, -) -> "Iterator[tuple[dict[str, np.ndarray], metadata_container]]": ... +) -> Iterator[tuple[dict[str, np.ndarray], metadata_container]]: ... 
def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, chunksize: int = 100000, offset: int = 0, limit: int = 0, @@ -1072,7 +1067,7 @@ def read_file_in_chunks( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, num_processes: int | None = ..., num_rows: int | None = ..., *, @@ -1082,7 +1077,7 @@ def read_file_multiprocessing( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, num_processes: int | None = ..., num_rows: int | None = ..., *, @@ -1092,16 +1087,16 @@ def read_file_multiprocessing( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, num_processes: int | None = ..., num_rows: int | None = ..., *, output_format: Literal["dict"] = "dict", **kwargs, -) -> "tuple[dict[str, np.ndarray], metadata_container]": ... +) -> tuple[dict[str, np.ndarray], metadata_container]: ... 
def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: "str | bytes | PathLike | FileLike", + file_path: str | bytes | PathLike | FileLike, num_processes: int | None = None, num_rows: int | None = None, **kwargs, @@ -1208,15 +1203,15 @@ def read_file_multiprocessing( def write_sav( - df: "DataFrame", - dst_path: "str | bytes | PathLike | FileLike", + df: "DataFrame", # Can't be `IntoDataFrame` because columns get accessed via `__getitem__` + dst_path: str | bytes | PathLike | FileLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, compress: bool = False, row_compress: bool = False, note: str | list[str] | None = None, variable_value_labels: dict[str, dict[int | float, str]] | None = None, - missing_ranges: "dict[str, list[int | float | str | _MissingRange]] | None" = None, + missing_ranges: dict[str, list[int | float | str | MissingRange]] | None = None, variable_display_width: dict[str, int] | None = None, variable_measure: dict[str, str] | None = None, variable_format: dict[str, str] | None = None, @@ -1300,7 +1295,7 @@ def write_sav( def write_dta( df: IntoDataFrame, - dst_path: "str | bytes | PathLike | FileLike", + dst_path: str | bytes | PathLike | FileLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, version: int = 15, @@ -1356,7 +1351,7 @@ def write_dta( def write_xport( df: IntoDataFrame, - dst_path: "str | bytes | PathLike | FileLike", + dst_path: str | bytes | PathLike | FileLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, table_name: str | None = None, @@ -1407,7 +1402,7 @@ def write_xport( def write_por( df: IntoDataFrame, - dst_path: "str | bytes | PathLike | FileLike", + dst_path: str | bytes | PathLike | FileLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, variable_format: dict[str, str] | None = None, From c7f7e080776cdc57ca969e8b8cafa8b2272b1c51 Mon Sep 17 00:00:00 2001 From: 
nachomaiz Date: Mon, 9 Feb 2026 18:34:46 +0000 Subject: [PATCH 09/38] type hints for worker --- pyreadstat/worker.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pyreadstat/worker.py b/pyreadstat/worker.py index ea003ba..88bc1ae 100644 --- a/pyreadstat/worker.py +++ b/pyreadstat/worker.py @@ -1,4 +1,3 @@ - # ############################################################################# # Copyright 2018 Hoffmann-La Roche # @@ -19,7 +18,18 @@ Functions to work with multiprocessing """ -def worker(inpt): +from os import PathLike +from typing import TYPE_CHECKING, Any, TypeAlias + +import numpy as np + +if TYPE_CHECKING: + from .pyreadstat import PyreadstatReadFunction, FileLike, DataFrame + +Input: TypeAlias = "tuple[PyreadstatReadFunction, str | bytes | PathLike | FileLike, int, int, dict[str, Any]]" + + +def worker(inpt: Input) -> "DataFrame | dict[str, np.ndarray]": read_function, path, row_offset, row_limit, kwargs = inpt df, meta = read_function(path, row_offset=row_offset, row_limit=row_limit, **kwargs) - return df \ No newline at end of file + return df From 2f419cef3f54efa2811553f540a20b9938c7dc89 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 18:35:04 +0000 Subject: [PATCH 10/38] better import sorting --- pyreadstat/pyreadstat.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 03eed98..a56e9ce 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -16,22 +16,20 @@ from collections.abc import Callable, Iterator import multiprocessing as mp +from os import PathLike +from typing import TYPE_CHECKING, Concatenate, Literal, TypeAlias, overload, Protocol import narwhals.stable.v2 as nw import numpy as np +from narwhals.typing import IntoDataFrame from ._readstat_parser import parser_entry_point from ._readstat_writer import writer_entry_point, PyreadstatError from .worker import worker +from .pyclasses import 
metadata_container, MissingRange from .pyfunctions import set_value_labels, set_catalog_to_sas # Typing interface -from os import PathLike -from typing import TYPE_CHECKING, Concatenate, Literal, TypeAlias, overload, Protocol - -from narwhals.typing import IntoDataFrame - -from .pyclasses import metadata_container, MissingRange if TYPE_CHECKING: try: From 46f9540c391863252513bb6bb3f69375ebaab3de Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 18:35:31 +0000 Subject: [PATCH 11/38] added py.typed file to signal that the python interface is fully type-hinted --- pyreadstat/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pyreadstat/py.typed diff --git a/pyreadstat/py.typed b/pyreadstat/py.typed new file mode 100644 index 0000000..e69de29 From 57ea86ca35dbca68facf6c0b91349347e5d36828 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 19:08:51 +0000 Subject: [PATCH 12/38] revert some formatting changes due to line length --- pyreadstat/pyclasses.py | 9 +++------ pyreadstat/pyfunctions.py | 33 +++++++-------------------------- pyreadstat/pyreadstat.py | 22 +++++----------------- 3 files changed, 15 insertions(+), 49 deletions(-) diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index b9440f2..905cdda 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -22,6 +22,7 @@ class MissingRange(TypedDict): """A dictionary to hold the definition of a missing range""" + lo: float hi: float @@ -59,15 +60,11 @@ def __init__(self) -> None: self.readstat_variable_types: dict[str, str] = dict() self.table_name: str = None # type: ignore[assignment] self.missing_ranges: dict[str, list[int | float | str | MissingRange]] = dict() - self.missing_user_values: dict[str, list[int | float | str | MissingRange]] = ( - dict() - ) + self.missing_user_values: dict[str, list[int | float | str | MissingRange]] = dict() self.variable_storage_width: dict[str, int] = dict() self.variable_display_width: dict[str, int] 
= dict() self.variable_alignment: dict[str, str] = dict() - self.variable_measure: dict[ - str, Literal["nominal", "ordinal", "scale", "unknown"] - ] = dict() + self.variable_measure: dict[str, Literal["nominal", "ordinal", "scale", "unknown"]] = dict() self.creation_time: datetime = None # type: ignore[assignment] self.modification_time: datetime = None # type: ignore[assignment] self.mr_sets: dict[str, MRSet] = dict() diff --git a/pyreadstat/pyfunctions.py b/pyreadstat/pyfunctions.py index ed477cc..24a5b39 100644 --- a/pyreadstat/pyfunctions.py +++ b/pyreadstat/pyfunctions.py @@ -53,10 +53,7 @@ def set_value_labels( labels = deepcopy(labels) if var_name in df_copy.columns: # unique does not work for polars Object - if ( - not df_copy.implementation.is_pandas() - and df_copy[var_name].dtype == nw.Object - ): + if not df_copy.implementation.is_pandas() and df_copy[var_name].dtype == nw.Object: unvals = list(set(df_copy[var_name].to_list())) else: unvals = df_copy[var_name].unique() @@ -72,20 +69,13 @@ def set_value_labels( elif not df_copy.implementation.is_pandas() and ( df_copy[var_name].dtype == nw.Object or not all( - [ - type(v) == type(list(labels.values())[0]) - for v in labels.values() - if v is not None - ] + [type(v) == type(list(labels.values())[0]) for v in labels.values() if v is not None] ) ): # polars is very difficult to convince to mix strings and numbers, so we have to do it this way temp = [labels[x] for x in df_copy[var_name]] newser = nw.new_series( - name=var_name, - values=temp, - dtype=nw.Object, - backend=df_copy.implementation, + name=var_name, values=temp, dtype=nw.Object, backend=df_copy.implementation ) df_copy = df_copy.with_columns(newser.alias(var_name)) if formats_as_category or formats_as_ordered_category: @@ -93,17 +83,12 @@ def set_value_labels( warnings.warn(msg, RuntimeWarning) continue # not sure if we get into this situation ever or what would exactly happen, maybe this is not needed? 
- elif ( - not df_copy.implementation.is_pandas() - and df_copy[var_name].dtype == nw.Unknown - ): + elif not df_copy.implementation.is_pandas() and df_copy[var_name].dtype == nw.Unknown: msg = f"It was not possible to apply value formats to variable '{var_name}' due to unknown/not supported data type" warnings.warn(msg, RuntimeWarning) continue else: - df_copy = df_copy.with_columns( - nw.col(var_name).replace_strict(labels) - ) + df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels)) if formats_as_ordered_category: categories = list(set(labels.values())) original_values = list(labels.keys()) @@ -114,13 +99,9 @@ def set_value_labels( if not revdict.get(curcat): revdict[curcat] = orival categories.sort(key=revdict.get) - df_copy = df_copy.with_columns( - nw.col(var_name).cast(nw.Enum(categories)) - ) + df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Enum(categories))) elif formats_as_category: - df_copy = df_copy.with_columns( - nw.col(var_name).cast(nw.Categorical) - ) + df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Categorical)) return df_copy.to_native() diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index a56e9ce..1231153 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -845,9 +845,7 @@ def read_por( metadata.file_format = parser_format if apply_value_formats: - data_frame = set_value_labels( - data_frame, metadata, formats_as_category=formats_as_category - ) + data_frame = set_value_labels(data_frame, metadata, formats_as_category=formats_as_category) return data_frame, metadata @@ -1054,9 +1052,7 @@ def read_file_in_chunks( **kwargs, ) else: - df, meta = read_function( - file_path, row_offset=offset, row_limit=chunksize, **kwargs - ) + df, meta = read_function(file_path, row_offset=offset, row_limit=chunksize, **kwargs) if len(df): yield df, meta offset += chunksize @@ -1155,10 +1151,7 @@ def read_file_multiprocessing( final, meta = read_function(file_path, **kwargs) numrows = 
min(max(numrows - row_offset, 0), row_limit) - divs = [ - numrows // num_processes + (1 if x < numrows % num_processes else 0) - for x in range(num_processes) - ] + divs = [numrows // num_processes + (1 if x < numrows % num_processes else 0) for x in range(num_processes)] offsets = list() prev_offset = row_offset prev_div = 0 @@ -1167,10 +1160,7 @@ def read_file_multiprocessing( prev_offset = offset prev_div = div offsets.append((offset, div)) - jobs = [ - (read_function, file_path, offset, chunksize, kwargs) - for offset, chunksize in offsets - ] + jobs = [(read_function, file_path, offset, chunksize, kwargs) for offset, chunksize in offsets] pool = mp.Pool(processes=num_processes) try: chunks = pool.map(worker, jobs) @@ -1270,9 +1260,7 @@ def write_sav( for col_name, col_format in variable_format.items(): if col_format in formats_presets.keys() and col_name in df.columns: var_width = str(len(str(max(df[col_name])))) - variable_format[col_name] = formats_presets[col_format].format( - var_width=var_width - ) + variable_format[col_name] = formats_presets[col_format].format(var_width=var_width) writer_entry_point( df, From 047096aabecbeb96239df1b9670effb61581a2a6 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 19:09:28 +0000 Subject: [PATCH 13/38] fix comment for type test --- tests/typing_tests.pyi | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi index d7a94a5..c6659bf 100644 --- a/tests/typing_tests.pyi +++ b/tests/typing_tests.pyi @@ -109,9 +109,7 @@ def test_read_sas7bcat_types() -> None: df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] df, meta = read_sas7bcat("file.sas7bcat") - reveal_type( - df - ) # pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray] + reveal_type(df) # pandas.core.frame.DataFrame reveal_type(meta) # metadata_container df, meta = read_sas7bcat("file.sas7bcat", output_format="pandas") @@ -134,9 +132,7 @@ def 
test_read_multiprocessing() -> None: df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True) reveal_type(df) # pandas.core.frame.DataFrame - df, meta = read_file_multiprocessing( - read_sav, "file.sav", metadataonly=True, output_format="polars" - ) + df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="polars") reveal_type(df) # polars.dataframe.frame.DataFrame read_file_multiprocessing(noop, "file.sav", 1, 1) # wrong callable, should error @@ -149,9 +145,7 @@ def test_read_file_in_chunks() -> None: for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True): reveal_type(df) # pandas.core.frame.DataFrame - for df, meta in read_file_in_chunks( - read_sav, "file.sav", metadataonly=True, output_format="polars" - ): + for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True, output_format="polars"): reveal_type(df) # polars.dataframe.frame.DataFrame read_file_in_chunks(noop, "file.sav", 1, 1) # wrong callable, should error From c9258f82692208c5d50c0902d36fa5a8719cff22 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 19:22:27 +0000 Subject: [PATCH 14/38] revert some formatting changes due to line length --- pyreadstat/pyreadstat.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 1231153..bce8dfe 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -1252,10 +1252,7 @@ def write_sav( writer_format = "sav" # formats - formats_presets = { - "restricted_integer": "N{var_width}", - "integer": "F{var_width}.0", - } + formats_presets = {"restricted_integer": "N{var_width}", "integer": "F{var_width}.0"} if variable_format: for col_name, col_format in variable_format.items(): if col_format in formats_presets.keys() and col_name in df.columns: From 353f2133047bbc2eed28a3d518ab62139be6e1e5 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 9 Feb 2026 19:37:49 +0000 Subject: 
[PATCH 15/38] type tests for all other modules --- tests/typing_tests.pyi | 70 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi index c6659bf..16724c0 100644 --- a/tests/typing_tests.pyi +++ b/tests/typing_tests.pyi @@ -201,3 +201,73 @@ def test_write_por_types() -> None: # Test writing with pandas DataFrame and BytesIO buffer buffer = io.BytesIO() write_por(pandas_df, buffer) + +def test_set_value_labels_types() -> None: + df = pd.DataFrame() + metadata = metadata_container() + + df = set_value_labels(df, metadata) + reveal_type(df) # pandas.core.frame.DataFrame + + df = set_value_labels(df, metadata, formats_as_category=True) + reveal_type(df) # pandas.core.frame.DataFrame + + df = set_value_labels(df, metadata, formats_as_ordered_category=True) + reveal_type(df) # pandas.core.frame.DataFrame + +def test_set_catalog_to_sas_types() -> None: + df = pd.DataFrame() + sas_metadata = metadata_container() + catalog_metadata = metadata_container() + + df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata) + reveal_type(df) # pandas.core.frame.DataFrame + reveal_type(meta) # metadata_container + + df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata, formats_as_category=True) + reveal_type(df) # pandas.core.frame.DataFrame + + df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata, formats_as_ordered_category=True) + reveal_type(df) # pandas.core.frame.DataFrame + +def test_worker_types() -> None: + from pyreadstat.worker import Input, worker + + # Test with a valid input tuple + inpt: Input = (read_sav, "file.sav", 0, 100, {}) + result = worker(inpt) + reveal_type(result) # pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, np.ndarray] + + # Test with an invalid input tuple (wrong callable) + inpt_invalid: Input = (lambda x: "A", "file.sav", 0, 100, {}) + +def test_metadata_container_types() -> None: + from pyreadstat.pyclasses 
import metadata_container + + meta = metadata_container() + + meta.missing_ranges = { + "var1": [1, 5], + "var2": [{"hi": 1.0, "lo": 0.0}], + "var3": ["a", "b"], + } + meta.mr_sets = { + "set1": { + "type": "D", + "is_dichotomy": True, + "counted_value": 1, + "label": "Set 1", + "variable_list": ["var1", "var2"], + }, + } + meta.variable_measure = { + "var1": "nominal", + "var2": "ordinal", + "var3": "scale", + "var4": "unknown", + "var5": "another", # should error, not a valid Literal + } + + reveal_type(meta.creation_time) # datetime + reveal_type(meta.modification_time) # datetime + reveal_type(meta.missing_user_values) # dict[str, list[int | float | str | MissingRange]] From d1bac16606b134745bc48032095b832ca5a1278c Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Fri, 13 Feb 2026 12:51:18 +0000 Subject: [PATCH 16/38] fix inconsistencies with PathLike and FileLike. Fix chunk- and multi-read functions accepting FileLike incorrectly. --- pyreadstat/pyreadstat.py | 36 ++++++++++++++++++------------------ tests/typing_tests.pyi | 12 ------------ 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index bce8dfe..1b62dbe 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -921,7 +921,7 @@ def read_sas7bcat( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -935,7 +935,7 @@ def read_file_in_chunks( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -949,7 +949,7 @@ def read_file_in_chunks( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, 
chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -962,7 +962,7 @@ def read_file_in_chunks( ) -> Iterator[tuple[dict[str, np.ndarray], metadata_container]]: ... def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, chunksize: int = 100000, offset: int = 0, limit: int = 0, @@ -982,7 +982,7 @@ def read_file_in_chunks( ---------- read_function : pyreadstat function a pyreadstat reading function - file_path : str, bytes, Path-like object or file-like object + file_path : str, bytes or Path-like object path to the file to be read chunksize : integer, optional size of the chunks to read @@ -1061,7 +1061,7 @@ def read_file_in_chunks( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, @@ -1071,7 +1071,7 @@ def read_file_multiprocessing( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, @@ -1081,7 +1081,7 @@ def read_file_multiprocessing( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, @@ -1090,7 +1090,7 @@ def read_file_multiprocessing( ) -> tuple[dict[str, np.ndarray], metadata_container]: ... 
def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike | FileLike, + file_path: str | bytes | PathLike, num_processes: int | None = None, num_rows: int | None = None, **kwargs, @@ -1105,7 +1105,7 @@ def read_file_multiprocessing( ---------- read_function : pyreadstat function a pyreadstat reading function - file_path : str, bytes, Path-like object or file-like object + file_path : str, bytes or Path-like object path to the file to be read num_processes : integer, optional number of processes to spawn, by default the min 4 and the max cores on the computer @@ -1192,7 +1192,7 @@ def read_file_multiprocessing( def write_sav( df: "DataFrame", # Can't be `IntoDataFrame` because columns get accessed via `__getitem__` - dst_path: str | bytes | PathLike | FileLike, + dst_path: str | bytes | PathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, compress: bool = False, @@ -1211,7 +1211,7 @@ def write_sav( ---------- df : dataframe dataframe to write to sav or zsav - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result sav or zsav file file_label : str, optional a label for the file @@ -1278,7 +1278,7 @@ def write_sav( def write_dta( df: IntoDataFrame, - dst_path: str | bytes | PathLike | FileLike, + dst_path: str | bytes | PathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, version: int = 15, @@ -1293,7 +1293,7 @@ def write_dta( ---------- df : dataframe dataframe to write to sav or zsav - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result dta file file_label : str, optional a label for the file @@ -1334,7 +1334,7 @@ def write_dta( def write_xport( df: IntoDataFrame, - dst_path: str | bytes | PathLike | FileLike, + dst_path: str | bytes | PathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, table_name: str | None = None, @@ 
-1351,7 +1351,7 @@ def write_xport( ---------- df : dataframe dataframe to write to xport - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result xport file file_label : str, optional a label for the file @@ -1385,7 +1385,7 @@ def write_xport( def write_por( df: IntoDataFrame, - dst_path: str | bytes | PathLike | FileLike, + dst_path: str | bytes | PathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, variable_format: dict[str, str] | None = None, @@ -1397,7 +1397,7 @@ def write_por( ---------- df : dataframe data frame to write to por - dst_path : str or pathlib.Path + dst_path : str, bytes or Path-like object full path to the result por file file_label : str, optional a label for the file diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi index 16724c0..9a64aa5 100644 --- a/tests/typing_tests.pyi +++ b/tests/typing_tests.pyi @@ -159,9 +159,6 @@ def test_write_sav_types() -> None: write_sav(polars_df, "file.sav") # Test writing with pandas DataFrame and Path object write_sav(pandas_df, Path("file.sav")) - # Test writing with pandas DataFrame and BytesIO buffer - buffer = io.BytesIO() - write_sav(pandas_df, buffer) def test_write_dta_types() -> None: pandas_df = pd.DataFrame() @@ -172,9 +169,6 @@ def test_write_dta_types() -> None: write_dta(polars_df, "file.dta") # Test writing with pandas DataFrame and Path object write_dta(pandas_df, Path("file.dta")) - # Test writing with pandas DataFrame and BytesIO buffer - buffer = io.BytesIO() - write_dta(pandas_df, buffer) def test_write_xport_types() -> None: pandas_df = pd.DataFrame() @@ -185,9 +179,6 @@ def test_write_xport_types() -> None: write_xport(polars_df, "file.xpt") # Test writing with pandas DataFrame and Path object write_xport(pandas_df, Path("file.xpt")) - # Test writing with pandas DataFrame and BytesIO buffer - buffer = io.BytesIO() - write_xport(pandas_df, buffer) def test_write_por_types() -> None: pandas_df = 
pd.DataFrame() @@ -198,9 +189,6 @@ def test_write_por_types() -> None: write_por(polars_df, "file.por") # Test writing with pandas DataFrame and Path object write_por(pandas_df, Path("file.por")) - # Test writing with pandas DataFrame and BytesIO buffer - buffer = io.BytesIO() - write_por(pandas_df, buffer) def test_set_value_labels_types() -> None: df = pd.DataFrame() From e734fd2a48ebe61a06125a0d3f80b26a80b03f35 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Fri, 13 Feb 2026 13:03:38 +0000 Subject: [PATCH 17/38] file format for write_xport must be 5 or 8 --- pyreadstat/pyreadstat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 1b62dbe..aab965a 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -1338,7 +1338,7 @@ def write_xport( file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, table_name: str | None = None, - file_format_version: int = 8, + file_format_version: Literal[5, 8] = 8, variable_format: dict[str, str] | None = None, ) -> None: """ From 77e5c2fb8ea07ccbe0ab14f77107f072cc70820d Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Fri, 13 Feb 2026 13:23:09 +0000 Subject: [PATCH 18/38] sync type in docstring to type annotation --- pyreadstat/pyreadstat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index aab965a..38aee4c 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -151,8 +151,8 @@ def read_sas7bdat( metadata object. The data frame will be set with the correct column names but no data. dates_as_pandas_datetime : bool, optional by default False. If true dates will be transformed to pandas datetime64 instead of date, effective only for pandas. - catalog_file : str, optional - path to a sas7bcat file. By default is None. 
If not None, will parse the catalog file and replace the values + catalog_file : str, bytes, Path-like object or file-like object, optional + path to a sas7bcat file or file-like object. By default is None. If not None, will parse the catalog file and replace the values by the formats in the catalog, if any appropiate is found. If this is not the behavior you are looking for, Use read_sas7bcat to parse the catalog independently of the sas7bdat and set_catalog_to_sas to apply the resulting format into sas7bdat files. From 2c20526b16bf9b477686cc685171f8bab357e13c Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Fri, 13 Feb 2026 13:31:01 +0000 Subject: [PATCH 19/38] Add ParamSpec to PyreadstatReadFunction type definition --- pyreadstat/pyreadstat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 38aee4c..f07d237 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -17,7 +17,7 @@ from collections.abc import Callable, Iterator import multiprocessing as mp from os import PathLike -from typing import TYPE_CHECKING, Concatenate, Literal, TypeAlias, overload, Protocol +from typing import TYPE_CHECKING, Concatenate, Literal, ParamSpec, TypeAlias, overload, Protocol import narwhals.stable.v2 as nw import numpy as np @@ -52,8 +52,10 @@ def read(self, size: int | None = -1, /) -> bytes: ... def seek(self, pos: int, whence: int = 0, /) -> int: ... 
+_P = ParamSpec("_P") + PyreadstatReadFunction = Callable[ - Concatenate[str | bytes | PathLike | FileLike, ...], + Concatenate[str | bytes | PathLike | FileLike, _P], "tuple[DataFrame | dict[str, np.ndarray], metadata_container]", ] From c5346a0bce1f2177772e74b15865c172a71a69e9 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 16 Feb 2026 17:15:25 +0000 Subject: [PATCH 20/38] add py.typed to package files --- MANIFEST.in | 2 +- setup.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index e470bfc..b7eb7b5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,4 +5,4 @@ include *.pyx recursive-include pyreadstat *.pyx include *.pxd recursive-include pyreadstat *.pxd - +recursive-include pyreadstat py.typed diff --git a/setup.py b/setup.py index f1dc4a3..ce15d08 100644 --- a/setup.py +++ b/setup.py @@ -167,6 +167,8 @@ def is_python_lt_14(): ], ext_modules=extensions, packages=["pyreadstat"], + package_data={"pyreadstat": ["py.typed"]}, + include_package_data=True, data_files=data_files, install_requires=['narwhals>=2.10.1', 'numpy'], license="Apache-2.0", From a2e101aa600b0f6f4d2fab84523a0554b3326cb3 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 16 Feb 2026 17:16:00 +0000 Subject: [PATCH 21/38] change metadata_container to dataclass, added missing file_label attribute --- pyreadstat/pyclasses.py | 48 +++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index 905cdda..bdf5ffc 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -16,6 +16,7 @@ # Typing +from dataclasses import dataclass, field from datetime import datetime from typing import Literal, TypedDict @@ -40,31 +41,32 @@ class MRSet(TypedDict): # Classes +@dataclass class metadata_container: """ This class holds metadata we want to give back to python """ - def __init__(self) -> None: - self.column_names: list[str] = list() - 
self.column_labels: list[str] = list() - self.column_names_to_labels: dict[str, str] = dict() - self.file_encoding: str = None # type: ignore[assignment] - self.number_columns: int = None # type: ignore[assignment] - self.number_rows: int = None # type: ignore[assignment] - self.variable_value_labels: dict[str, dict[float | int, str]] = dict() - self.value_labels: dict[str, dict[float | int, str]] = dict() - self.variable_to_label: dict[str, str] = dict() - self.notes: list[str] = list() - self.original_variable_types: dict[str, str] = dict() - self.readstat_variable_types: dict[str, str] = dict() - self.table_name: str = None # type: ignore[assignment] - self.missing_ranges: dict[str, list[int | float | str | MissingRange]] = dict() - self.missing_user_values: dict[str, list[int | float | str | MissingRange]] = dict() - self.variable_storage_width: dict[str, int] = dict() - self.variable_display_width: dict[str, int] = dict() - self.variable_alignment: dict[str, str] = dict() - self.variable_measure: dict[str, Literal["nominal", "ordinal", "scale", "unknown"]] = dict() - self.creation_time: datetime = None # type: ignore[assignment] - self.modification_time: datetime = None # type: ignore[assignment] - self.mr_sets: dict[str, MRSet] = dict() + column_names: list[str] = field(default_factory=list) + column_labels: list[str] = field(default_factory=list) + column_names_to_labels: dict[str, str] = field(default_factory=dict) + file_encoding: str = "" + file_label: str = "" + number_columns: int = 0 + number_rows: int = 0 + variable_value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict) + value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict) + variable_to_label: dict[str, str] = field(default_factory=dict) + notes: list[str] = field(default_factory=list) + original_variable_types: dict[str, str] = field(default_factory=dict) + readstat_variable_types: dict[str, str] = field(default_factory=dict) + table_name: str = "" + 
missing_ranges: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict) + missing_user_values: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict) + variable_storage_width: dict[str, int] = field(default_factory=dict) + variable_display_width: dict[str, int] = field(default_factory=dict) + variable_alignment: dict[str, str] = field(default_factory=dict) + variable_measure: dict[str, Literal["nominal", "ordinal", "scale", "unknown"]] = field(default_factory=dict) + creation_time: datetime = field(default_factory=datetime.now) + modification_time: datetime = field(default_factory=datetime.now) + mr_sets: dict[str, MRSet] = field(default_factory=dict) From 8ac87b70781674e1f3785f6b9179480262bae242 Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Mon, 16 Feb 2026 17:17:08 +0000 Subject: [PATCH 22/38] Change narwhals frame type to pandas/polars types and tweaks to path and numpy types --- pyreadstat/pyreadstat.py | 134 ++++++++++++++++++++------------------- 1 file changed, 69 insertions(+), 65 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index f07d237..ef2f075 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -21,7 +21,7 @@ import narwhals.stable.v2 as nw import numpy as np -from narwhals.typing import IntoDataFrame +import numpy.typing as npt from ._readstat_parser import parser_entry_point from ._readstat_writer import writer_entry_point, PyreadstatError @@ -52,11 +52,15 @@ def read(self, size: int | None = -1, /) -> bytes: ... def seek(self, pos: int, whence: int = 0, /) -> int: ... 
-_P = ParamSpec("_P") +FilePathLike: TypeAlias = str | bytes | PathLike[str] | PathLike[bytes] +FilePathorBuffer: TypeAlias = FilePathLike | FileLike + +DictOutput = dict[str, npt.NDArray[np.generic]] +_P = ParamSpec("_P") PyreadstatReadFunction = Callable[ - Concatenate[str | bytes | PathLike | FileLike, _P], - "tuple[DataFrame | dict[str, np.ndarray], metadata_container]", + Concatenate[FilePathorBuffer, _P], + "tuple[DataFrame | DictOutput, metadata_container]" ] @@ -67,10 +71,10 @@ def seek(self, pos: int, whence: int = 0, /) -> int: ... @overload def read_sas7bdat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., - catalog_file: str | bytes | PathLike | FileLike | None = ..., + catalog_file: FilePathorBuffer | None = ..., formats_as_category: bool = ..., formats_as_ordered_category: bool = ..., encoding: str | None = ..., @@ -86,10 +90,10 @@ def read_sas7bdat( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_sas7bdat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., - catalog_file: str | bytes | PathLike | FileLike | None = ..., + catalog_file: FilePathorBuffer | None = ..., formats_as_category: bool = ..., formats_as_ordered_category: bool = ..., encoding: str | None = ..., @@ -105,10 +109,10 @@ def read_sas7bdat( ) -> "tuple[PolarsDataFrame, metadata_container]": ... 
@overload def read_sas7bdat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., - catalog_file: str | bytes | PathLike | FileLike | None = ..., + catalog_file: FilePathorBuffer | None = ..., formats_as_category: bool = ..., formats_as_ordered_category: bool = ..., encoding: str | None = ..., @@ -121,12 +125,12 @@ def read_sas7bdat( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> tuple[dict[str, np.ndarray], metadata_container]: ... +) -> tuple[DictOutput, metadata_container]: ... def read_sas7bdat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, - catalog_file: str | bytes | PathLike | FileLike | None = None, + catalog_file: FilePathorBuffer | None = None, formats_as_category: bool = True, formats_as_ordered_category: bool = False, encoding: str | None = None, @@ -139,7 +143,7 @@ def read_sas7bdat( extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, -) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SAS sas7bdat file. It accepts the path to a sas7bcat. @@ -243,7 +247,7 @@ def read_sas7bdat( @overload def read_xport( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., encoding: str | None = ..., @@ -258,7 +262,7 @@ def read_xport( ) -> "tuple[PandasDataFrame, metadata_container]": ... 
@overload def read_xport( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., encoding: str | None = ..., @@ -273,7 +277,7 @@ def read_xport( ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_xport( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., encoding: str | None = ..., @@ -285,9 +289,9 @@ def read_xport( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> tuple[dict[str, np.ndarray], metadata_container]: ... +) -> tuple[DictOutput, metadata_container]: ... def read_xport( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, encoding: str | None = None, @@ -299,7 +303,7 @@ def read_xport( extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, -) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SAS xport file. @@ -370,7 +374,7 @@ def read_xport( @overload def read_dta( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -389,7 +393,7 @@ def read_dta( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_dta( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -408,7 +412,7 @@ def read_dta( ) -> "tuple[PolarsDataFrame, metadata_container]": ... 
@overload def read_dta( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -424,9 +428,9 @@ def read_dta( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> tuple[dict[str, np.ndarray], metadata_container]: ... +) -> tuple[DictOutput, metadata_container]: ... def read_dta( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, apply_value_formats: bool = False, @@ -442,7 +446,7 @@ def read_dta( extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, -) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a STATA dta file @@ -538,7 +542,7 @@ def read_dta( @overload def read_sav( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -557,7 +561,7 @@ def read_sav( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_sav( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -576,7 +580,7 @@ def read_sav( ) -> "tuple[PolarsDataFrame, metadata_container]": ... 
@overload def read_sav( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -592,9 +596,9 @@ def read_sav( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> tuple[dict[str, np.ndarray], metadata_container]: ... +) -> tuple[DictOutput, metadata_container]: ... def read_sav( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, apply_value_formats: bool = False, @@ -610,7 +614,7 @@ def read_sav( extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, -) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SPSS sav or zsav (compressed) files @@ -707,7 +711,7 @@ def read_sav( @overload def read_por( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -724,7 +728,7 @@ def read_por( ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_por( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -741,7 +745,7 @@ def read_por( ) -> "tuple[PolarsDataFrame, metadata_container]": ... 
@overload def read_por( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = ..., dates_as_pandas_datetime: bool = ..., apply_value_formats: bool = ..., @@ -755,9 +759,9 @@ def read_por( extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., extra_time_formats: list[str] | None = ..., -) -> tuple[dict[str, np.ndarray], metadata_container]: ... +) -> tuple[DictOutput, metadata_container]: ... def read_por( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, metadataonly: bool = False, dates_as_pandas_datetime: bool = False, apply_value_formats: bool = False, @@ -771,7 +775,7 @@ def read_por( extra_datetime_formats: list[str] | None = None, extra_date_formats: list[str] | None = None, extra_time_formats: list[str] | None = None, -) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SPSS por file. Files are assumed to be UTF-8 encoded, the encoding cannot be set to other. @@ -854,27 +858,27 @@ def read_por( @overload def read_sas7bcat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, encoding: str | None = ..., output_format: Literal["pandas"] | None = ..., ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_sas7bcat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, encoding: str | None = ..., output_format: Literal["polars"] = "polars", ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_sas7bcat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, encoding: str | None = ..., output_format: Literal["dict"] = "dict", -) -> tuple[dict[str, np.ndarray], metadata_container]: ... +) -> tuple[DictOutput, metadata_container]: ... 
def read_sas7bcat( - filename_path: str | bytes | PathLike | FileLike, + filename_path: FilePathorBuffer, encoding: str | None = None, output_format: Literal["pandas", "polars", "dict"] | None = None, -) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": +) -> "tuple[DataFrame | DictOutput, metadata_container]": r""" Read a SAS sas7bcat file. The returning dataframe will be empty. The metadata object will contain a dictionary value_labels that contains the formats. When parsing the sas7bdat file, in the metadata, the dictionary @@ -923,7 +927,7 @@ def read_sas7bcat( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -937,7 +941,7 @@ def read_file_in_chunks( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -951,7 +955,7 @@ def read_file_in_chunks( @overload def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, chunksize: int = ..., offset: int = ..., limit: int = ..., @@ -961,10 +965,10 @@ def read_file_in_chunks( *, output_format: Literal["dict"] = "dict", **kwargs, -) -> Iterator[tuple[dict[str, np.ndarray], metadata_container]]: ... +) -> Iterator[tuple[DictOutput, metadata_container]]: ... 
def read_file_in_chunks( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, chunksize: int = 100000, offset: int = 0, limit: int = 0, @@ -972,7 +976,7 @@ def read_file_in_chunks( num_processes: int = 4, num_rows: int | None = None, **kwargs, -) -> "Iterator[tuple[DataFrame | dict[str, np.ndarray], metadata_container]]": +) -> "Iterator[tuple[DataFrame | DictOutput, metadata_container]]": """ Returns a generator that will allow to read a file in chunks. @@ -1063,7 +1067,7 @@ def read_file_in_chunks( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, @@ -1073,7 +1077,7 @@ def read_file_multiprocessing( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, @@ -1083,20 +1087,20 @@ def read_file_multiprocessing( @overload def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, output_format: Literal["dict"] = "dict", **kwargs, -) -> tuple[dict[str, np.ndarray], metadata_container]: ... +) -> tuple[DictOutput, metadata_container]: ... def read_file_multiprocessing( read_function: PyreadstatReadFunction, - file_path: str | bytes | PathLike, + file_path: FilePathLike, num_processes: int | None = None, num_rows: int | None = None, **kwargs, -) -> "tuple[DataFrame | dict[str, np.ndarray], metadata_container]": +) -> "tuple[DataFrame | DictOutput, metadata_container]": """ Reads a file in parallel using multiprocessing. 
For Xport, Por and some defective sav files where the number of rows in the dataset canot be obtained from the metadata, @@ -1193,8 +1197,8 @@ def read_file_multiprocessing( def write_sav( - df: "DataFrame", # Can't be `IntoDataFrame` because columns get accessed via `__getitem__` - dst_path: str | bytes | PathLike, + df: "DataFrame", + dst_path: FilePathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, compress: bool = False, @@ -1279,8 +1283,8 @@ def write_sav( def write_dta( - df: IntoDataFrame, - dst_path: str | bytes | PathLike, + df: "DataFrame", + dst_path: FilePathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, version: int = 15, @@ -1335,8 +1339,8 @@ def write_dta( def write_xport( - df: IntoDataFrame, - dst_path: str | bytes | PathLike, + df: "DataFrame", + dst_path: FilePathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, table_name: str | None = None, @@ -1386,11 +1390,11 @@ def write_xport( def write_por( - df: IntoDataFrame, - dst_path: str | bytes | PathLike, + df: "DataFrame", + dst_path: FilePathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, - variable_format: dict[str, str] | None = None, + variable_format: dict[str, str] | None = None ) -> None: """ Writes a dataframe to a SPSS POR file. 
From b82fc504b866ae5b98f0fbc94f7b7e0ccc267036 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Tue, 24 Feb 2026 21:26:57 +0000 Subject: [PATCH 23/38] Revert metadata values to optional and set MRSet["counted_value"] as optional too --- pyreadstat/pyclasses.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index bdf5ffc..478002d 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -33,7 +33,7 @@ class MRSet(TypedDict): type: Literal["D", "C"] is_dichotomy: bool - counted_value: int + counted_value: int | None label: str variable_list: list[str] @@ -50,23 +50,23 @@ class metadata_container: column_names: list[str] = field(default_factory=list) column_labels: list[str] = field(default_factory=list) column_names_to_labels: dict[str, str] = field(default_factory=dict) - file_encoding: str = "" - file_label: str = "" + file_encoding: str | None = None + file_label: str | None = None number_columns: int = 0 - number_rows: int = 0 + number_rows: int | None = None variable_value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict) value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict) variable_to_label: dict[str, str] = field(default_factory=dict) notes: list[str] = field(default_factory=list) original_variable_types: dict[str, str] = field(default_factory=dict) readstat_variable_types: dict[str, str] = field(default_factory=dict) - table_name: str = "" + table_name: str | None = None missing_ranges: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict) missing_user_values: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict) variable_storage_width: dict[str, int] = field(default_factory=dict) variable_display_width: dict[str, int] = field(default_factory=dict) variable_alignment: dict[str, str] = field(default_factory=dict) variable_measure: dict[str, Literal["nominal", 
"ordinal", "scale", "unknown"]] = field(default_factory=dict) - creation_time: datetime = field(default_factory=datetime.now) - modification_time: datetime = field(default_factory=datetime.now) + creation_time: datetime | None = None + modification_time: datetime | None = None mr_sets: dict[str, MRSet] = field(default_factory=dict) From 49eea5749753c2251d861408af475a619815a988 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Tue, 24 Feb 2026 21:27:29 +0000 Subject: [PATCH 24/38] implement type test cases for pytest --- tests/test_mypy_setup.ini | 4 + tests/test_typing.yml | 349 ++++++++++++++++++++++++++++++++++++++ tests/typing_tests.pyi | 261 ---------------------------- 3 files changed, 353 insertions(+), 261 deletions(-) create mode 100644 tests/test_mypy_setup.ini create mode 100644 tests/test_typing.yml delete mode 100644 tests/typing_tests.pyi diff --git a/tests/test_mypy_setup.ini b/tests/test_mypy_setup.ini new file mode 100644 index 0000000..2868ef8 --- /dev/null +++ b/tests/test_mypy_setup.ini @@ -0,0 +1,4 @@ +[mypy] +ignore_missing_imports = True +strict_optional = True +follow_imports = silent \ No newline at end of file diff --git a/tests/test_typing.yml b/tests/test_typing.yml new file mode 100644 index 0000000..2e1bdfa --- /dev/null +++ b/tests/test_typing.yml @@ -0,0 +1,349 @@ +# requires pytest-mypy-plugins +# command to run: pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini --mypy-only-local-stub + +# yaml-language-server: $schema=https://raw.githubusercontent.com/typeddjango/pytest-mypy-plugins/master/pytest_mypy_plugins/schema.json + +- case: read_sav_default_types + main: | + from pyreadstat import read_sav + df, meta = read_sav("file.sav") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_sav_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - 
output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + main: | + from pyreadstat import read_sav + df: object + df, meta = read_sav("file.sav", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_sav_buffer_types + main: | + import io + from pyreadstat import read_sav + buffer = io.BytesIO() + df, meta = read_sav(buffer) + +- case: read_dta_default_types + main: | + from pyreadstat import read_dta + df, meta = read_dta("file.dta") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_dta_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + main: | + from pyreadstat import read_dta + df, meta = read_dta("file.dta", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_dta_buffer_types + main: | + import io + from pyreadstat import read_dta + buffer = io.BytesIO() + df, meta = read_dta(buffer) + +- case: read_por_default_types + main: | + from pyreadstat import read_por + df, meta = read_por("file.por") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_por_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: 
"dict"
+      expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]"
+  main: |
+    from pyreadstat import read_por
+    df, meta = read_por("file.por", output_format="{{ output_format }}")
+    reveal_type(df) # N: Revealed type is "{{ expected_type }}"
+
+- case: read_por_buffer_types
+  main: |
+    import io
+    from pyreadstat import read_por
+    buffer = io.BytesIO()
+    df, meta = read_por(buffer)
+
+- case: read_sas7bdat_default_types
+  main: |
+    from pyreadstat import read_sas7bdat
+    df, meta = read_sas7bdat("file.sas7bdat")
+    reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame"
+    reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container"
+
+- case: read_sas7bdat_output_types
+  parametrized:
+    - output_format: "pandas"
+      expected_type: "pandas.core.frame.DataFrame"
+    - output_format: "polars"
+      expected_type: "polars.dataframe.frame.DataFrame"
+    - output_format: "dict"
+      expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]"
+  main: |
+    from pyreadstat import read_sas7bdat
+    df, meta = read_sas7bdat("file.sas7bdat", output_format="{{ output_format }}")
+    reveal_type(df) # N: Revealed type is "{{ expected_type }}"
+
+- case: read_sas7bdat_buffer_types
+  main: |
+    import io
+    from pyreadstat import read_sas7bdat
+    buffer = io.BytesIO()
+    df, meta = read_sas7bdat(buffer)
+
+- case: read_xport_default_types
+  main: |
+    from pyreadstat import read_xport
+    df, meta = read_xport("file.xpt")
+    reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame"
+    reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container"
+
+- case: read_xport_output_types
+  parametrized:
+    - output_format: "pandas"
+      expected_type: "pandas.core.frame.DataFrame"
+    - output_format: "polars"
+      expected_type: "polars.dataframe.frame.DataFrame"
+    - output_format: "dict"
+      expected_type: 
"builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + main: | + from pyreadstat import read_xport + df, meta = read_xport("file.xpt", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_xport_buffer_types + main: | + import io + from pyreadstat import read_xport + buffer = io.BytesIO() + df, meta = read_xport(buffer) + +- case: read_sas7bcat_default_types + main: | + from pyreadstat import read_sas7bcat + df, meta = read_sas7bcat("file.sas7bcat") + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_sas7bcat_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + main: | + from pyreadstat import read_sas7bcat + df, meta = read_sas7bcat("file.sas7bcat", output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_sas7bcat_buffer_types + main: | + import io + from pyreadstat import read_sas7bcat + buffer = io.BytesIO() + df, meta = read_sas7bcat(buffer) + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + +- case: read_file_multiprocessing_default_types + main: | + from pyreadstat import read_file_multiprocessing, read_sav + df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True) + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_file_multiprocessing_output_types + parametrized: + - output_format: "pandas" + expected_type: 
"pandas.core.frame.DataFrame"
+    - output_format: "polars"
+      expected_type: "polars.dataframe.frame.DataFrame"
+    - output_format: "dict"
+      expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]"
+  main: |
+    from pyreadstat import read_file_multiprocessing, read_sav
+    df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}")
+    reveal_type(df) # N: Revealed type is "{{ expected_type }}"
+
+- case: read_file_multiprocessing_invalid_callable
+  main: |
+    from pyreadstat import read_file_multiprocessing
+    def noop(a: int) -> int:
+        return a
+    read_file_multiprocessing(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_multiprocessing" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[int, ...], dtype[generic[Any]]]], metadata_container]" [arg-type]
+  out: |
+    main:4: note: This is likely because "noop" has named arguments: "a". 
Consider marking them positional-only + +- case: read_file_in_chunks_default_types + main: | + from pyreadstat import read_file_in_chunks, read_sav + for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True): + reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" + reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" + +- case: read_file_in_chunks_output_types + parametrized: + - output_format: "pandas" + expected_type: "pandas.core.frame.DataFrame" + - output_format: "polars" + expected_type: "polars.dataframe.frame.DataFrame" + - output_format: "dict" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + main: | + from pyreadstat import read_file_in_chunks, read_sav + for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}"): + reveal_type(df) # N: Revealed type is "{{ expected_type }}" + +- case: read_file_in_chunks_invalid_callable + main: | + from pyreadstat import read_file_in_chunks + def noop(a: int) -> int: + return a + read_file_in_chunks(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_in_chunks" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[int, ...], dtype[generic[Any]]]], metadata_container]" [arg-type] + out: | + main:4: note: This is likely because "noop" has named arguments: "a". 
Consider marking them positional-only + +- case: write_sav_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_sav + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_sav(pandas_df, "file.sav") + write_sav(polars_df, "file.sav") + write_sav(pandas_df, Path("file.sav")) + +- case: write_dta_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_dta + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_dta(pandas_df, "file.dta") + write_dta(polars_df, "file.dta") + write_dta(pandas_df, Path("file.dta")) + +- case: write_xport_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_xport + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_xport(pandas_df, "file.xpt") + write_xport(polars_df, "file.xpt") + write_xport(pandas_df, Path("file.xpt")) + +- case: write_por_types + main: | + import pandas as pd + import polars as pl + from pathlib import Path + from pyreadstat import write_por + pandas_df = pd.DataFrame() + polars_df = pl.DataFrame() + write_por(pandas_df, "file.por") + write_por(polars_df, "file.por") + write_por(pandas_df, Path("file.por")) + +- case: set_value_labels_types + parametrized: + - backend: "pandas" + expected_value: "pandas.core.frame.DataFrame" + - backend: "polars" + expected_value: "polars.dataframe.frame.DataFrame" + main: | + import {{ backend }} + from pyreadstat import set_value_labels, metadata_container + df = {{ backend }}.DataFrame() + metadata = metadata_container() + df = set_value_labels(df, metadata) + reveal_type(df) # N: Revealed type is "{{ expected_value }}" + +- case: set_catalog_to_sas_types + parametrized: + - backend: "pandas" + expected_value: "pandas.core.frame.DataFrame" + - backend: "polars" + expected_value: "polars.dataframe.frame.DataFrame" + main: | + import {{ backend }} + from 
pyreadstat import set_catalog_to_sas, metadata_container + df = {{ backend }}.DataFrame() + sas_metadata = metadata_container() + catalog_metadata = metadata_container() + df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata) + reveal_type(df) # N: Revealed type is "{{ expected_value }}" + +- case: worker_types + main: | + from pyreadstat import read_sav + from pyreadstat.worker import Input, worker + inpt: Input = (read_sav, "test_data/file.sav", 0, 100, {}) + result = worker(inpt) + reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | builtins.dict[builtins.str, numpy.ndarray[Any, Any]]" + inpt_invalid: Input = (lambda x: "A", "test_data/file.sav", 0, 100, {}) # E: Incompatible types in assignment (expression has type "tuple[Callable[[Any], str], str, int, int, dict[str, Any]]", variable has type "tuple[def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[int, ...], dtype[generic[Any]]]], metadata_container], str | bytes | PathLike[Any] | FileLike, int, int, dict[str, Any]]") [assignment] + out: | + main:6: error: Cannot infer type of lambda [misc] + +- case: metadata_container_types + main: | + from pyreadstat.pyclasses import metadata_container + meta = metadata_container() + meta.missing_ranges = { + "var1": [1, 5], + "var2": [{"hi": 1.0, "lo": 0.0}], + "var3": ["a", "b"], + "var4": {1, 2, 3}, # E: Dict entry 3 has incompatible type "str": "set[int]"; expected "str": "list[int | float | str | MissingRange]" [dict-item] + } + meta.mr_sets = { + "set1": { + "type": "D", + "is_dichotomy": True, + "counted_value": 1, + "label": "Set 1", + "variable_list": ["var1", "var2"], + }, + "set2": { + "type": "C", + "is_dichotomy": False, + "counted_value": None, + "label": "Set 2", + "variable_list": ["var3"], + }, + "set3": {} # E: Missing keys ("type", "is_dichotomy", 
"counted_value", "label", "variable_list") for TypedDict "MRSet" [typeddict-item] + } + meta.variable_measure = { + "var1": "nominal", + "var2": "ordinal", + "var3": "scale", + "var4": "unknown", + "var5": "another", # E: Dict entry 4 has incompatible type "str": "Literal['another']"; expected "str": "Literal['nominal', 'ordinal', 'scale', 'unknown']" [dict-item] + } diff --git a/tests/typing_tests.pyi b/tests/typing_tests.pyi deleted file mode 100644 index 9a64aa5..0000000 --- a/tests/typing_tests.pyi +++ /dev/null @@ -1,261 +0,0 @@ -# Run with `mypy tests/typing_tests.pyi` - -import io -from pathlib import Path -from typing import reveal_type - -import numpy as np -import pandas as pd -import polars as pl - -from pyreadstat import * - -def test_read_sav_default() -> None: - df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] - - df, meta = read_sav("file.sav") - reveal_type(df) # pandas.core.frame.DataFrame - reveal_type(meta) # metadata_container - - df, meta = read_sav("file.sav", output_format="pandas") - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = read_sav("file.sav", output_format="polars") - reveal_type(df) # polars.dataframe.frame.DataFrame - - df, meta = read_sav("file.sav", output_format="dict") - reveal_type(df) # dict[str, ndarray] - - buffer = io.BytesIO() - df, meta = read_sav(buffer) - -def test_read_dta_types() -> None: - df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] - - df, meta = read_dta("file.dta") - reveal_type(df) # pandas.core.frame.DataFrame - reveal_type(meta) # metadata_container - - df, meta = read_dta("file.dta", output_format="pandas") - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = read_dta("file.dta", output_format="polars") - reveal_type(df) # polars.dataframe.frame.DataFrame - - df, meta = read_dta("file.dta", output_format="dict") - reveal_type(df) # dict[str, ndarray] - - buffer = io.BytesIO() - df, meta = read_dta(buffer) - -def test_read_por_types() -> None: - df: pd.DataFrame | 
pl.DataFrame | dict[str, np.ndarray] - - df, meta = read_por("file.por") - reveal_type(df) # pandas.core.frame.DataFrame - reveal_type(meta) # metadata_container - - df, meta = read_por("file.por", output_format="pandas") - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = read_por("file.por", output_format="polars") - reveal_type(df) # polars.dataframe.frame.DataFrame - - df, meta = read_por("file.por", output_format="dict") - reveal_type(df) # dict[str, ndarray] - - buffer = io.BytesIO() - df, meta = read_por(buffer) - -def test_read_sas7bdat_types() -> None: - df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] - - df, meta = read_sas7bdat("file.sas7bdat") - reveal_type(df) # pandas.core.frame.DataFrame - reveal_type(meta) # metadata_container - - df, meta = read_sas7bdat("file.sas7bdat", output_format="pandas") - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = read_sas7bdat("file.sas7bdat", output_format="polars") - reveal_type(df) # polars.dataframe.frame.DataFrame - - df, meta = read_sas7bdat("file.sas7bdat", output_format="dict") - reveal_type(df) # dict[str, ndarray] - - buffer = io.BytesIO() - df, meta = read_sas7bdat(buffer) - -def test_read_xport_types() -> None: - df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] - - df, meta = read_xport("file.xpt") - reveal_type(df) # pandas.core.frame.DataFrame - reveal_type(meta) # metadata_container - - df, meta = read_xport("file.xpt", output_format="pandas") - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = read_xport("file.xpt", output_format="polars") - reveal_type(df) # polars.dataframe.frame.DataFrame - - df, meta = read_xport("file.xpt", output_format="dict") - reveal_type(df) # dict[str, ndarray] - - buffer = io.BytesIO() - df, meta = read_xport(buffer) - -def test_read_sas7bcat_types() -> None: - df: pd.DataFrame | pl.DataFrame | dict[str, np.ndarray] - - df, meta = read_sas7bcat("file.sas7bcat") - reveal_type(df) # pandas.core.frame.DataFrame - 
reveal_type(meta) # metadata_container - - df, meta = read_sas7bcat("file.sas7bcat", output_format="pandas") - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = read_sas7bcat("file.sas7bcat", output_format="polars") - reveal_type(df) # polars.dataframe.frame.DataFrame - - df, meta = read_sas7bcat("file.sas7bcat", output_format="dict") - reveal_type(df) # dict[str, ndarray] - - buffer = io.BytesIO() - df, meta = read_sas7bcat(buffer) - -def test_read_multiprocessing() -> None: - df: pd.DataFrame | pl.DataFrame - def noop(a: int, /) -> int: - return a - - df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True) - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="polars") - reveal_type(df) # polars.dataframe.frame.DataFrame - - read_file_multiprocessing(noop, "file.sav", 1, 1) # wrong callable, should error - -def test_read_file_in_chunks() -> None: - df: pd.DataFrame | pl.DataFrame - def noop(a: int, /) -> int: - return a - - for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True): - reveal_type(df) # pandas.core.frame.DataFrame - - for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True, output_format="polars"): - reveal_type(df) # polars.dataframe.frame.DataFrame - - read_file_in_chunks(noop, "file.sav", 1, 1) # wrong callable, should error - -def test_write_sav_types() -> None: - pandas_df = pd.DataFrame() - polars_df = pl.DataFrame() - # Test writing with pandas DataFrame and string path - write_sav(pandas_df, "file.sav") - # Test writing with polars DataFrame and string path - write_sav(polars_df, "file.sav") - # Test writing with pandas DataFrame and Path object - write_sav(pandas_df, Path("file.sav")) - -def test_write_dta_types() -> None: - pandas_df = pd.DataFrame() - polars_df = pl.DataFrame() - # Test writing with pandas DataFrame and string path - write_dta(pandas_df, "file.dta") - # Test 
writing with polars DataFrame and string path - write_dta(polars_df, "file.dta") - # Test writing with pandas DataFrame and Path object - write_dta(pandas_df, Path("file.dta")) - -def test_write_xport_types() -> None: - pandas_df = pd.DataFrame() - polars_df = pl.DataFrame() - # Test writing with pandas DataFrame and string path - write_xport(pandas_df, "file.xpt") - # Test writing with polars DataFrame and string path - write_xport(polars_df, "file.xpt") - # Test writing with pandas DataFrame and Path object - write_xport(pandas_df, Path("file.xpt")) - -def test_write_por_types() -> None: - pandas_df = pd.DataFrame() - polars_df = pl.DataFrame() - # Test writing with pandas DataFrame and string path - write_por(pandas_df, "file.por") - # Test writing with polars DataFrame and string path - write_por(polars_df, "file.por") - # Test writing with pandas DataFrame and Path object - write_por(pandas_df, Path("file.por")) - -def test_set_value_labels_types() -> None: - df = pd.DataFrame() - metadata = metadata_container() - - df = set_value_labels(df, metadata) - reveal_type(df) # pandas.core.frame.DataFrame - - df = set_value_labels(df, metadata, formats_as_category=True) - reveal_type(df) # pandas.core.frame.DataFrame - - df = set_value_labels(df, metadata, formats_as_ordered_category=True) - reveal_type(df) # pandas.core.frame.DataFrame - -def test_set_catalog_to_sas_types() -> None: - df = pd.DataFrame() - sas_metadata = metadata_container() - catalog_metadata = metadata_container() - - df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata) - reveal_type(df) # pandas.core.frame.DataFrame - reveal_type(meta) # metadata_container - - df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata, formats_as_category=True) - reveal_type(df) # pandas.core.frame.DataFrame - - df, meta = set_catalog_to_sas(df, sas_metadata, catalog_metadata, formats_as_ordered_category=True) - reveal_type(df) # pandas.core.frame.DataFrame - -def test_worker_types() -> None: - 
from pyreadstat.worker import Input, worker - - # Test with a valid input tuple - inpt: Input = (read_sav, "file.sav", 0, 100, {}) - result = worker(inpt) - reveal_type(result) # pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, np.ndarray] - - # Test with an invalid input tuple (wrong callable) - inpt_invalid: Input = (lambda x: "A", "file.sav", 0, 100, {}) - -def test_metadata_container_types() -> None: - from pyreadstat.pyclasses import metadata_container - - meta = metadata_container() - - meta.missing_ranges = { - "var1": [1, 5], - "var2": [{"hi": 1.0, "lo": 0.0}], - "var3": ["a", "b"], - } - meta.mr_sets = { - "set1": { - "type": "D", - "is_dichotomy": True, - "counted_value": 1, - "label": "Set 1", - "variable_list": ["var1", "var2"], - }, - } - meta.variable_measure = { - "var1": "nominal", - "var2": "ordinal", - "var3": "scale", - "var4": "unknown", - "var5": "another", # should error, not a valid Literal - } - - reveal_type(meta.creation_time) # datetime - reveal_type(meta.modification_time) # datetime - reveal_type(meta.missing_user_values) # dict[str, list[int | float | str | MissingRange]] From cf383f8ca66347427cc45732d73842195903437e Mon Sep 17 00:00:00 2001 From: nachomaiz Date: Thu, 5 Mar 2026 15:55:10 +0000 Subject: [PATCH 25/38] fix number_columns type default to None --- pyreadstat/pyclasses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyreadstat/pyclasses.py b/pyreadstat/pyclasses.py index 478002d..11121db 100644 --- a/pyreadstat/pyclasses.py +++ b/pyreadstat/pyclasses.py @@ -52,7 +52,7 @@ class metadata_container: column_names_to_labels: dict[str, str] = field(default_factory=dict) file_encoding: str | None = None file_label: str | None = None - number_columns: int = 0 + number_columns: int | None = None number_rows: int | None = None variable_value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict) value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict) 
From 48f0d9fb1f3e7713412f934355234bf5c976ce38 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Sun, 22 Mar 2026 18:02:16 +0000 Subject: [PATCH 26/38] fix numpy typing for Python 3.11+ --- tests/test_typing.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_typing.yml b/tests/test_typing.yml index 2e1bdfa..dfee1f2 100644 --- a/tests/test_typing.yml +++ b/tests/test_typing.yml @@ -1,5 +1,5 @@ # requires pytest-mypy-plugins -# command to run: pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini --mypy-only-local-stub +# command to run: pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini # yaml-language-server: $schema=https://raw.githubusercontent.com/typeddjango/pytest-mypy-plugins/master/pytest_mypy_plugins/schema.json @@ -17,7 +17,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_sav df: object @@ -45,7 +45,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_dta df, meta = read_dta("file.dta", output_format="{{ output_format }}") @@ -72,7 +72,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: 
"builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_por df, meta = read_por("file.por", output_format="{{ output_format }}") @@ -99,7 +99,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_sas7bdat df, meta = read_sas7bdat("file.sas7bdat", output_format="{{ output_format }}") @@ -126,7 +126,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_xport df, meta = read_xport("file.xpt", output_format="{{ output_format }}") @@ -153,7 +153,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_sas7bcat df, meta = read_sas7bcat("file.sas7bcat", output_format="{{ output_format }}") @@ -181,7 +181,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, 
numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_file_multiprocessing, read_sav df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="polars") @@ -192,7 +192,7 @@ from pyreadstat import read_file_multiprocessing def noop(a: int) -> int: return a - read_file_multiprocessing(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_multiprocessing" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[int, ...], dtype[generic[Any]]]], metadata_container]" [arg-type] + read_file_multiprocessing(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_multiprocessing" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[Any, ...], dtype[generic[Any]]]], metadata_container]" [arg-type] out: | main:4: note: This is likely because "noop" has named arguments: "a". 
Consider marking them positional-only @@ -210,7 +210,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[builtins.int, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_file_in_chunks, read_sav for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}"): @@ -221,7 +221,7 @@ from pyreadstat import read_file_in_chunks def noop(a: int) -> int: return a - read_file_in_chunks(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_in_chunks" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[int, ...], dtype[generic[Any]]]], metadata_container]" [arg-type] + read_file_in_chunks(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_in_chunks" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[Any, ...], dtype[generic[Any]]]], metadata_container]" [arg-type] out: | main:4: note: This is likely because "noop" has named arguments: "a". 
Consider marking them positional-only @@ -308,8 +308,8 @@ from pyreadstat.worker import Input, worker inpt: Input = (read_sav, "test_data/file.sav", 0, 100, {}) result = worker(inpt) - reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | builtins.dict[builtins.str, numpy.ndarray[Any, Any]]" - inpt_invalid: Input = (lambda x: "A", "test_data/file.sav", 0, 100, {}) # E: Incompatible types in assignment (expression has type "tuple[Callable[[Any], str], str, int, int, dict[str, Any]]", variable has type "tuple[def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[int, ...], dtype[generic[Any]]]], metadata_container], str | bytes | PathLike[Any] | FileLike, int, int, dict[str, Any]]") [assignment] + reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[Any]]]" + inpt_invalid: Input = (lambda x: "A", "test_data/file.sav", 0, 100, {}) # ER: Incompatible types in assignment (expression has type "tuple[Callable[[Any], str], str, int, int, dict[str, Any]]", variable has type "tuple[def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[Any, ...], dtype[generic[Any]]]], metadata_container], str | bytes | PathLike[Any] | FileLike, int, int, dict[str, Any]]") [assignment] out: | main:6: error: Cannot infer type of lambda [misc] From 60bde002c04dcffb415c3a7cf535fa4cb4fe944d Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Sun, 22 Mar 2026 18:02:59 +0000 Subject: [PATCH 27/38] Fix duplicated test and missing xport test --- tests/test_typing.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_typing.yml 
b/tests/test_typing.yml index dfee1f2..811768e 100644 --- a/tests/test_typing.yml +++ b/tests/test_typing.yml @@ -112,10 +112,10 @@ buffer = io.BytesIO() df, meta = read_sas7bdat(buffer) -- case: read_sas7bcat_default_types +- case: read_xport_default_types main: | - from pyreadstat import read_sas7bcat - df, meta = read_sas7bcat("file.sas7bcat") + from pyreadstat import read_xport + df, meta = read_xport("file.xpt") reveal_type(df) # N: Revealed type is "pandas.core.frame.DataFrame" reveal_type(meta) # N: Revealed type is "pyreadstat.pyclasses.metadata_container" From 9d6a60ac6fe6e0bde02d16e1d8bc43546d2b61eb Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Tue, 7 Apr 2026 19:17:34 +0100 Subject: [PATCH 28/38] removed file-like type hint from worker args --- pyreadstat/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyreadstat/worker.py b/pyreadstat/worker.py index 88bc1ae..7a3a3ac 100644 --- a/pyreadstat/worker.py +++ b/pyreadstat/worker.py @@ -24,9 +24,9 @@ import numpy as np if TYPE_CHECKING: - from .pyreadstat import PyreadstatReadFunction, FileLike, DataFrame + from .pyreadstat import PyreadstatReadFunction, DataFrame -Input: TypeAlias = "tuple[PyreadstatReadFunction, str | bytes | PathLike | FileLike, int, int, dict[str, Any]]" +Input: TypeAlias = "tuple[PyreadstatReadFunction, str | bytes | PathLike, int, int, dict[str, Any]]" def worker(inpt: Input) -> "DataFrame | dict[str, np.ndarray]": From 4997d7a9b5a530c17971718de5e242e67b36bc0f Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Tue, 7 Apr 2026 19:20:21 +0100 Subject: [PATCH 29/38] add missing type annotations and retype dict output to dict of lists --- pyreadstat/pyreadstat.py | 60 +++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index ef2f075..9d4627c 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -17,11 +17,10 @@ from collections.abc 
import Callable, Iterator import multiprocessing as mp from os import PathLike -from typing import TYPE_CHECKING, Concatenate, Literal, ParamSpec, TypeAlias, overload, Protocol +from typing import TYPE_CHECKING, Any, Concatenate, Literal, ParamSpec, TypeAlias, overload, Protocol import narwhals.stable.v2 as nw import numpy as np -import numpy.typing as npt from ._readstat_parser import parser_entry_point from ._readstat_writer import writer_entry_point, PyreadstatError @@ -34,11 +33,10 @@ if TYPE_CHECKING: try: from pandas import DataFrame as PandasDataFrame - except ImportError: - pass - try: from polars import DataFrame as PolarsDataFrame except ImportError: + # Typing doesn't execute the import. + # Missing imports resolve to `Unknown`, so pass DataFrame: TypeAlias = PandasDataFrame | PolarsDataFrame @@ -55,12 +53,12 @@ def seek(self, pos: int, whence: int = 0, /) -> int: ... FilePathLike: TypeAlias = str | bytes | PathLike[str] | PathLike[bytes] FilePathorBuffer: TypeAlias = FilePathLike | FileLike -DictOutput = dict[str, npt.NDArray[np.generic]] +DictOutput = dict[str, list[Any]] _P = ParamSpec("_P") -PyreadstatReadFunction = Callable[ +PyreadstatReadFunction: TypeAlias = Callable[ Concatenate[FilePathorBuffer, _P], - "tuple[DataFrame | DictOutput, metadata_container]" + "tuple[DataFrame | dict[str, list[Any]], metadata_container]", ] @@ -719,8 +717,8 @@ def read_por( formats_as_ordered_category: bool = ..., usecols: list[str] | None = ..., disable_datetime_conversion: bool = ..., - row_limit: int = 0, - row_offset: int = 0, + row_limit: int = ..., + row_offset: int = ..., output_format: Literal["pandas"] | None = ..., extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., @@ -736,8 +734,8 @@ def read_por( formats_as_ordered_category: bool = ..., usecols: list[str] | None = ..., disable_datetime_conversion: bool = ..., - row_limit: int = 0, - row_offset: int = 0, + row_limit: int = ..., + row_offset: int = ..., 
output_format: Literal["polars"] = "polars", extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., @@ -753,8 +751,8 @@ def read_por( formats_as_ordered_category: bool = ..., usecols: list[str] | None = ..., disable_datetime_conversion: bool = ..., - row_limit: int = 0, - row_offset: int = 0, + row_limit: int = ..., + row_offset: int = ..., output_format: Literal["dict"] = "dict", extra_datetime_formats: list[str] | None = ..., extra_date_formats: list[str] | None = ..., @@ -926,7 +924,7 @@ def read_sas7bcat( @overload def read_file_in_chunks( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, chunksize: int = ..., offset: int = ..., @@ -936,11 +934,11 @@ def read_file_in_chunks( num_rows: int | None = ..., *, output_format: Literal["pandas"] | None = ..., - **kwargs, + **kwargs: Any, ) -> "Iterator[tuple[PandasDataFrame, metadata_container]]": ... @overload def read_file_in_chunks( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, chunksize: int = ..., offset: int = ..., @@ -950,11 +948,11 @@ def read_file_in_chunks( num_rows: int | None = ..., *, output_format: Literal["polars"] = "polars", - **kwargs, + **kwargs: Any, ) -> "Iterator[tuple[PolarsDataFrame, metadata_container]]": ... @overload def read_file_in_chunks( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, chunksize: int = ..., offset: int = ..., @@ -964,10 +962,10 @@ def read_file_in_chunks( num_rows: int | None = ..., *, output_format: Literal["dict"] = "dict", - **kwargs, + **kwargs: Any, ) -> Iterator[tuple[DictOutput, metadata_container]]: ... 
def read_file_in_chunks( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, chunksize: int = 100000, offset: int = 0, @@ -975,7 +973,7 @@ def read_file_in_chunks( multiprocess: bool = False, num_processes: int = 4, num_rows: int | None = None, - **kwargs, + **kwargs: Any, ) -> "Iterator[tuple[DataFrame | DictOutput, metadata_container]]": """ Returns a generator that will allow to read a file in chunks. @@ -1066,40 +1064,40 @@ def read_file_in_chunks( @overload def read_file_multiprocessing( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, output_format: Literal["pandas"] | None = ..., - **kwargs, + **kwargs: Any, ) -> "tuple[PandasDataFrame, metadata_container]": ... @overload def read_file_multiprocessing( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, output_format: Literal["polars"] = "polars", - **kwargs, + **kwargs: Any, ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_file_multiprocessing( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., *, output_format: Literal["dict"] = "dict", - **kwargs, + **kwargs: Any, ) -> tuple[DictOutput, metadata_container]: ... def read_file_multiprocessing( - read_function: PyreadstatReadFunction, + read_function: PyreadstatReadFunction[_P], file_path: FilePathLike, num_processes: int | None = None, num_rows: int | None = None, - **kwargs, + **kwargs: Any, ) -> "tuple[DataFrame | DictOutput, metadata_container]": """ Reads a file in parallel using multiprocessing. 
@@ -1394,7 +1392,7 @@ def write_por( dst_path: FilePathLike, file_label: str = "", column_labels: list[str] | dict[str, str] | None = None, - variable_format: dict[str, str] | None = None + variable_format: dict[str, str] | None = None, ) -> None: """ Writes a dataframe to a SPSS POR file. From e0973bbb5a48ae594f97446361817b9a7294ad30 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Tue, 7 Apr 2026 19:20:34 +0100 Subject: [PATCH 30/38] add polars to docstrings --- pyreadstat/pyfunctions.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pyreadstat/pyfunctions.py b/pyreadstat/pyfunctions.py index 24a5b39..fb769c0 100644 --- a/pyreadstat/pyfunctions.py +++ b/pyreadstat/pyfunctions.py @@ -26,20 +26,20 @@ def set_value_labels( Parameters ---------- - dataframe : pandas dataframe + dataframe : pandas or polars dataframe resulting from parsing a file metadata : dictionary resulting from parsing a file formats_as_category : bool, optional - defaults to True. If True the variables having formats will be transformed into pandas categories. + defaults to True. If True the variables having formats will be transformed into pandas or polars categories. formats_as_ordered_category : bool, optional - defaults to False. If True the variables having formats will be transformed into pandas ordered categories. + defaults to False. If True the variables having formats will be transformed into pandas or polars ordered categories. it has precedence over formats_as_category, meaning if this is True, it will take effect irrespective of the value of formats_as_category. 
Returns ------- - df_copy : pandas dataframe + df_copy : pandas or polars dataframe a copy of the original dataframe with the values changed, if appropiate formats were found, unaltered otherwise """ @@ -120,22 +120,22 @@ def set_catalog_to_sas( Parameters ---------- - sas_dataframe : pandas dataframe + sas_dataframe : pandas or polars dataframe resulting from parsing a sas7bdat file sas_metadata : pyreadstat metadata object resulting from parsing a sas7bdat file catalog_metadata : pyreadstat metadata object resulting from parsing a sas7bcat (catalog) file formats_as_category : bool, optional - defaults to True. If True the variables having formats will be transformed into pandas categories. + defaults to True. If True the variables having formats will be transformed into pandas or polars categories. formats_as_ordered_category : bool, optional - defaults to False. If True the variables having formats will be transformed into pandas ordered categories. + defaults to False. If True the variables having formats will be transformed into pandas or polars ordered categories. it has precedence over formats_as_category, meaning if this is True, it will take effect irrespective of the value of formats_as_category. 
Returns ------- - df_copy : pandas dataframe + df_copy : pandas or polars dataframe a copy of the original dataframe with the values changed, if appropriate formats were found, unaltered otherwise metadata : dict From 769dbe639369e31560500247abd0cc1805548852 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Tue, 7 Apr 2026 19:20:51 +0100 Subject: [PATCH 31/38] fix missing parametrized values from test --- tests/test_typing.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_typing.yml b/tests/test_typing.yml index 811768e..0ffe661 100644 --- a/tests/test_typing.yml +++ b/tests/test_typing.yml @@ -184,8 +184,8 @@ expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" main: | from pyreadstat import read_file_multiprocessing, read_sav - df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="polars") - reveal_type(df) # N: Revealed type is "polars.dataframe.frame.DataFrame" + df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}") + reveal_type(df) # N: Revealed type is "{{ expected_type }}" - case: read_file_multiprocessing_invalid_callable main: | From e884e88976091727b12a2f19d0172dadede0e09d Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Wed, 8 Apr 2026 19:30:18 +0100 Subject: [PATCH 32/38] Fix type erasure on failed backend imports and remove unused types --- pyreadstat/pyreadstat.py | 50 +++++++++++++++++++++++----------------- pyreadstat/worker.py | 6 ++--- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/pyreadstat/pyreadstat.py b/pyreadstat/pyreadstat.py index 55b2fe8..d94f742 100644 --- a/pyreadstat/pyreadstat.py +++ b/pyreadstat/pyreadstat.py @@ -18,12 +18,12 @@ import multiprocessing as mp from itertools import chain from os import PathLike -from typing import TYPE_CHECKING, Any, Concatenate, Literal, ParamSpec, TypeAlias, overload, Protocol +from 
typing import TYPE_CHECKING, Any, Concatenate, Literal, TypeAlias, overload, Protocol import narwhals.stable.v2 as nw -from ._readstat_parser import parser_entry_point -from ._readstat_writer import writer_entry_point, PyreadstatError +from ._readstat_parser import parser_entry_point, PyreadstatError +from ._readstat_writer import writer_entry_point from .worker import worker from .pyclasses import metadata_container, MissingRange from .pyfunctions import set_value_labels, set_catalog_to_sas @@ -31,16 +31,26 @@ # Typing interface if TYPE_CHECKING: + # Setup type aliases for the public interface. + # These are not executed at runtime, but they help type checkers understand + # the expected types of the public functions and classes. + + # Since pyreadstat can work with both pandas and polars, we define a DataFrame type that can be either. try: - from pandas import DataFrame as PandasDataFrame - from polars import DataFrame as PolarsDataFrame + from pandas import DataFrame as PandasDataFrame # type: ignore except ImportError: - # Typing doesn't execute the import. - # Missing imports resolve to `Unknown`, so - pass + # Define a dummy DataFrame class to avoid accepting any type as PandasDataFrame when pandas is not installed + class PandasDataFrame: + pass - DataFrame: TypeAlias = PandasDataFrame | PolarsDataFrame + try: + from polars import DataFrame as PolarsDataFrame # type: ignore + except ImportError: + # Define a dummy DataFrame class to avoid accepting any type as PolarsDataFrame when polars is not installed + class PolarsDataFrame: + pass +DataFrame: TypeAlias = "PandasDataFrame | PolarsDataFrame" # Define type at runtime for introspection class FileLike(Protocol): """Protocol for file-like objects accepted by pyreadstat""" @@ -53,12 +63,10 @@ def seek(self, pos: int, whence: int = 0, /) -> int: ... 
FilePathLike: TypeAlias = str | bytes | PathLike[str] | PathLike[bytes] FilePathorBuffer: TypeAlias = FilePathLike | FileLike -DictOutput = dict[str, list[Any]] +DictOutput: TypeAlias = dict[str, list[Any]] -_P = ParamSpec("_P") PyreadstatReadFunction: TypeAlias = Callable[ - Concatenate[FilePathorBuffer, _P], - "tuple[DataFrame | dict[str, list[Any]], metadata_container]", + Concatenate[FilePathorBuffer, ...], "tuple[DataFrame | DictOutput, metadata_container]" ] @@ -924,7 +932,7 @@ def read_sas7bcat( @overload def read_file_in_chunks( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, chunksize: int = ..., offset: int = ..., @@ -938,7 +946,7 @@ def read_file_in_chunks( ) -> "Iterator[tuple[PandasDataFrame, metadata_container]]": ... @overload def read_file_in_chunks( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, chunksize: int = ..., offset: int = ..., @@ -952,7 +960,7 @@ def read_file_in_chunks( ) -> "Iterator[tuple[PolarsDataFrame, metadata_container]]": ... @overload def read_file_in_chunks( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, chunksize: int = ..., offset: int = ..., @@ -965,7 +973,7 @@ def read_file_in_chunks( **kwargs: Any, ) -> Iterator[tuple[DictOutput, metadata_container]]: ... def read_file_in_chunks( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, chunksize: int = 100000, offset: int = 0, @@ -1064,7 +1072,7 @@ def read_file_in_chunks( @overload def read_file_multiprocessing( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., @@ -1074,7 +1082,7 @@ def read_file_multiprocessing( ) -> "tuple[PandasDataFrame, metadata_container]": ... 
@overload def read_file_multiprocessing( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., @@ -1084,7 +1092,7 @@ def read_file_multiprocessing( ) -> "tuple[PolarsDataFrame, metadata_container]": ... @overload def read_file_multiprocessing( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, num_processes: int | None = ..., num_rows: int | None = ..., @@ -1093,7 +1101,7 @@ def read_file_multiprocessing( **kwargs: Any, ) -> tuple[DictOutput, metadata_container]: ... def read_file_multiprocessing( - read_function: PyreadstatReadFunction[_P], + read_function: PyreadstatReadFunction, file_path: FilePathLike, num_processes: int | None = None, num_rows: int | None = None, diff --git a/pyreadstat/worker.py b/pyreadstat/worker.py index 7a3a3ac..546763c 100644 --- a/pyreadstat/worker.py +++ b/pyreadstat/worker.py @@ -21,15 +21,13 @@ from os import PathLike from typing import TYPE_CHECKING, Any, TypeAlias -import numpy as np - if TYPE_CHECKING: - from .pyreadstat import PyreadstatReadFunction, DataFrame + from .pyreadstat import PyreadstatReadFunction, DataFrame, DictOutput Input: TypeAlias = "tuple[PyreadstatReadFunction, str | bytes | PathLike, int, int, dict[str, Any]]" -def worker(inpt: Input) -> "DataFrame | dict[str, np.ndarray]": +def worker(inpt: Input) -> "DataFrame | DictOutput": read_function, path, row_offset, row_limit, kwargs = inpt df, meta = read_function(path, row_offset=row_offset, row_limit=row_limit, **kwargs) return df From ff63ed5ca8f71a45745621b9823d4cde33f0ccbc Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Wed, 8 Apr 2026 19:30:46 +0100 Subject: [PATCH 33/38] change type tests to use simplified dict output --- tests/test_typing.yml | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/tests/test_typing.yml 
b/tests/test_typing.yml index 0ffe661..d38483a 100644 --- a/tests/test_typing.yml +++ b/tests/test_typing.yml @@ -17,7 +17,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_sav df: object @@ -45,7 +45,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_dta df, meta = read_dta("file.dta", output_format="{{ output_format }}") @@ -72,7 +72,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_por df, meta = read_por("file.por", output_format="{{ output_format }}") @@ -99,7 +99,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_sas7bdat df, meta = read_sas7bdat("file.sas7bdat", output_format="{{ output_format }}") @@ -126,7 +126,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: 
"builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_xport df, meta = read_xport("file.xpt", output_format="{{ output_format }}") @@ -153,7 +153,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_sas7bcat df, meta = read_sas7bcat("file.sas7bcat", output_format="{{ output_format }}") @@ -181,7 +181,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_file_multiprocessing, read_sav df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}") @@ -190,11 +190,9 @@ - case: read_file_multiprocessing_invalid_callable main: | from pyreadstat import read_file_multiprocessing - def noop(a: int) -> int: + def noop(a: int, /) -> int: return a - read_file_multiprocessing(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_multiprocessing" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[Any, ...], dtype[generic[Any]]]], metadata_container]" [arg-type] - out: | - main:4: note: This is likely because "noop" has named arguments: "a". 
Consider marking them positional-only + read_file_multiprocessing(noop, "file.sav", 1, 1) # ER: Argument 1 to "read_file_multiprocessing" has incompatible type .+ - case: read_file_in_chunks_default_types main: | @@ -210,7 +208,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[numpy.generic[Any]]]]" + expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" main: | from pyreadstat import read_file_in_chunks, read_sav for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}"): @@ -219,11 +217,9 @@ - case: read_file_in_chunks_invalid_callable main: | from pyreadstat import read_file_in_chunks - def noop(a: int) -> int: + def noop(a: int, /) -> int: return a - read_file_in_chunks(noop, "file.sav", 1, 1) # E: Argument 1 to "read_file_in_chunks" has incompatible type "Callable[[int], int]"; expected "def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[Any, ...], dtype[generic[Any]]]], metadata_container]" [arg-type] - out: | - main:4: note: This is likely because "noop" has named arguments: "a". 
Consider marking them positional-only + read_file_in_chunks(noop, "file.sav", 1, 1) # ER: Argument 1 to "read_file_in_chunks" has incompatible type .+ - case: write_sav_types main: | @@ -308,10 +304,10 @@ from pyreadstat.worker import Input, worker inpt: Input = (read_sav, "test_data/file.sav", 0, 100, {}) result = worker(inpt) - reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | builtins.dict[builtins.str, numpy.ndarray[builtins.tuple[Any, ...], numpy.dtype[Any]]]" - inpt_invalid: Input = (lambda x: "A", "test_data/file.sav", 0, 100, {}) # ER: Incompatible types in assignment (expression has type "tuple[Callable[[Any], str], str, int, int, dict[str, Any]]", variable has type "tuple[def (str | bytes | PathLike[str] | PathLike[bytes] | FileLike, /, *Any, **Any) -> tuple[pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, ndarray[tuple[Any, ...], dtype[generic[Any]]]], metadata_container], str | bytes | PathLike[Any] | FileLike, int, int, dict[str, Any]]") [assignment] - out: | - main:6: error: Cannot infer type of lambda [misc] + reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | builtins.dict[builtins.str, builtins.list[Any]]" + def noop(a: int, /) -> int: + return a + inpt_invalid: Input = (noop, "test_data/file.sav", 0, 100, {}) # ER: Incompatible types in assignment .+ - case: metadata_container_types main: | From 985e7996fa014fe8e53fedb56e825a4c3caad317 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Wed, 8 Apr 2026 19:31:05 +0100 Subject: [PATCH 34/38] remove unused import --- pyreadstat/pyfunctions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyreadstat/pyfunctions.py b/pyreadstat/pyfunctions.py index fb769c0..a64301e 100644 --- a/pyreadstat/pyfunctions.py +++ b/pyreadstat/pyfunctions.py @@ -2,7 +2,7 @@ Functions written in pure python """ -from copy import deepcopy, copy +from copy import deepcopy 
import warnings import narwhals.stable.v2 as nw From 82a282fd331ba8da809642e1e3a68c071b18361f Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Wed, 8 Apr 2026 19:31:51 +0100 Subject: [PATCH 35/38] add __all__ imports to package init --- pyreadstat/__init__.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pyreadstat/__init__.py b/pyreadstat/__init__.py index 8ecd1c4..b66544c 100644 --- a/pyreadstat/__init__.py +++ b/pyreadstat/__init__.py @@ -14,12 +14,33 @@ # limitations under the License. # ############################################################################# -from .pyreadstat import read_sav, read_sas7bdat, read_xport, read_dta, read_sav, read_por, read_sas7bcat + +from .pyreadstat import read_sav, read_sas7bdat, read_xport, read_dta, read_por, read_sas7bcat from .pyreadstat import write_sav, write_dta, write_xport, write_por from .pyreadstat import read_file_in_chunks, read_file_multiprocessing from .pyclasses import metadata_container from ._readstat_parser import ReadstatError +from .pyreadstat import PyreadstatError from .pyfunctions import set_value_labels, set_catalog_to_sas __version__ = "1.3.4" +__all__ = ( + "read_sav", + "read_sas7bdat", + "read_xport", + "read_dta", + "read_por", + "read_sas7bcat", + "write_sav", + "write_dta", + "write_xport", + "write_por", + "read_file_in_chunks", + "read_file_multiprocessing", + "metadata_container", + "ReadstatError", + "PyreadstatError", + "set_value_labels", + "set_catalog_to_sas", +) \ No newline at end of file From 3acaae4662a9e112c6003322ff084fb62acc757f Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Wed, 8 Apr 2026 19:32:23 +0100 Subject: [PATCH 36/38] Add dependency groups to pyproject and document how to test with extra dependencies --- how_to_test.md | 32 ++++++++++++++++++++++++++++---- pyproject.toml | 19 +++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/how_to_test.md b/how_to_test.md index b3c8bea..e870f79 100644 
--- a/how_to_test.md +++ b/how_to_test.md @@ -1,11 +1,35 @@ +# How to test pyreadstat + +## Dependencies for testing + +Additional dependencies for testing can be installed with: + +```shell +pip install --group dev --group test +``` + +## Running tests + If you have installed pyreadstat on your environment, enter this folder and do: -```python +```shell python3 tests/test_basic.py ``` -If you have built in place do +If you have built in place, do: -``` +```shell python3 tests/test_basic.py --inplace -``` \ No newline at end of file +``` + +Type hint tests can be run with: + +```shell +pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini +``` + +To run all tests in place, do: + +```shell +python tests/test_basic.py --inplace && python tests/test_narwhalified.py --inplace --backend=pandas && python tests/test_narwhalified.py --inplace --backend=polars && pytest tests/test_http_integration.py && pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini +``` diff --git a/pyproject.toml b/pyproject.toml index bc1f36d..79213ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,3 +5,22 @@ requires = [ "cython" ] build-backend = "setuptools.build_meta" + +[dependency-groups] +dev = [ + "setuptools>=80.0.0", + "numpy>=2.0.0", + "pandas>=2.0.0", + "polars>=1.30.0", + "cython>=3.0.0", + "narwhals>=2.10.1", +] +test = [ + "pytest>=8.0.0", + "mypy>=1.19.0", + "pytest-mypy-plugins>=4.0.0", + "pandas-stubs>=2.0.0", + "pandas>=2.0.0", + "polars>=1.30.0", + "narwhals>=2.10.1", +] From 0ccdba080052142755a59e4ca43acfa7f352f797 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Wed, 8 Apr 2026 20:01:50 +0100 Subject: [PATCH 37/38] import PyreadstatError from the correct module --- pyreadstat/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyreadstat/__init__.py b/pyreadstat/__init__.py index b66544c..89474f9 100644 --- a/pyreadstat/__init__.py +++ b/pyreadstat/__init__.py @@ -19,8 +19,7 @@ from .pyreadstat import 
write_sav, write_dta, write_xport, write_por from .pyreadstat import read_file_in_chunks, read_file_multiprocessing from .pyclasses import metadata_container -from ._readstat_parser import ReadstatError -from .pyreadstat import PyreadstatError +from ._readstat_parser import ReadstatError, PyreadstatError from .pyfunctions import set_value_labels, set_catalog_to_sas __version__ = "1.3.4" From 0850c0dc418fa0e3768874c072c65349bdb26468 Mon Sep 17 00:00:00 2001 From: Ignacio Maiz Date: Thu, 9 Apr 2026 19:54:48 +0100 Subject: [PATCH 38/38] Bump mypy to 1.20 and fix checks for changed error outputs --- pyproject.toml | 2 +- tests/test_typing.yml | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 79213ec..17092d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dev = [ ] test = [ "pytest>=8.0.0", - "mypy>=1.19.0", + "mypy>=1.20.0", "pytest-mypy-plugins>=4.0.0", "pandas-stubs>=2.0.0", "pandas>=2.0.0", diff --git a/tests/test_typing.yml b/tests/test_typing.yml index d38483a..996e25a 100644 --- a/tests/test_typing.yml +++ b/tests/test_typing.yml @@ -17,7 +17,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from pyreadstat import read_sav df: object @@ -45,7 +45,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from pyreadstat import read_dta df, meta = read_dta("file.dta", output_format="{{ output_format }}") @@ -72,7 +72,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from 
pyreadstat import read_por df, meta = read_por("file.por", output_format="{{ output_format }}") @@ -99,7 +99,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from pyreadstat import read_sas7bdat df, meta = read_sas7bdat("file.sas7bdat", output_format="{{ output_format }}") @@ -126,7 +126,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from pyreadstat import read_xport df, meta = read_xport("file.xpt", output_format="{{ output_format }}") @@ -153,7 +153,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from pyreadstat import read_sas7bcat df, meta = read_sas7bcat("file.sas7bcat", output_format="{{ output_format }}") @@ -181,7 +181,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from pyreadstat import read_file_multiprocessing, read_sav df, meta = read_file_multiprocessing(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}") @@ -208,7 +208,7 @@ - output_format: "polars" expected_type: "polars.dataframe.frame.DataFrame" - output_format: "dict" - expected_type: "builtins.dict[builtins.str, builtins.list[Any]]" + expected_type: "dict[str, list[Any]]" main: | from pyreadstat import read_file_in_chunks, read_sav for df, meta in read_file_in_chunks(read_sav, "file.sav", metadataonly=True, output_format="{{ output_format }}"): @@ -304,7 +304,7 @@ from pyreadstat.worker import 
Input, worker inpt: Input = (read_sav, "test_data/file.sav", 0, 100, {}) result = worker(inpt) - reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | builtins.dict[builtins.str, builtins.list[Any]]" + reveal_type(result) # N: Revealed type is "pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | dict[str, list[Any]]" def noop(a: int, /) -> int: return a inpt_invalid: Input = (noop, "test_data/file.sav", 0, 100, {}) # ER: Incompatible types in assignment .+ @@ -334,7 +334,7 @@ "label": "Set 2", "variable_list": ["var3"], }, - "set3": {} # E: Missing keys ("type", "is_dichotomy", "counted_value", "label", "variable_list") for TypedDict "MRSet" [typeddict-item] + "set3": {} # E: Missing keys ("counted_value", "is_dichotomy", "label", "type", "variable_list") for TypedDict "MRSet" [typeddict-item] } meta.variable_measure = { "var1": "nominal",