Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
71dc48d
first pass at typing public interface
nachomaiz Feb 6, 2026
ce93b59
remove Optional types since they will always be returned with actual …
nachomaiz Feb 9, 2026
34ac76e
finish overloads for all read_ functions
nachomaiz Feb 9, 2026
d55b7ad
add typehint tests for read_file_in_chunks
nachomaiz Feb 9, 2026
84e3886
type tests for all read-write functions
nachomaiz Feb 9, 2026
1ae8aa2
fixed run command
nachomaiz Feb 9, 2026
9175509
prefer direct import for builtins when possible
nachomaiz Feb 9, 2026
0674be0
cleanup stringified types & make MissingRange public
nachomaiz Feb 9, 2026
c7f7e08
type hints for worker
nachomaiz Feb 9, 2026
2f419ce
better import sorting
nachomaiz Feb 9, 2026
46f9540
added py.typed file to signal that the python interface is fully type…
nachomaiz Feb 9, 2026
57ea86c
revert some formatting changes due to line length
nachomaiz Feb 9, 2026
047096a
fix comment for type test
nachomaiz Feb 9, 2026
c9258f8
revert some formatting changes due to line length
nachomaiz Feb 9, 2026
353f213
type tests for all other modules
nachomaiz Feb 9, 2026
d1bac16
fix inconsistencies with PathLike and FileLike. Fix chunk- and multi-…
nachomaiz Feb 13, 2026
e734fd2
file format for write_xport must be 5 or 8
nachomaiz Feb 13, 2026
77e5c2f
sync type in docstring to type annotation
nachomaiz Feb 13, 2026
2c20526
Add ParamSpec to PyreadstatReadFunction type definition
nachomaiz Feb 13, 2026
c5346a0
add py.typed to package files
nachomaiz Feb 16, 2026
a2e101a
change metadata_container to dataclass, added missing file_label attr…
nachomaiz Feb 16, 2026
8ac87b7
Change narwhals frame type to pandas/polars types and tweaks to path …
nachomaiz Feb 16, 2026
b82fc50
Revert metadata values to optional and set MRSet["counted_value"] as …
nachomaiz Feb 24, 2026
49eea57
implement type test cases for pytest
nachomaiz Feb 24, 2026
cf383f8
fix number_columns type default to None
nachomaiz Mar 5, 2026
48f0d9f
fix numpy typing for Python 3.11+
nachomaiz Mar 22, 2026
60bde00
Fix duplicated test and missing xport test
nachomaiz Mar 22, 2026
9d6a60a
removed file-like type hint from worker args
nachomaiz Apr 7, 2026
4997d7a
add missing type annotations and retype dict output to dict of lists
nachomaiz Apr 7, 2026
e0973bb
add polars to docstrings
nachomaiz Apr 7, 2026
769dbe6
fix missing parametrized values from test
nachomaiz Apr 7, 2026
f5adb42
Merge remote-tracking branch 'upstream/pyfile_dev' into pyfile_typehints
nachomaiz Apr 7, 2026
e884e88
Fix type erasure on failed backend imports and remove unused types
nachomaiz Apr 8, 2026
ff63ed5
change type tests to use simplified dict output
nachomaiz Apr 8, 2026
985e799
remove unused import
nachomaiz Apr 8, 2026
82a282f
add __all__ imports to package init
nachomaiz Apr 8, 2026
3acaae4
Add dependency groups to pyproject and document how to test with extr…
nachomaiz Apr 8, 2026
0ccdba0
import PyreadstatError from the correct module
nachomaiz Apr 8, 2026
0850c0d
Bump mypy to 1.20 and fix checks for changed error outputs
nachomaiz Apr 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ include *.pyx
recursive-include pyreadstat *.pyx
include *.pxd
recursive-include pyreadstat *.pxd

recursive-include pyreadstat py.typed
32 changes: 28 additions & 4 deletions how_to_test.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,35 @@
# How to test pyreadstat

## Dependencies for testing

Additional dependencies for testing can be installed with:

```shell
pip install --group dev --group test
```

## Running tests

If you have installed pyreadstat on your environment, enter this folder and do:

```shell
python3 tests/test_basic.py
```

If you have built in place, do:

```shell
python3 tests/test_basic.py --inplace
```

Type hint tests can be run with:

```shell
pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini
```

To run all tests in place, do:

```shell
python tests/test_basic.py --inplace && python tests/test_narwhalified.py --inplace --backend=pandas && python tests/test_narwhalified.py --inplace --backend=polars && pytest tests/test_http_integration.py && pytest tests/test_typing.yml --mypy-ini-file=tests/test_mypy_setup.ini
```
19 changes: 19 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,22 @@ requires = [
"cython"
]
build-backend = "setuptools.build_meta"

[dependency-groups]
dev = [
"setuptools>=80.0.0",
"numpy>=2.0.0",
"pandas>=2.0.0",
"polars>=1.30.0",
"cython>=3.0.0",
"narwhals>=2.10.1",
]
test = [
"pytest>=8.0.0",
"mypy>=1.20.0",
"pytest-mypy-plugins>=4.0.0",
"pandas-stubs>=2.0.0",
"pandas>=2.0.0",
"polars>=1.30.0",
"narwhals>=2.10.1",
]
24 changes: 22 additions & 2 deletions pyreadstat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,32 @@
# limitations under the License.
# #############################################################################

# Public re-exports for the pyreadstat package.
# NOTE: the previous revision carried two superseded import lines (one with a
# duplicated `read_sav`, one importing only `ReadstatError`); they are removed
# here so each name is imported exactly once.
from .pyreadstat import read_sav, read_sas7bdat, read_xport, read_dta, read_por, read_sas7bcat
from .pyreadstat import write_sav, write_dta, write_xport, write_por
from .pyreadstat import read_file_in_chunks, read_file_multiprocessing
from .pyclasses import metadata_container
from ._readstat_parser import ReadstatError, PyreadstatError
from .pyfunctions import set_value_labels, set_catalog_to_sas

__version__ = "1.3.4"

# Explicit public API: everything importable directly from `pyreadstat`.
__all__ = (
    "read_sav",
    "read_sas7bdat",
    "read_xport",
    "read_dta",
    "read_por",
    "read_sas7bcat",
    "write_sav",
    "write_dta",
    "write_xport",
    "write_por",
    "read_file_in_chunks",
    "read_file_multiprocessing",
    "metadata_container",
    "ReadstatError",
    "PyreadstatError",
    "set_value_labels",
    "set_catalog_to_sas",
)
Empty file added pyreadstat/py.typed
Empty file.
74 changes: 51 additions & 23 deletions pyreadstat/pyclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,31 +14,59 @@
# limitations under the License.
# #############################################################################

# Typing

from dataclasses import dataclass, field
from datetime import datetime
from typing import Literal, TypedDict


# Functional TypedDict form; equivalent to the class-based declaration.
MissingRange = TypedDict("MissingRange", {"lo": float, "hi": float})
MissingRange.__doc__ = """A dictionary to hold the definition of a missing range"""


class MRSet(TypedDict):
"""A dictionary to hold the definition of a multiple-response (MR) set"""

type: Literal["D", "C"]
is_dichotomy: bool
counted_value: int | None
label: str
variable_list: list[str]


# Classes


@dataclass
class metadata_container:
    """
    This class holds metadata we want to give back to python.

    All attributes default to empty containers or ``None`` and are filled in
    by the parser as a file is read.
    """

    # NOTE(review): the previous revision still carried the legacy manual
    # __init__ alongside these dataclass fields. Because @dataclass does not
    # replace a user-defined __init__, the stale one won out, so `file_label`
    # was never initialized and the typed defaults below were dead code. The
    # legacy __init__ is removed; field declaration order is preserved so the
    # generated __init__ keeps the same positional parameter order.
    column_names: list[str] = field(default_factory=list)
    column_labels: list[str] = field(default_factory=list)
    # mapping column name -> human-readable label
    column_names_to_labels: dict[str, str] = field(default_factory=dict)
    file_encoding: str | None = None
    file_label: str | None = None
    number_columns: int | None = None
    number_rows: int | None = None
    # mapping variable name -> {value: label}
    variable_value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict)
    # mapping label-set name -> {value: label}
    value_labels: dict[str, dict[float | int, str]] = field(default_factory=dict)
    # mapping variable name -> label-set name
    variable_to_label: dict[str, str] = field(default_factory=dict)
    notes: list[str] = field(default_factory=list)
    # format strings as written in the source file, per variable
    original_variable_types: dict[str, str] = field(default_factory=dict)
    # readstat's own type names, per variable
    readstat_variable_types: dict[str, str] = field(default_factory=dict)
    table_name: str | None = None
    # per variable: discrete missing values and/or {lo, hi} ranges
    missing_ranges: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict)
    missing_user_values: dict[str, list[int | float | str | MissingRange]] = field(default_factory=dict)
    variable_storage_width: dict[str, int] = field(default_factory=dict)
    variable_display_width: dict[str, int] = field(default_factory=dict)
    variable_alignment: dict[str, str] = field(default_factory=dict)
    variable_measure: dict[str, Literal["nominal", "ordinal", "scale", "unknown"]] = field(default_factory=dict)
    creation_time: datetime | None = None
    modification_time: datetime | None = None
    # multiple-response set definitions, keyed by set name
    mr_sets: dict[str, MRSet] = field(default_factory=dict)
71 changes: 49 additions & 22 deletions pyreadstat/pyfunctions.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,45 @@
"""
Functions written in pure python
"""
from copy import deepcopy, copy

from copy import deepcopy
import warnings

import narwhals.stable.v2 as nw
from narwhals.typing import IntoDataFrameT

from .pyclasses import metadata_container

# Functions to deal with value labels

def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_ordered_category=False):

def set_value_labels(
dataframe: IntoDataFrameT,
metadata: metadata_container,
formats_as_category: bool = True,
formats_as_ordered_category: bool = False,
) -> IntoDataFrameT:
"""
Changes the values in the dataframe according to the value formats in the metadata.
It will return a copy of the dataframe. If no appropriate formats were found, the result will be an unchanged copy
of the original dataframe.

Parameters
----------
dataframe : pandas dataframe
dataframe : pandas or polars dataframe
resulting from parsing a file
metadata : dictionary
resulting from parsing a file
formats_as_category : bool, optional
defaults to True. If True the variables having formats will be transformed into pandas categories.
defaults to True. If True the variables having formats will be transformed into pandas or polars categories.
formats_as_ordered_category : bool, optional
defaults to False. If True the variables having formats will be transformed into pandas ordered categories.
defaults to False. If True the variables having formats will be transformed into pandas or polars ordered categories.
it has precedence over formats_as_category, meaning if this is True, it will take effect irrespective of
the value of formats_as_category.

Returns
-------
df_copy : pandas dataframe
df_copy : pandas or polars dataframe
a copy of the original dataframe with the values changed, if appropriate formats were found, unaltered
otherwise
"""
Expand All @@ -51,20 +61,29 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o
if uval not in labels:
labels[uval] = uval
# if all values are null, there will be nothing to replace. However we cannot do replace_strict on null dtype, it raises an error
if not df_copy.implementation.is_pandas() and (len(df_copy[var_name])==df_copy[var_name].null_count()):
if not df_copy.implementation.is_pandas() and (
len(df_copy[var_name]) == df_copy[var_name].null_count()
):
continue
# replace_strict requires that all the values are in the map. Could not get map_batches or when/then/otherwise to work
elif not df_copy.implementation.is_pandas() and (df_copy[var_name].dtype==nw.Object or not all([type(v)==type(list(labels.values())[0]) for v in labels.values() if v is not None])):
elif not df_copy.implementation.is_pandas() and (
df_copy[var_name].dtype == nw.Object
or not all(
[type(v) == type(list(labels.values())[0]) for v in labels.values() if v is not None]
)
):
# polars is very difficult to convince to mix strings and numbers, so we have to do it this way
temp = [labels[x] for x in df_copy[var_name]]
newser = nw.new_series(name=var_name, values= temp, dtype=nw.Object, backend=df_copy.implementation)
newser = nw.new_series(
name=var_name, values=temp, dtype=nw.Object, backend=df_copy.implementation
)
df_copy = df_copy.with_columns(newser.alias(var_name))
if formats_as_category or formats_as_ordered_category:
msg = f"You requested formats_as_category=True or formats_as_ordered_category=True, but it was not possible to cast variable '{var_name}' to category"
warnings.warn(msg, RuntimeWarning)
continue
# not sure if we get into this situation ever or what would exactly happen, maybe this is not needed?
elif not df_copy.implementation.is_pandas() and df_copy[var_name].dtype==nw.Unknown:
elif not df_copy.implementation.is_pandas() and df_copy[var_name].dtype == nw.Unknown:
msg = f"It was not possible to apply value formats to variable '{var_name}' due to unknown/not supported data type"
warnings.warn(msg, RuntimeWarning)
continue
Expand All @@ -74,7 +93,7 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o
categories = list(set(labels.values()))
original_values = list(labels.keys())
original_values.sort()
revdict= dict()
revdict = dict()
for orival in original_values:
curcat = labels.get(orival)
if not revdict.get(curcat):
Expand All @@ -84,34 +103,39 @@ def set_value_labels(dataframe, metadata, formats_as_category=True, formats_as_o
elif formats_as_category:
df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Categorical))


return df_copy.to_native()

def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as_category=True,
formats_as_ordered_category=False):

def set_catalog_to_sas(
sas_dataframe: IntoDataFrameT,
sas_metadata: metadata_container,
catalog_metadata: metadata_container,
formats_as_category: bool = True,
formats_as_ordered_category: bool = False,
) -> tuple[IntoDataFrameT, metadata_container]:
"""
Changes the values in the dataframe and sas_metadata according to the formats in the catalog.
It will return a copy of the dataframe and metadata. If no appropriate formats were found, the result will
be an unchanged copy of the original dataframe.

Parameters
----------
sas_dataframe : pandas dataframe
sas_dataframe : pandas or polars dataframe
resulting from parsing a sas7bdat file
sas_metadata : pyreadstat metadata object
resulting from parsing a sas7bdat file
catalog_metadata : pyreadstat metadata object
resulting from parsing a sas7bcat (catalog) file
formats_as_category : bool, optional
defaults to True. If True the variables having formats will be transformed into pandas categories.
defaults to True. If True the variables having formats will be transformed into pandas or polars categories.
formats_as_ordered_category : bool, optional
defaults to False. If True the variables having formats will be transformed into pandas ordered categories.
defaults to False. If True the variables having formats will be transformed into pandas or polars ordered categories.
it has precedence over formats_as_category, meaning if this is True, it will take effect irrespective of
the value of formats_as_category.

Returns
-------
df_copy : pandas dataframe
df_copy : pandas or polars dataframe
a copy of the original dataframe with the values changed, if appropriate formats were found, unaltered
otherwise
metadata : dict
Expand All @@ -122,8 +146,12 @@ def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as
catalog_metadata_copy = deepcopy(catalog_metadata)
metadata = deepcopy(sas_metadata)
metadata.value_labels = catalog_metadata_copy.value_labels
df_copy = set_value_labels(sas_dataframe, metadata, formats_as_category=formats_as_category,
formats_as_ordered_category=formats_as_ordered_category)
df_copy = set_value_labels(
sas_dataframe,
metadata,
formats_as_category=formats_as_category,
formats_as_ordered_category=formats_as_ordered_category,
)

variable_value_labels = dict()
for var_name, var_label in metadata.variable_to_label.items():
Expand All @@ -133,9 +161,8 @@ def set_catalog_to_sas(sas_dataframe, sas_metadata, catalog_metadata, formats_as
metadata.variable_value_labels = variable_value_labels

else:
#df_copy = sas_dataframe.copy()
# df_copy = sas_dataframe.copy()
df_copy = nw.from_native(sas_dataframe).clone().to_native()
metadata = deepcopy(sas_metadata)

return df_copy, metadata

Loading